diff options
Diffstat (limited to 'src/video_core')
30 files changed, 1853 insertions, 592 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 5c7f4ae18..183709d8b 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -2,7 +2,6 @@ set(SRCS renderer_opengl/generated/gl_3_2_core.c renderer_opengl/gl_rasterizer.cpp renderer_opengl/gl_rasterizer_cache.cpp - renderer_opengl/gl_resource_manager.cpp renderer_opengl/gl_shader_util.cpp renderer_opengl/gl_state.cpp renderer_opengl/renderer_opengl.cpp @@ -12,8 +11,9 @@ set(SRCS pica.cpp primitive_assembly.cpp rasterizer.cpp + shader/shader.cpp + shader/shader_interpreter.cpp utils.cpp - vertex_shader.cpp video_core.cpp ) @@ -36,11 +36,20 @@ set(HEADERS primitive_assembly.h rasterizer.h renderer_base.h + shader/shader.h + shader/shader_interpreter.h utils.h - vertex_shader.h video_core.h ) +if(ARCHITECTURE_x86_64) + set(SRCS ${SRCS} + shader/shader_jit_x64.cpp) + + set(HEADERS ${HEADERS} + shader/shader_jit_x64.h) +endif() + create_directory_groups(${SRCS} ${HEADERS}) add_library(video_core STATIC ${SRCS} ${HEADERS}) diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 558b49d60..bb6048cc0 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -7,7 +7,7 @@ #include "clipper.h" #include "pica.h" #include "rasterizer.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" namespace Pica { diff --git a/src/video_core/clipper.h b/src/video_core/clipper.h index 19ce8e140..6ed01e877 100644 --- a/src/video_core/clipper.h +++ b/src/video_core/clipper.h @@ -6,13 +6,13 @@ namespace Pica { -namespace VertexShader { +namespace Shader { struct OutputVertex; } namespace Clipper { -using VertexShader::OutputVertex; +using Shader::OutputVertex; void ProcessTriangle(OutputVertex& v0, OutputVertex& v1, OutputVertex& v2); diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index deed24412..d82e20f86 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -18,7 +18,7 @@ #include "pica.h" #include "primitive_assembly.h" #include "renderer_base.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" #include "video_core.h" namespace Pica { @@ -127,7 +127,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { { Common::Profiling::ScopeTimer scope_timer(category_drawing); +#if PICA_LOG_TEV DebugUtils::DumpTevStageConfig(regs.GetTevStages()); +#endif if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr); @@ -170,9 +172,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { const u16* index_address_16 = (u16*)index_address_8; bool index_u16 = index_info.format != 0; +#if PICA_DUMP_GEOMETRY DebugUtils::GeometryDumper geometry_dumper; - PrimitiveAssembler<VertexShader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex> dumping_primitive_assembler(regs.triangle_topology.Value()); +#endif + PrimitiveAssembler<Shader::OutputVertex> primitive_assembler(regs.triangle_topology.Value()); if (g_debug_context) { for (int i = 0; i < 3; ++i) { @@ -213,97 +217,124 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { std::map<u32, u32> ranges; } memory_accesses; + // Simple circular-replacement vertex cache + // The size has been tuned for optimal balance between hit-rate and the cost of lookup + const size_t VERTEX_CACHE_SIZE = 32; + std::array<u16, VERTEX_CACHE_SIZE> vertex_cache_ids; + std::array<Shader::OutputVertex, VERTEX_CACHE_SIZE> vertex_cache; + + unsigned int vertex_cache_pos = 0; + vertex_cache_ids.fill(-1); + + Shader::UnitState<false> shader_unit; + Shader::Setup(shader_unit); + for (unsigned int index = 0; index < regs.num_vertices; ++index) { unsigned int vertex = is_indexed ? (index_u16 ? index_address_16[index] : index_address_8[index]) : index; + // -1 is a common special value used for primitive restart. Since it's unknown if + // the PICA supports it, and it would mess up the caching, guard against it here. + ASSERT(vertex != -1); + + bool vertex_cache_hit = false; + Shader::OutputVertex output; + if (is_indexed) { - // TODO: Implement some sort of vertex cache! if (g_debug_context && Pica::g_debug_context->recorder) { int size = index_u16 ? 2 : 1; memory_accesses.AddAccess(base_address + index_info.offset + size * index, size); } - } - // Initialize data for the current vertex - VertexShader::InputVertex input; - - // Load a debugging token to check whether this gets loaded by the running - // application or not. - static const float24 debug_token = float24::FromRawFloat24(0x00abcdef); - input.attr[0].w = debug_token; - - for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { - // Load the default attribute if we're configured to do so, this data will be overwritten by the loader data if it's set - if (attribute_config.IsDefaultAttribute(i)) { - input.attr[i] = g_state.vs.default_attributes[i]; - LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", - i, vertex, index, - input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), - input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32()); + for (unsigned int i = 0; i < VERTEX_CACHE_SIZE; ++i) { + if (vertex == vertex_cache_ids[i]) { + output = vertex_cache[i]; + vertex_cache_hit = true; + break; + } } + } - // Load per-vertex data from the loader arrays - for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { - u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]; - const u8* srcdata = Memory::GetPhysicalPointer(source_addr); - - if (g_debug_context && Pica::g_debug_context->recorder) { - memory_accesses.AddAccess(source_addr, - (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4 - : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1); + if (!vertex_cache_hit) { + // Initialize data for the current vertex + Shader::InputVertex input; + + for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { + if (vertex_attribute_elements[i] != 0) { + // Default attribute values set if array elements have < 4 components. This + // is *not* carried over from the default attribute settings even if they're + // enabled for this attribute. + static const float24 zero = float24::FromFloat32(0.0f); + static const float24 one = float24::FromFloat32(1.0f); + input.attr[i] = Math::Vec4<float24>(zero, zero, zero, one); + + // Load per-vertex data from the loader arrays + for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { + u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]; + const u8* srcdata = Memory::GetPhysicalPointer(source_addr); + + if (g_debug_context && Pica::g_debug_context->recorder) { + memory_accesses.AddAccess(source_addr, + (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4 + : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1); + } + + const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata : + (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata : + (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *(s16*)srcdata : + *(float*)srcdata; + + input.attr[i][comp] = float24::FromFloat32(srcval); + LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f", + comp, i, vertex, index, + attribute_config.GetPhysicalBaseAddress(), + vertex_attribute_sources[i] - base_address, + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i], + input.attr[i][comp].ToFloat32()); + } + } else if (attribute_config.IsDefaultAttribute(i)) { + // Load the default attribute if we're configured to do so + input.attr[i] = g_state.vs.default_attributes[i]; + LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", + i, vertex, index, + input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), + input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32()); + } else { + // TODO(yuriks): In this case, no data gets loaded and the vertex + // remains with the last value it had. This isn't currently maintained + // as global state, however, and so won't work in Citra yet. } - - const float srcval = (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *(s8*)srcdata : - (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *(u8*)srcdata : - (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *(s16*)srcdata : - *(float*)srcdata; - - input.attr[i][comp] = float24::FromFloat32(srcval); - LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08lx + 0x%04lx: %f", - comp, i, vertex, index, - attribute_config.GetPhysicalBaseAddress(), - vertex_attribute_sources[i] - base_address, - vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i], - input.attr[i][comp].ToFloat32()); } - } - // HACK: Some games do not initialize the vertex position's w component. This leads - // to critical issues since it messes up perspective division. As a - // workaround, we force the fourth component to 1.0 if we find this to be the - // case. - // To do this, we additionally have to assume that the first input attribute - // is the vertex position, since there's no information about this other than - // the empiric observation that this is usually the case. - if (input.attr[0].w == debug_token) - input.attr[0].w = float24::FromFloat32(1.0); - - if (g_debug_context) - g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input); - - // NOTE: When dumping geometry, we simply assume that the first input attribute - // corresponds to the position for now. - DebugUtils::GeometryDumper::Vertex dumped_vertex = { - input.attr[0][0].ToFloat32(), input.attr[0][1].ToFloat32(), input.attr[0][2].ToFloat32() - }; - using namespace std::placeholders; - dumping_primitive_assembler.SubmitVertex(dumped_vertex, - std::bind(&DebugUtils::GeometryDumper::AddTriangle, - &geometry_dumper, _1, _2, _3)); - - // Send to vertex shader - VertexShader::OutputVertex output = VertexShader::RunShader(input, attribute_config.GetNumTotalAttributes(), g_state.regs.vs, g_state.vs); + if (g_debug_context) + g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input); - if (is_indexed) { - // TODO: Add processed vertex to vertex cache! +#if PICA_DUMP_GEOMETRY + // NOTE: When dumping geometry, we simply assume that the first input attribute + // corresponds to the position for now. + DebugUtils::GeometryDumper::Vertex dumped_vertex = { + input.attr[0][0].ToFloat32(), input.attr[0][1].ToFloat32(), input.attr[0][2].ToFloat32() + }; + using namespace std::placeholders; + dumping_primitive_assembler.SubmitVertex(dumped_vertex, + std::bind(&DebugUtils::GeometryDumper::AddTriangle, + &geometry_dumper, _1, _2, _3)); +#endif + // Send to vertex shader + output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes()); + + if (is_indexed) { + vertex_cache[vertex_cache_pos] = output; + vertex_cache_ids[vertex_cache_pos] = vertex; + vertex_cache_pos = (vertex_cache_pos + 1) % VERTEX_CACHE_SIZE; + } } if (Settings::values.use_hw_renderer) { // Send to hardware renderer - static auto AddHWTriangle = [](const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) { + static auto AddHWTriangle = [](const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) { VideoCore::g_renderer->hw_rasterizer->AddTriangle(v0, v1, v2); }; @@ -323,7 +354,9 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { VideoCore::g_renderer->hw_rasterizer->DrawTriangles(); } +#if PICA_DUMP_GEOMETRY geometry_dumper.Dump(); +#endif if (g_debug_context) { g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr); diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp index 27000283d..e4b397303 100644 --- a/src/video_core/debug_utils/debug_utils.cpp +++ b/src/video_core/debug_utils/debug_utils.cpp @@ -14,10 +14,12 @@ #include <png.h> #endif +#include <nihstro/float24.h> #include <nihstro/shader_binary.h> #include "common/assert.h" #include "common/color.h" +#include "common/common_types.h" #include "common/file_util.h" #include "common/math_util.h" #include "common/vector_math.h" @@ -90,10 +92,6 @@ void GeometryDumper::AddTriangle(Vertex& v0, Vertex& v1, Vertex& v2) { } void GeometryDumper::Dump() { - // NOTE: Permanently enabling this just trashes the hard disk for no reason. - // Hence, this is currently disabled. - return; - static int index = 0; std::string filename = std::string("geometry_dump") + std::to_string(++index) + ".obj"; @@ -113,13 +111,8 @@ void GeometryDumper::Dump() { } -void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data, u32 swizzle_size, - u32 main_offset, const Regs::VSOutputAttributes* output_attributes) +void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, const State::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes) { - // NOTE: Permanently enabling this just trashes hard disks for no reason. - // Hence, this is currently disabled. - return; - struct StuffToWrite { u8* pointer; u32 size; @@ -138,11 +131,14 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data // into shbin format (separate type and component mask). union OutputRegisterInfo { enum Type : u64 { - POSITION = 0, - COLOR = 2, - TEXCOORD0 = 3, - TEXCOORD1 = 5, - TEXCOORD2 = 6, + POSITION = 0, + QUATERNION = 1, + COLOR = 2, + TEXCOORD0 = 3, + TEXCOORD1 = 5, + TEXCOORD2 = 6, + + VIEW = 8, }; BitField< 0, 64, u64> hex; @@ -164,6 +160,10 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data { OutputAttributes::POSITION_Y, { OutputRegisterInfo::POSITION, 2} }, { OutputAttributes::POSITION_Z, { OutputRegisterInfo::POSITION, 4} }, { OutputAttributes::POSITION_W, { OutputRegisterInfo::POSITION, 8} }, + { OutputAttributes::QUATERNION_X, { OutputRegisterInfo::QUATERNION, 1} }, + { OutputAttributes::QUATERNION_Y, { OutputRegisterInfo::QUATERNION, 2} }, + { OutputAttributes::QUATERNION_Z, { OutputRegisterInfo::QUATERNION, 4} }, + { OutputAttributes::QUATERNION_W, { OutputRegisterInfo::QUATERNION, 8} }, { OutputAttributes::COLOR_R, { OutputRegisterInfo::COLOR, 1} }, { OutputAttributes::COLOR_G, { OutputRegisterInfo::COLOR, 2} }, { OutputAttributes::COLOR_B, { OutputRegisterInfo::COLOR, 4} }, @@ -173,7 +173,10 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data { OutputAttributes::TEXCOORD1_U, { OutputRegisterInfo::TEXCOORD1, 1} }, { OutputAttributes::TEXCOORD1_V, { OutputRegisterInfo::TEXCOORD1, 2} }, { OutputAttributes::TEXCOORD2_U, { OutputRegisterInfo::TEXCOORD2, 1} }, - { OutputAttributes::TEXCOORD2_V, { OutputRegisterInfo::TEXCOORD2, 2} } + { OutputAttributes::TEXCOORD2_V, { OutputRegisterInfo::TEXCOORD2, 2} }, + { OutputAttributes::VIEW_X, { OutputRegisterInfo::VIEW, 1} }, + { OutputAttributes::VIEW_Y, { OutputRegisterInfo::VIEW, 2} }, + { OutputAttributes::VIEW_Z, { OutputRegisterInfo::VIEW, 4} } }; for (const auto& semantic : std::vector<OutputAttributes::Semantic>{ @@ -228,28 +231,69 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data // TODO: Reduce the amount of binary code written to relevant portions dvlp.binary_offset = write_offset - dvlp_offset; - dvlp.binary_size_words = binary_size; - QueueForWriting((u8*)binary_data, binary_size * sizeof(u32)); + dvlp.binary_size_words = setup.program_code.size(); + QueueForWriting((u8*)setup.program_code.data(), setup.program_code.size() * sizeof(u32)); dvlp.swizzle_info_offset = write_offset - dvlp_offset; - dvlp.swizzle_info_num_entries = swizzle_size; + dvlp.swizzle_info_num_entries = setup.swizzle_data.size(); u32 dummy = 0; - for (unsigned int i = 0; i < swizzle_size; ++i) { - QueueForWriting((u8*)&swizzle_data[i], sizeof(swizzle_data[i])); + for (unsigned int i = 0; i < setup.swizzle_data.size(); ++i) { + QueueForWriting((u8*)&setup.swizzle_data[i], sizeof(setup.swizzle_data[i])); QueueForWriting((u8*)&dummy, sizeof(dummy)); } - dvle.main_offset_words = main_offset; + dvle.main_offset_words = config.main_offset; dvle.output_register_table_offset = write_offset - dvlb.dvle_offset; - dvle.output_register_table_size = static_cast<uint32_t>(output_info_table.size()); + dvle.output_register_table_size = static_cast<u32>(output_info_table.size()); QueueForWriting((u8*)output_info_table.data(), static_cast<u32>(output_info_table.size() * sizeof(OutputRegisterInfo))); // TODO: Create a label table for "main" + std::vector<nihstro::ConstantInfo> constant_table; + for (unsigned i = 0; i < setup.uniforms.b.size(); ++i) { + nihstro::ConstantInfo constant; + memset(&constant, 0, sizeof(constant)); + constant.type = nihstro::ConstantInfo::Bool; + constant.regid = i; + constant.b = setup.uniforms.b[i]; + constant_table.emplace_back(constant); + } + for (unsigned i = 0; i < setup.uniforms.i.size(); ++i) { + nihstro::ConstantInfo constant; + memset(&constant, 0, sizeof(constant)); + constant.type = nihstro::ConstantInfo::Int; + constant.regid = i; + constant.i.x = setup.uniforms.i[i].x; + constant.i.y = setup.uniforms.i[i].y; + constant.i.z = setup.uniforms.i[i].z; + constant.i.w = setup.uniforms.i[i].w; + constant_table.emplace_back(constant); + } + for (unsigned i = 0; i < sizeof(setup.uniforms.f) / sizeof(setup.uniforms.f[0]); ++i) { + nihstro::ConstantInfo constant; + memset(&constant, 0, sizeof(constant)); + constant.type = nihstro::ConstantInfo::Float; + constant.regid = i; + constant.f.x = nihstro::to_float24(setup.uniforms.f[i].x.ToFloat32()); + constant.f.y = nihstro::to_float24(setup.uniforms.f[i].y.ToFloat32()); + constant.f.z = nihstro::to_float24(setup.uniforms.f[i].z.ToFloat32()); + constant.f.w = nihstro::to_float24(setup.uniforms.f[i].w.ToFloat32()); + + // Store constant if it's different from zero.. + if (setup.uniforms.f[i].x.ToFloat32() != 0.0 || + setup.uniforms.f[i].y.ToFloat32() != 0.0 || + setup.uniforms.f[i].z.ToFloat32() != 0.0 || + setup.uniforms.f[i].w.ToFloat32() != 0.0) + constant_table.emplace_back(constant); + } + dvle.constant_table_offset = write_offset - dvlb.dvle_offset; + dvle.constant_table_size = constant_table.size(); + for (const auto& constant : constant_table) { + QueueForWriting((uint8_t*)&constant, sizeof(constant)); + } // Write data to file static int dump_index = 0; - std::string filename = std::string("shader_dump") + std::to_string(++dump_index) + std::string(".shbin"); std::ofstream file(filename, std::ios_base::out | std::ios_base::binary); for (auto& chunk : writing_queue) { @@ -564,10 +608,6 @@ TextureInfo TextureInfo::FromPicaRegister(const Regs::TextureConfig& config, } void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) { - // NOTE: Permanently enabling this just trashes hard disks for no reason. - // Hence, this is currently disabled. - return; - #ifndef HAVE_PNG return; #else diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h index acb75a4b2..85762f5b4 100644 --- a/src/video_core/debug_utils/debug_utils.h +++ b/src/video_core/debug_utils/debug_utils.h @@ -157,6 +157,10 @@ extern std::shared_ptr<DebugContext> g_debug_context; // TODO: Get rid of this g namespace DebugUtils { +#define PICA_DUMP_GEOMETRY 0 +#define PICA_DUMP_TEXTURES 0 +#define PICA_LOG_TEV 0 + // Simple utility class for dumping geometry data to an OBJ file class GeometryDumper { public: @@ -177,8 +181,8 @@ private: std::vector<Face> faces; }; -void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data, u32 swizzle_size, - u32 main_offset, const Regs::VSOutputAttributes* output_attributes); +void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, + const State::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes); // Utility class to log Pica commands. diff --git a/src/video_core/hwrasterizer_base.h b/src/video_core/hwrasterizer_base.h index c8746c608..54b8892fb 100644 --- a/src/video_core/hwrasterizer_base.h +++ b/src/video_core/hwrasterizer_base.h @@ -7,7 +7,7 @@ #include "common/common_types.h" namespace Pica { -namespace VertexShader { +namespace Shader { struct OutputVertex; } } @@ -24,9 +24,9 @@ public: virtual void Reset() = 0; /// Queues the primitive formed by the given vertices for rendering - virtual void AddTriangle(const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) = 0; + virtual void AddTriangle(const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) = 0; /// Draw the current batch of triangles virtual void DrawTriangles() = 0; diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp index 543d9c443..c73a8178e 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp @@ -2,18 +2,91 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <string.h> +#include <cstring> +#include <unordered_map> #include "pica.h" +#include "shader/shader.h" namespace Pica { State g_state; +std::string Regs::GetCommandName(int index) { + static std::unordered_map<u32, std::string> map; + + if (map.empty()) { + #define ADD_FIELD(name) \ + map.insert({static_cast<u32>(PICA_REG_INDEX(name)), #name}); \ + /* TODO: change to Regs::name when VS2015 and other compilers support it */ \ + for (u32 i = PICA_REG_INDEX(name) + 1; i < PICA_REG_INDEX(name) + sizeof(Regs().name) / 4; ++i) \ + map.insert({i, #name + std::string("+") + std::to_string(i-PICA_REG_INDEX(name))}); \ + + ADD_FIELD(trigger_irq); + ADD_FIELD(cull_mode); + ADD_FIELD(viewport_size_x); + ADD_FIELD(viewport_size_y); + ADD_FIELD(viewport_depth_range); + ADD_FIELD(viewport_depth_far_plane); + ADD_FIELD(viewport_corner); + ADD_FIELD(texture0_enable); + ADD_FIELD(texture0); + ADD_FIELD(texture0_format); + ADD_FIELD(texture1); + ADD_FIELD(texture1_format); + ADD_FIELD(texture2); + ADD_FIELD(texture2_format); + ADD_FIELD(tev_stage0); + ADD_FIELD(tev_stage1); + ADD_FIELD(tev_stage2); + ADD_FIELD(tev_stage3); + ADD_FIELD(tev_combiner_buffer_input); + ADD_FIELD(tev_stage4); + ADD_FIELD(tev_stage5); + ADD_FIELD(tev_combiner_buffer_color); + ADD_FIELD(output_merger); + ADD_FIELD(framebuffer); + ADD_FIELD(vertex_attributes); + ADD_FIELD(index_array); + ADD_FIELD(num_vertices); + ADD_FIELD(trigger_draw); + ADD_FIELD(trigger_draw_indexed); + ADD_FIELD(vs_default_attributes_setup); + ADD_FIELD(command_buffer); + ADD_FIELD(triangle_topology); + ADD_FIELD(gs.bool_uniforms); + ADD_FIELD(gs.int_uniforms); + ADD_FIELD(gs.main_offset); + ADD_FIELD(gs.input_register_map); + ADD_FIELD(gs.uniform_setup); + ADD_FIELD(gs.program); + ADD_FIELD(gs.swizzle_patterns); + ADD_FIELD(vs.bool_uniforms); + ADD_FIELD(vs.int_uniforms); + ADD_FIELD(vs.main_offset); + ADD_FIELD(vs.input_register_map); + ADD_FIELD(vs.uniform_setup); + ADD_FIELD(vs.program); + ADD_FIELD(vs.swizzle_patterns); + +#undef ADD_FIELD + } + + // Return empty string if no match is found + auto it = map.find(index); + if (it != map.end()) { + return it->second; + } else { + return std::string(); + } +} + void Init() { } void Shutdown() { + Shader::Shutdown(); + memset(&g_state, 0, sizeof(State)); } diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 38599a7a3..36916f862 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -7,7 +7,6 @@ #include <array> #include <cmath> #include <cstddef> -#include <map> #include <string> #include "common/assert.h" @@ -81,6 +80,11 @@ struct Regs { POSITION_Z = 2, POSITION_W = 3, + QUATERNION_X = 4, + QUATERNION_Y = 5, + QUATERNION_Z = 6, + QUATERNION_W = 7, + COLOR_R = 8, COLOR_G = 9, COLOR_B = 10, @@ -90,6 +94,12 @@ struct Regs { TEXCOORD0_V = 13, TEXCOORD1_U = 14, TEXCOORD1_V = 15, + + // TODO: Not verified + VIEW_X = 18, + VIEW_Y = 19, + VIEW_Z = 20, + TEXCOORD2_U = 22, TEXCOORD2_V = 23, @@ -908,69 +918,7 @@ struct Regs { // Map register indices to names readable by humans // Used for debugging purposes, so performance is not an issue here - static std::string GetCommandName(int index) { - std::map<u32, std::string> map; - - #define ADD_FIELD(name) \ - do { \ - map.insert({static_cast<u32>(PICA_REG_INDEX(name)), #name}); \ - /* TODO: change to Regs::name when VS2015 and other compilers support it */ \ - for (u32 i = PICA_REG_INDEX(name) + 1; i < PICA_REG_INDEX(name) + sizeof(Regs().name) / 4; ++i) \ - map.insert({i, #name + std::string("+") + std::to_string(i-PICA_REG_INDEX(name))}); \ - } while(false) - - ADD_FIELD(trigger_irq); - ADD_FIELD(cull_mode); - ADD_FIELD(viewport_size_x); - ADD_FIELD(viewport_size_y); - ADD_FIELD(viewport_depth_range); - ADD_FIELD(viewport_depth_far_plane); - ADD_FIELD(viewport_corner); - ADD_FIELD(texture0_enable); - ADD_FIELD(texture0); - ADD_FIELD(texture0_format); - ADD_FIELD(texture1); - ADD_FIELD(texture1_format); - ADD_FIELD(texture2); - ADD_FIELD(texture2_format); - ADD_FIELD(tev_stage0); - ADD_FIELD(tev_stage1); - ADD_FIELD(tev_stage2); - ADD_FIELD(tev_stage3); - ADD_FIELD(tev_combiner_buffer_input); - ADD_FIELD(tev_stage4); - ADD_FIELD(tev_stage5); - ADD_FIELD(tev_combiner_buffer_color); - ADD_FIELD(output_merger); - ADD_FIELD(framebuffer); - ADD_FIELD(vertex_attributes); - ADD_FIELD(index_array); - ADD_FIELD(num_vertices); - ADD_FIELD(trigger_draw); - ADD_FIELD(trigger_draw_indexed); - ADD_FIELD(vs_default_attributes_setup); - ADD_FIELD(command_buffer); - ADD_FIELD(triangle_topology); - ADD_FIELD(gs.bool_uniforms); - ADD_FIELD(gs.int_uniforms); - ADD_FIELD(gs.main_offset); - ADD_FIELD(gs.input_register_map); - ADD_FIELD(gs.uniform_setup); - ADD_FIELD(gs.program); - ADD_FIELD(gs.swizzle_patterns); - ADD_FIELD(vs.bool_uniforms); - ADD_FIELD(vs.int_uniforms); - ADD_FIELD(vs.main_offset); - ADD_FIELD(vs.input_register_map); - ADD_FIELD(vs.uniform_setup); - ADD_FIELD(vs.program); - ADD_FIELD(vs.swizzle_patterns); - - #undef ADD_FIELD - - // Return empty string if no match is found - return map[index]; - } + static std::string GetCommandName(int index); static inline size_t NumIds() { return sizeof(Regs) / sizeof(u32); @@ -1146,6 +1094,7 @@ private: // TODO: Perform proper arithmetic on this! float value; }; +static_assert(sizeof(float24) == sizeof(float), "Shader JIT assumes float24 is implemented as a 32-bit float"); /// Struct used to describe current Pica state struct State { @@ -1155,7 +1104,10 @@ struct State { /// Vertex shader memory struct ShaderSetup { struct { - Math::Vec4<float24> f[96]; + // The float uniforms are accessed by the shader JIT using SSE instructions, and are + // therefore required to be 16-byte aligned. + Math::Vec4<float24> MEMORY_ALIGNED16(f[96]); + std::array<bool, 16> b; std::array<Math::Vec4<u8>, 4> i; } uniforms; diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp index 2f22bdcce..e2b1df44c 100644 --- a/src/video_core/primitive_assembly.cpp +++ b/src/video_core/primitive_assembly.cpp @@ -4,7 +4,7 @@ #include "pica.h" #include "primitive_assembly.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" #include "common/logging/log.h" #include "video_core/debug_utils/debug_utils.h" @@ -56,7 +56,7 @@ void PrimitiveAssembler<VertexType>::SubmitVertex(VertexType& vtx, TriangleHandl // explicitly instantiate use cases template -struct PrimitiveAssembler<VertexShader::OutputVertex>; +struct PrimitiveAssembler<Shader::OutputVertex>; template struct PrimitiveAssembler<DebugUtils::GeometryDumper::Vertex>; diff --git a/src/video_core/primitive_assembly.h b/src/video_core/primitive_assembly.h index 52ff4cd89..80432d68a 100644 --- a/src/video_core/primitive_assembly.h +++ b/src/video_core/primitive_assembly.h @@ -8,7 +8,7 @@ #include "video_core/pica.h" -#include "video_core/vertex_shader.h" +#include "video_core/shader/shader_interpreter.h" namespace Pica { diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index e2b90ad1c..b83798b0f 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -16,7 +16,7 @@ #include "math.h" #include "pica.h" #include "rasterizer.h" -#include "vertex_shader.h" +#include "shader/shader_interpreter.h" #include "video_core/utils.h" namespace Pica { @@ -272,9 +272,9 @@ static Common::Profiling::TimingCategory rasterization_category("Rasterization") * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing * culling via recursion. */ -static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, - const VertexShader::OutputVertex& v1, - const VertexShader::OutputVertex& v2, +static void ProcessTriangleInternal(const Shader::OutputVertex& v0, + const Shader::OutputVertex& v1, + const Shader::OutputVertex& v2, bool reversed = false) { const auto& regs = g_state.regs; @@ -462,7 +462,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, // TODO: Apply the min and mag filters to the texture texture_color[i] = DebugUtils::LookupTexture(texture_data, s, t, info); +#if PICA_DUMP_TEXTURES DebugUtils::DumpTexture(texture.config, texture_data); +#endif } } @@ -1105,9 +1107,9 @@ static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0, } } -void ProcessTriangle(const VertexShader::OutputVertex& v0, - const VertexShader::OutputVertex& v1, - const VertexShader::OutputVertex& v2) { +void ProcessTriangle(const Shader::OutputVertex& v0, + const Shader::OutputVertex& v1, + const Shader::OutputVertex& v2) { ProcessTriangleInternal(v0, v1, v2); } diff --git a/src/video_core/rasterizer.h b/src/video_core/rasterizer.h index 42148f8b1..a6a9634b4 100644 --- a/src/video_core/rasterizer.h +++ b/src/video_core/rasterizer.h @@ -6,15 +6,15 @@ namespace Pica { -namespace VertexShader { +namespace Shader { struct OutputVertex; } namespace Rasterizer { -void ProcessTriangle(const VertexShader::OutputVertex& v0, - const VertexShader::OutputVertex& v1, - const VertexShader::OutputVertex& v2); +void ProcessTriangle(const Shader::OutputVertex& v0, + const Shader::OutputVertex& v1, + const Shader::OutputVertex& v2); } // namespace Rasterizer diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 2db845da6..9f1552adf 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -7,6 +7,7 @@ #include "common/color.h" #include "common/math_util.h" +#include "common/profiler.h" #include "core/hw/gpu.h" #include "core/memory.h" @@ -98,7 +99,6 @@ void RasterizerOpenGL::InitObjects() { fb_color_texture.texture.Create(); ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 1, 1); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.Apply(); @@ -114,7 +114,6 @@ void RasterizerOpenGL::InitObjects() { fb_depth_texture.texture.Create(); ReconfigureDepthTexture(fb_depth_texture, Pica::Regs::DepthFormat::D16, 1, 1); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_depth_texture.texture.handle; state.Apply(); @@ -203,9 +202,9 @@ void RasterizerOpenGL::Reset() { res_cache.FullFlush(); } -void RasterizerOpenGL::AddTriangle(const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) { +void RasterizerOpenGL::AddTriangle(const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) { vertex_batch.push_back(HardwareVertex(v0)); vertex_batch.push_back(HardwareVertex(v1)); vertex_batch.push_back(HardwareVertex(v2)); @@ -492,7 +491,6 @@ void RasterizerOpenGL::ReconfigureColorTexture(TextureInfo& texture, Pica::Regs: break; } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.texture.handle; state.Apply(); @@ -536,7 +534,6 @@ void RasterizerOpenGL::ReconfigureDepthTexture(DepthTextureInfo& texture, Pica:: break; } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.texture.handle; state.Apply(); @@ -765,10 +762,9 @@ void RasterizerOpenGL::SyncDrawState() { const auto& texture = pica_textures[texture_index]; if (texture.enabled) { - state.texture_units[texture_index].enabled_2d = true; res_cache.LoadAndBindTexture(state, texture_index, texture); } else { - state.texture_units[texture_index].enabled_2d = false; + state.texture_units[texture_index].texture_2d = 0; } } @@ -803,7 +799,6 @@ void RasterizerOpenGL::ReloadColorBuffer() { } } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.Apply(); @@ -861,7 +856,6 @@ void RasterizerOpenGL::ReloadDepthBuffer() { } } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_depth_texture.texture.handle; state.Apply(); @@ -873,16 +867,19 @@ void RasterizerOpenGL::ReloadDepthBuffer() { state.Apply(); } +Common::Profiling::TimingCategory buffer_commit_category("Framebuffer Commit"); + void RasterizerOpenGL::CommitColorBuffer() { if (last_fb_color_addr != 0) { u8* color_buffer = Memory::GetPhysicalPointer(last_fb_color_addr); if (color_buffer != nullptr) { + Common::Profiling::ScopeTimer timer(buffer_commit_category); + u32 bytes_per_pixel = Pica::Regs::BytesPerColorPixel(fb_color_texture.format); std::unique_ptr<u8[]> temp_gl_color_buffer(new u8[fb_color_texture.width * fb_color_texture.height * bytes_per_pixel]); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.Apply(); @@ -913,6 +910,8 @@ void RasterizerOpenGL::CommitDepthBuffer() { u8* depth_buffer = Memory::GetPhysicalPointer(last_fb_depth_addr); if (depth_buffer != nullptr) { + Common::Profiling::ScopeTimer timer(buffer_commit_category); + u32 bytes_per_pixel = Pica::Regs::BytesPerDepthPixel(fb_depth_texture.format); // OpenGL needs 4 bpp alignment for D24 @@ -920,7 +919,6 @@ void RasterizerOpenGL::CommitDepthBuffer() { std::unique_ptr<u8[]> temp_gl_depth_buffer(new u8[fb_depth_texture.width * fb_depth_texture.height * gl_bpp]); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = fb_depth_texture.texture.handle; state.Apply(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index ae7b26fc6..a02d5c856 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -9,7 +9,7 @@ #include "common/common_types.h" #include "video_core/hwrasterizer_base.h" -#include "video_core/vertex_shader.h" +#include "video_core/shader/shader_interpreter.h" #include "gl_state.h" #include "gl_rasterizer_cache.h" @@ -27,9 +27,9 @@ public: void Reset() override; /// Queues the primitive formed by the given vertices for rendering - void AddTriangle(const Pica::VertexShader::OutputVertex& v0, - const Pica::VertexShader::OutputVertex& v1, - const Pica::VertexShader::OutputVertex& v2) override; + void AddTriangle(const Pica::Shader::OutputVertex& v0, + const Pica::Shader::OutputVertex& v1, + const Pica::Shader::OutputVertex& v2) override; /// Draw the current batch of triangles void DrawTriangles() override; @@ -82,7 +82,7 @@ private: /// Structure that the hardware rendered vertices are composed of struct HardwareVertex { - HardwareVertex(const Pica::VertexShader::OutputVertex& v) { + HardwareVertex(const Pica::Shader::OutputVertex& v) { position[0] = v.pos.x.ToFloat32(); position[1] = v.pos.y.ToFloat32(); position[2] = v.pos.z.ToFloat32(); diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index dc3ffdf22..70f0ba5f1 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -30,6 +30,7 @@ void RasterizerCacheOpenGL::LoadAndBindTexture(OpenGLState &state, unsigned text new_texture->texture.Create(); state.texture_units[texture_unit].texture_2d = new_texture->texture.handle; state.Apply(); + glActiveTexture(GL_TEXTURE0 + texture_unit); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, PicaToGL::TextureFilterMode(config.config.mag_filter)); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, PicaToGL::TextureFilterMode(config.config.min_filter)); diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp deleted file mode 100644 index 8f4ae28a4..000000000 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_shader_util.h" - -// Textures -OGLTexture::OGLTexture() : handle(0) { -} - -OGLTexture::~OGLTexture() { - Release(); -} - -void OGLTexture::Create() { - if (handle != 0) { - return; - } - - glGenTextures(1, &handle); -} - -void OGLTexture::Release() { - glDeleteTextures(1, &handle); - handle = 0; -} - -// Shaders -OGLShader::OGLShader() : handle(0) { -} - -OGLShader::~OGLShader() { - Release(); -} - -void OGLShader::Create(const char* vert_shader, const char* frag_shader) { - if (handle != 0) { - return; - } - - handle = ShaderUtil::LoadShaders(vert_shader, frag_shader); -} - -void OGLShader::Release() { - glDeleteProgram(handle); - handle = 0; -} - -// Buffer objects -OGLBuffer::OGLBuffer() : handle(0) { -} - -OGLBuffer::~OGLBuffer() { - Release(); -} - -void OGLBuffer::Create() { - if (handle != 0) { - return; - } - - glGenBuffers(1, &handle); -} - -void OGLBuffer::Release() { - glDeleteBuffers(1, &handle); - handle = 0; -} - -// Vertex array objects -OGLVertexArray::OGLVertexArray() : handle(0) { -} - -OGLVertexArray::~OGLVertexArray() { - Release(); -} - -void OGLVertexArray::Create() { - if (handle != 0) { - return; - } - - glGenVertexArrays(1, &handle); -} - -void OGLVertexArray::Release() { - glDeleteVertexArrays(1, &handle); - handle = 0; -} - -// Framebuffers -OGLFramebuffer::OGLFramebuffer() : handle(0) { -} - -OGLFramebuffer::~OGLFramebuffer() { - Release(); -} - -void OGLFramebuffer::Create() { - if (handle != 0) { - return; - } - - glGenFramebuffers(1, &handle); -} - -void OGLFramebuffer::Release() { - glDeleteFramebuffers(1, &handle); - handle = 0; -} diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index 975720d0a..82173d59a 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -4,76 +4,130 @@ #pragma once +#include <utility> + #include "common/common_types.h" -#include "generated/gl_3_2_core.h" +#include "video_core/renderer_opengl/generated/gl_3_2_core.h" +#include "video_core/renderer_opengl/gl_shader_util.h" +#include "video_core/renderer_opengl/gl_state.h" -class OGLTexture : public NonCopyable { +class OGLTexture : private NonCopyable { public: - OGLTexture(); - ~OGLTexture(); + OGLTexture() = default; + OGLTexture(OGLTexture&& o) { std::swap(handle, o.handle); } + ~OGLTexture() { Release(); } + OGLTexture& operator=(OGLTexture&& o) { std::swap(handle, o.handle); return *this; } /// Creates a new internal OpenGL resource and stores the handle - void Create(); + void Create() { + if (handle != 0) return; + glGenTextures(1, &handle); + } /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle; + void Release() { + if (handle == 0) return; + glDeleteTextures(1, &handle); + OpenGLState::ResetTexture(handle); + handle = 0; + } + + GLuint handle = 0; }; -class OGLShader : public NonCopyable { +class OGLShader : private NonCopyable { public: - OGLShader(); - ~OGLShader(); + OGLShader() = default; + OGLShader(OGLShader&& o) { std::swap(handle, o.handle); } + ~OGLShader() { Release(); } + OGLShader& operator=(OGLShader&& o) { std::swap(handle, o.handle); return *this; } /// Creates a new internal OpenGL resource and stores the handle - void Create(const char* vert_shader, const char* frag_shader); + void Create(const char* vert_shader, const char* frag_shader) { + if (handle != 0) return; + handle = ShaderUtil::LoadShaders(vert_shader, frag_shader); + } /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle; + void Release() { + if (handle == 0) return; + glDeleteProgram(handle); + OpenGLState::ResetProgram(handle); + handle = 0; + } + + GLuint handle = 0; }; -class OGLBuffer : public NonCopyable { +class OGLBuffer : private NonCopyable { public: - OGLBuffer(); - ~OGLBuffer(); + OGLBuffer() = default; + OGLBuffer(OGLBuffer&& o) { std::swap(handle, o.handle); } + ~OGLBuffer() { Release(); } + OGLBuffer& operator=(OGLBuffer&& o) { std::swap(handle, o.handle); return *this; } /// Creates a new internal OpenGL resource and stores the handle - void Create(); + void Create() { + if (handle != 0) return; + glGenBuffers(1, &handle); + } /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle; + void Release() { + if (handle == 0) return; + glDeleteBuffers(1, &handle); + OpenGLState::ResetBuffer(handle); + handle = 0; + } + + GLuint handle = 0; }; -class OGLVertexArray : public NonCopyable { +class OGLVertexArray : private NonCopyable { public: - OGLVertexArray(); - ~OGLVertexArray(); + OGLVertexArray() = default; + OGLVertexArray(OGLVertexArray&& o) { std::swap(handle, o.handle); } + ~OGLVertexArray() { Release(); } + OGLVertexArray& operator=(OGLVertexArray&& o) { std::swap(handle, o.handle); return *this; } /// Creates a new internal OpenGL resource and stores the handle - void Create(); + void Create() { + if (handle != 0) return; + glGenVertexArrays(1, &handle); + } /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle; + void Release() { + if (handle == 0) return; + glDeleteVertexArrays(1, &handle); + OpenGLState::ResetVertexArray(handle); + handle = 0; + } + + GLuint handle = 0; }; -class OGLFramebuffer : public NonCopyable { +class OGLFramebuffer : private NonCopyable { public: - OGLFramebuffer(); - ~OGLFramebuffer(); + OGLFramebuffer() = default; + OGLFramebuffer(OGLFramebuffer&& o) { std::swap(handle, o.handle); } + ~OGLFramebuffer() { Release(); } + OGLFramebuffer& operator=(OGLFramebuffer&& o) { std::swap(handle, o.handle); return *this; } /// Creates a new internal OpenGL resource and stores the handle - void Create(); + void Create() { + if (handle != 0) return; + glGenFramebuffers(1, &handle); + } /// Deletes the internal OpenGL resource - void Release(); - - GLuint handle; + void Release() { + if (handle == 0) return; + glDeleteFramebuffers(1, &handle); + OpenGLState::ResetFramebuffer(handle); + handle = 0; + } + + GLuint handle = 0; }; diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 9efc15337..871324014 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -40,7 +40,6 @@ OpenGLState::OpenGLState() { logic_op = GL_COPY; for (auto& texture_unit : texture_units) { - texture_unit.enabled_2d = false; texture_unit.texture_2d = 0; } @@ -147,16 +146,9 @@ void OpenGLState::Apply() { // Textures for (unsigned texture_index = 0; texture_index < ARRAY_SIZE(texture_units); ++texture_index) { - if (texture_units[texture_index].enabled_2d != cur_state.texture_units[texture_index].enabled_2d || - texture_units[texture_index].texture_2d != cur_state.texture_units[texture_index].texture_2d) { - + if (texture_units[texture_index].texture_2d != cur_state.texture_units[texture_index].texture_2d) { glActiveTexture(GL_TEXTURE0 + texture_index); - - if (texture_units[texture_index].enabled_2d) { - glBindTexture(GL_TEXTURE_2D, texture_units[texture_index].texture_2d); - } else { - glBindTexture(GL_TEXTURE_2D, 0); - } + glBindTexture(GL_TEXTURE_2D, texture_units[texture_index].texture_2d); } } @@ -182,3 +174,35 @@ void OpenGLState::Apply() { cur_state = *this; } + +void OpenGLState::ResetTexture(GLuint id) { + for (auto& unit : cur_state.texture_units) { + if (unit.texture_2d == id) { + unit.texture_2d = 0; + } + } +} + +void OpenGLState::ResetProgram(GLuint id) { + if (cur_state.draw.shader_program == id) { + cur_state.draw.shader_program = 0; + } +} + +void OpenGLState::ResetBuffer(GLuint id) { + if (cur_state.draw.vertex_buffer == id) { + cur_state.draw.vertex_buffer = 0; + } +} + +void OpenGLState::ResetVertexArray(GLuint id) { + if (cur_state.draw.vertex_array == id) { + cur_state.draw.vertex_array = 0; + } +} + +void OpenGLState::ResetFramebuffer(GLuint id) { + if (cur_state.draw.framebuffer == id) { + cur_state.draw.framebuffer = 0; + } +} diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index 26b916360..3e2379021 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ -53,7 +53,6 @@ public: // 3 texture units - one for each that is used in PICA fragment shader emulation struct { - bool enabled_2d; // GL_TEXTURE_2D GLuint texture_2d; // GL_TEXTURE_BINDING_2D } texture_units[3]; @@ -74,6 +73,12 @@ public: /// Apply this state as the current OpenGL state void Apply(); + static void ResetTexture(GLuint id); + static void ResetProgram(GLuint id); + static void ResetBuffer(GLuint id); + static void ResetVertexArray(GLuint id); + static void ResetFramebuffer(GLuint id); + private: static OpenGLState cur_state; }; diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 96e12839a..79a940ff6 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -163,7 +163,6 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig& // only allows rows to have a memory alignement of 4. ASSERT(pixel_stride % 4 == 0); - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -191,7 +190,6 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig& */ void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, const TextureInfo& texture) { - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -239,7 +237,6 @@ void RendererOpenGL::InitOpenGLObjects() { // Allocation of storage is deferred until the first frame, when we // know the framebuffer size. - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -305,7 +302,6 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, UNIMPLEMENTED(); } - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); @@ -325,7 +321,6 @@ void RendererOpenGL::DrawSingleScreenRotated(const TextureInfo& texture, float x ScreenRectVertex(x+w, y+h, 0.f, 1.f), }; - state.texture_units[0].enabled_2d = true; state.texture_units[0].texture_2d = texture.handle; state.Apply(); diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp new file mode 100644 index 000000000..4e9836c80 --- /dev/null +++ b/src/video_core/shader/shader.cpp @@ -0,0 +1,180 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <memory> +#include <unordered_map> + +#include <boost/range/algorithm/fill.hpp> + +#include "common/hash.h" +#include "common/make_unique.h" +#include "common/profiler.h" + +#include "video_core/debug_utils/debug_utils.h" +#include "video_core/pica.h" +#include "video_core/video_core.h" + +#include "shader.h" +#include "shader_interpreter.h" + +#ifdef ARCHITECTURE_x86_64 +#include "shader_jit_x64.h" +#endif // ARCHITECTURE_x86_64 + +namespace Pica { + +namespace Shader { + +#ifdef ARCHITECTURE_x86_64 +static std::unordered_map<u64, CompiledShader*> shader_map; +static JitCompiler jit; +static CompiledShader* jit_shader; +#endif // ARCHITECTURE_x86_64 + +void Setup(UnitState<false>& state) { +#ifdef ARCHITECTURE_x86_64 + if (VideoCore::g_shader_jit_enabled) { + u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ + Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)) ^ + g_state.regs.vs.main_offset); + + auto iter = shader_map.find(cache_key); + if (iter != shader_map.end()) { + jit_shader = iter->second; + } else { + jit_shader = jit.Compile(); + shader_map.emplace(cache_key, jit_shader); + } + } +#endif // ARCHITECTURE_x86_64 +} + +void Shutdown() { + shader_map.clear(); +} + +static Common::Profiling::TimingCategory shader_category("Vertex Shader"); + +OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) { + auto& config = g_state.regs.vs; + + Common::Profiling::ScopeTimer timer(shader_category); + + state.program_counter = config.main_offset; + state.debug.max_offset = 0; + state.debug.max_opdesc_id = 0; + + // Setup input register table + const auto& attribute_register_map = config.input_register_map; + + // TODO: Instead of this cumbersome logic, just load the input data directly like + // for (int attr = 0; attr < num_attributes; ++attr) { input_attr[0] = state.registers.input[attribute_register_map.attribute0_register]; } + if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0]; + if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1]; + if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2]; + if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = input.attr[3]; + if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = input.attr[4]; + if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = input.attr[5]; + if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = input.attr[6]; + if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = input.attr[7]; + if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = input.attr[8]; + if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = input.attr[9]; + if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = input.attr[10]; + if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = input.attr[11]; + if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = input.attr[12]; + if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = input.attr[13]; + if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = input.attr[14]; + if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = input.attr[15]; + + state.conditional_code[0] = false; + state.conditional_code[1] = false; + +#ifdef ARCHITECTURE_x86_64 + if (VideoCore::g_shader_jit_enabled) + jit_shader(&state.registers); + else + RunInterpreter(state); +#else + RunInterpreter(state); +#endif // ARCHITECTURE_x86_64 + + // Setup output data + OutputVertex ret; + // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to + // figure out what those circumstances are and enable the remaining outputs then. + for (int i = 0; i < 7; ++i) { + const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here + + u32 semantics[4] = { + output_register_map.map_x, output_register_map.map_y, + output_register_map.map_z, output_register_map.map_w + }; + + for (int comp = 0; comp < 4; ++comp) { + float24* out = ((float24*)&ret) + semantics[comp]; + if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { + *out = state.registers.output[i][comp]; + } else { + // Zero output so that attributes which aren't output won't have denormals in them, + // which would slow us down later. + memset(out, 0, sizeof(*out)); + } + } + } + + // The hardware takes the absolute and saturates vertex colors like this, *before* doing interpolation + for (int i = 0; i < 4; ++i) { + ret.color[i] = float24::FromFloat32( + std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f)); + } + + LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), quat (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", + ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), + ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(), + ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), + ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); + + return ret; +} + +DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) { + UnitState<true> state; + + const auto& shader_memory = setup.program_code; + state.program_counter = config.main_offset; + state.debug.max_offset = 0; + state.debug.max_opdesc_id = 0; + + // Setup input register table + const auto& attribute_register_map = config.input_register_map; + float24 dummy_register; + boost::fill(state.registers.input, &dummy_register); + + if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = &input.attr[0].x; + if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = &input.attr[1].x; + if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = &input.attr[2].x; + if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = &input.attr[3].x; + if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = &input.attr[4].x; + if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = &input.attr[5].x; + if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = &input.attr[6].x; + if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = &input.attr[7].x; + if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = &input.attr[8].x; + if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = &input.attr[9].x; + if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = &input.attr[10].x; + if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = &input.attr[11].x; + if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = &input.attr[12].x; + if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = &input.attr[13].x; + if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = &input.attr[14].x; + if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = &input.attr[15].x; + + state.conditional_code[0] = false; + state.conditional_code[1] = false; + + RunInterpreter(state); + return state.debug; +} + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h new file mode 100644 index 000000000..58d21f7cd --- /dev/null +++ b/src/video_core/shader/shader.h @@ -0,0 +1,352 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <vector> + +#include <boost/container/static_vector.hpp> + +#include <nihstro/shader_binary.h> + +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "common/vector_math.h" + +#include "video_core/pica.h" + +using nihstro::RegisterType; +using nihstro::SourceRegister; +using nihstro::DestRegister; + +namespace Pica { + +namespace Shader { + +struct InputVertex { + Math::Vec4<float24> attr[16]; +}; + +struct OutputVertex { + OutputVertex() = default; + + // VS output attributes + Math::Vec4<float24> pos; + Math::Vec4<float24> dummy; // quaternions (not implemented, yet) + Math::Vec4<float24> color; + Math::Vec2<float24> tc0; + Math::Vec2<float24> tc1; + float24 pad[6]; + Math::Vec2<float24> tc2; + + // Padding for optimal alignment + float24 pad2[4]; + + // Attributes used to store intermediate results + + // position after perspective divide + Math::Vec3<float24> screenpos; + float24 pad3; + + // Linear interpolation + // factor: 0=this, 1=vtx + void Lerp(float24 factor, const OutputVertex& vtx) { + pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); + + // TODO: Should perform perspective correct interpolation here... + tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); + tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); + tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); + + screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); + + color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); + } + + // Linear interpolation + // factor: 0=v0, 1=v1 + static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { + OutputVertex ret = v0; + ret.Lerp(factor, v1); + return ret; + } +}; +static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); +static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); + + +// Helper structure used to keep track of data useful for inspection of shader emulation +template<bool full_debugging> +struct DebugData; + +template<> +struct DebugData<false> { + // TODO: Hide these behind and interface and move them to DebugData<true> + u32 max_offset; // maximum program counter ever reached + u32 max_opdesc_id; // maximum swizzle pattern index ever used +}; + +template<> +struct DebugData<true> { + // Records store the input and output operands of a particular instruction. + struct Record { + enum Type { + // Floating point arithmetic operands + SRC1 = 0x1, + SRC2 = 0x2, + SRC3 = 0x4, + + // Initial and final output operand value + DEST_IN = 0x8, + DEST_OUT = 0x10, + + // Current and next instruction offset (in words) + CUR_INSTR = 0x20, + NEXT_INSTR = 0x40, + + // Output address register value + ADDR_REG_OUT = 0x80, + + // Result of a comparison instruction + CMP_RESULT = 0x100, + + // Input values for conditional flow control instructions + COND_BOOL_IN = 0x200, + COND_CMP_IN = 0x400, + + // Input values for a loop + LOOP_INT_IN = 0x800, + }; + + Math::Vec4<float24> src1; + Math::Vec4<float24> src2; + Math::Vec4<float24> src3; + + Math::Vec4<float24> dest_in; + Math::Vec4<float24> dest_out; + + s32 address_registers[2]; + bool conditional_code[2]; + bool cond_bool; + bool cond_cmp[2]; + Math::Vec4<u8> loop_int; + + u32 instruction_offset; + u32 next_instruction; + + // set of enabled fields (as a combination of Type flags) + unsigned mask = 0; + }; + + u32 max_offset; // maximum program counter ever reached + u32 max_opdesc_id; // maximum swizzle pattern index ever used + + // List of records for each executed shader instruction + std::vector<DebugData<true>::Record> records; +}; + +// Type alias for better readability +using DebugDataRecord = DebugData<true>::Record; + +// Helper function to set a DebugData<true>::Record field based on the template enum parameter. +template<DebugDataRecord::Type type, typename ValueType> +inline void SetField(DebugDataRecord& record, ValueType value); + +template<> +inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) { + record.src1.x = value[0]; + record.src1.y = value[1]; + record.src1.z = value[2]; + record.src1.w = value[3]; +} + +template<> +inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) { + record.src2.x = value[0]; + record.src2.y = value[1]; + record.src2.z = value[2]; + record.src2.w = value[3]; +} + +template<> +inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) { + record.src3.x = value[0]; + record.src3.y = value[1]; + record.src3.z = value[2]; + record.src3.w = value[3]; +} + +template<> +inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) { + record.dest_in.x = value[0]; + record.dest_in.y = value[1]; + record.dest_in.z = value[2]; + record.dest_in.w = value[3]; +} + +template<> +inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) { + record.dest_out.x = value[0]; + record.dest_out.y = value[1]; + record.dest_out.z = value[2]; + record.dest_out.w = value[3]; +} + +template<> +inline void SetField<DebugDataRecord::ADDR_REG_OUT>(DebugDataRecord& record, s32* value) { + record.address_registers[0] = value[0]; + record.address_registers[1] = value[1]; +} + +template<> +inline void SetField<DebugDataRecord::CMP_RESULT>(DebugDataRecord& record, bool* value) { + record.conditional_code[0] = value[0]; + record.conditional_code[1] = value[1]; +} + +template<> +inline void SetField<DebugDataRecord::COND_BOOL_IN>(DebugDataRecord& record, bool value) { + record.cond_bool = value; +} + +template<> +inline void SetField<DebugDataRecord::COND_CMP_IN>(DebugDataRecord& record, bool* value) { + record.cond_cmp[0] = value[0]; + record.cond_cmp[1] = value[1]; +} + +template<> +inline void SetField<DebugDataRecord::LOOP_INT_IN>(DebugDataRecord& record, Math::Vec4<u8> value) { + record.loop_int = value; +} + +template<> +inline void SetField<DebugDataRecord::CUR_INSTR>(DebugDataRecord& record, u32 value) { + record.instruction_offset = value; +} + +template<> +inline void SetField<DebugDataRecord::NEXT_INSTR>(DebugDataRecord& record, u32 value) { + record.next_instruction = value; +} + +// Helper function to set debug information on the current shader iteration. +template<DebugDataRecord::Type type, typename ValueType> +inline void Record(DebugData<false>& debug_data, u32 offset, ValueType value) { + // Debugging disabled => nothing to do +} + +template<DebugDataRecord::Type type, typename ValueType> +inline void Record(DebugData<true>& debug_data, u32 offset, ValueType value) { + if (offset >= debug_data.records.size()) + debug_data.records.resize(offset + 1); + + SetField<type, ValueType>(debug_data.records[offset], value); + debug_data.records[offset].mask |= type; +} + + +/** + * This structure contains the state information that needs to be unique for a shader unit. The 3DS + * has four shader units that process shaders in parallel. At the present, Citra only implements a + * single shader unit that processes all shaders serially. Putting the state information in a struct + * here will make it easier for us to parallelize the shader processing later. + */ +template<bool Debug> +struct UnitState { + struct Registers { + // The registers are accessed by the shader JIT using SSE instructions, and are therefore + // required to be 16-byte aligned. + Math::Vec4<float24> MEMORY_ALIGNED16(input[16]); + Math::Vec4<float24> MEMORY_ALIGNED16(output[16]); + Math::Vec4<float24> MEMORY_ALIGNED16(temporary[16]); + } registers; + static_assert(std::is_pod<Registers>::value, "Structure is not POD"); + + u32 program_counter; + bool conditional_code[2]; + + // Two Address registers and one loop counter + // TODO: How many bits do these actually have? + s32 address_registers[3]; + + enum { + INVALID_ADDRESS = 0xFFFFFFFF + }; + + struct CallStackElement { + u32 final_address; // Address upon which we jump to return_address + u32 return_address; // Where to jump when leaving scope + u8 repeat_counter; // How often to repeat until this call stack element is removed + u8 loop_increment; // Which value to add to the loop counter after an iteration + // TODO: Should this be a signed value? Does it even matter? + u32 loop_address; // The address where we'll return to after each loop iteration + }; + + // TODO: Is there a maximal size for this? + boost::container::static_vector<CallStackElement, 16> call_stack; + + DebugData<Debug> debug; + + static int InputOffset(const SourceRegister& reg) { + switch (reg.GetRegisterType()) { + case RegisterType::Input: + return (int)offsetof(UnitState::Registers, input) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + case RegisterType::Temporary: + return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + default: + UNREACHABLE(); + return 0; + } + } + + static int OutputOffset(const DestRegister& reg) { + switch (reg.GetRegisterType()) { + case RegisterType::Output: + return (int)offsetof(UnitState::Registers, output) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + case RegisterType::Temporary: + return (int)offsetof(UnitState::Registers, temporary) + reg.GetIndex()*sizeof(Math::Vec4<float24>); + + default: + UNREACHABLE(); + return 0; + } + } +}; + +/** + * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per + * vertex, which would happen within the `Run` function). + * @param state Shader unit state, must be setup per shader and per shader unit + */ +void Setup(UnitState<false>& state); + +/// Performs any cleanup when the emulator is shutdown +void Shutdown(); + +/** + * Runs the currently setup shader + * @param state Shader unit state, must be setup per shader and per shader unit + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @return The output vertex, after having been processed by the vertex shader + */ +OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes); + +/** + * Produce debug information based on the given shader and input vertex + * @param input Input vertex into the shader + * @param num_attributes The number of vertex shader attributes + * @param config Configuration object for the shader pipeline + * @param setup Setup object for the shader pipeline + * @return Debug information for this shader with regards to the given vertex + */ +DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup); + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/shader/shader_interpreter.cpp index e73a1d365..e14de0768 100644 --- a/src/video_core/vertex_shader.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -2,18 +2,14 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <boost/container/static_vector.hpp> -#include <boost/range/algorithm.hpp> - #include <common/file_util.h> #include <nihstro/shader_bytecode.h> -#include "common/profiler.h" +#include "video_core/pica.h" -#include "pica.h" -#include "vertex_shader.h" -#include "debug_utils/debug_utils.h" +#include "shader.h" +#include "shader_interpreter.h" using nihstro::OpCode; using nihstro::Instruction; @@ -23,44 +19,10 @@ using nihstro::SwizzlePattern; namespace Pica { -namespace VertexShader { - -struct VertexShaderState { - u32 program_counter; - - const float24* input_register_table[16]; - Math::Vec4<float24> output_registers[16]; - - Math::Vec4<float24> temporary_registers[16]; - bool conditional_code[2]; - - // Two Address registers and one loop counter - // TODO: How many bits do these actually have? - s32 address_registers[3]; - - enum { - INVALID_ADDRESS = 0xFFFFFFFF - }; - - struct CallStackElement { - u32 final_address; // Address upon which we jump to return_address - u32 return_address; // Where to jump when leaving scope - u8 repeat_counter; // How often to repeat until this call stack element is removed - u8 loop_increment; // Which value to add to the loop counter after an iteration - // TODO: Should this be a signed value? Does it even matter? - u32 loop_address; // The address where we'll return to after each loop iteration - }; +namespace Shader { - // TODO: Is there a maximal size for this? - boost::container::static_vector<CallStackElement, 16> call_stack; - - struct { - u32 max_offset; // maximum program counter ever reached - u32 max_opdesc_id; // maximum swizzle pattern index ever used - } debug; -}; - -static void ProcessShaderCode(VertexShaderState& state) { +template<bool Debug> +void RunInterpreter(UnitState<Debug>& state) { const auto& uniforms = g_state.vs.uniforms; const auto& swizzle_data = g_state.vs.swizzle_data; const auto& program_code = g_state.vs.program_code; @@ -68,7 +30,9 @@ static void ProcessShaderCode(VertexShaderState& state) { // Placeholder for invalid inputs static float24 dummy_vec4_float24[4]; - while (true) { + unsigned iteration = 0; + bool exit_loop = false; + while (!exit_loop) { if (!state.call_stack.empty()) { auto& top = state.call_stack.back(); if (state.program_counter == top.final_address) { @@ -86,25 +50,28 @@ static void ProcessShaderCode(VertexShaderState& state) { } } - bool exit_loop = false; const Instruction instr = { program_code[state.program_counter] }; const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] }; - static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions, + static auto call = [](UnitState<Debug>& state, u32 offset, u32 num_instructions, u32 return_offset, u8 repeat_count, u8 loop_increment) { state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset ASSERT(state.call_stack.size() < state.call_stack.capacity()); state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset }); }; + Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, state.program_counter); + if (iteration > 0) + Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, state.program_counter); + state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter); auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { switch (source_reg.GetRegisterType()) { case RegisterType::Input: - return state.input_register_table[source_reg.GetIndex()]; + return &state.registers.input[source_reg.GetIndex()].x; case RegisterType::Temporary: - return &state.temporary_registers[source_reg.GetIndex()].x; + return &state.registers.temporary[source_reg.GetIndex()].x; case RegisterType::FloatUniform: return &uniforms.f[source_reg.GetIndex()].x; @@ -153,8 +120,8 @@ static void ProcessShaderCode(VertexShaderState& state) { src2[3] = src2[3] * float24::FromFloat32(-1); } - float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers[instr.common.dest.Value().GetIndex()][0] - : (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0] + float24* dest = (instr.common.dest.Value() < 0x10) ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] + : (instr.common.dest.Value() < 0x20) ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] : dummy_vec4_float24; state.debug.max_opdesc_id = std::max<u32>(state.debug.max_opdesc_id, 1+instr.common.operand_desc_id); @@ -162,58 +129,78 @@ static void ProcessShaderCode(VertexShaderState& state) { switch (instr.opcode.Value().EffectiveOpCode()) { case OpCode::Id::ADD: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = src1[i] + src2[i]; } - + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } case OpCode::Id::MUL: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = src1[i] * src2[i]; } - + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } case OpCode::Id::FLR: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32())); } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; case OpCode::Id::MAX: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = std::max(src1[i], src2[i]); } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; case OpCode::Id::MIN: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = std::min(src1[i], src2[i]); } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; case OpCode::Id::DP3: case OpCode::Id::DP4: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); float24 dot = float24::FromFloat32(0.f); int num_components = (instr.opcode.Value() == OpCode::Id::DP3) ? 3 : 4; for (int i = 0; i < num_components; ++i) @@ -225,12 +212,15 @@ static void ProcessShaderCode(VertexShaderState& state) { dest[i] = dot; } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } // Reciprocal case OpCode::Id::RCP: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; @@ -239,13 +229,15 @@ static void ProcessShaderCode(VertexShaderState& state) { // TODO: I think this might be wrong... we should only use one component here dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32()); } - + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } // Reciprocal Square Root case OpCode::Id::RSQ: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; @@ -254,12 +246,13 @@ static void ProcessShaderCode(VertexShaderState& state) { // TODO: I think this might be wrong... we should only use one component here dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32())); } - + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } case OpCode::Id::MOVA: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); for (int i = 0; i < 2; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; @@ -267,32 +260,41 @@ static void ProcessShaderCode(VertexShaderState& state) { // TODO: Figure out how the rounding is done on hardware state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32()); } - + Record<DebugDataRecord::ADDR_REG_OUT>(state.debug, iteration, state.address_registers); break; } case OpCode::Id::MOV: { + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = src1[i]; } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; } case OpCode::Id::SLT: case OpCode::Id::SLTI: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f); } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); break; case OpCode::Id::CMP: + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); for (int i = 0; i < 2; ++i) { // TODO: Can you restrict to one compare via dest masking? @@ -300,27 +302,27 @@ static void ProcessShaderCode(VertexShaderState& state) { auto op = (i == 0) ? compare_op.x.Value() : compare_op.y.Value(); switch (op) { - case compare_op.Equal: + case Instruction::Common::CompareOpType::Equal: state.conditional_code[i] = (src1[i] == src2[i]); break; - case compare_op.NotEqual: + case Instruction::Common::CompareOpType::NotEqual: state.conditional_code[i] = (src1[i] != src2[i]); break; - case compare_op.LessThan: + case Instruction::Common::CompareOpType::LessThan: state.conditional_code[i] = (src1[i] < src2[i]); break; - case compare_op.LessEqual: + case Instruction::Common::CompareOpType::LessEqual: state.conditional_code[i] = (src1[i] <= src2[i]); break; - case compare_op.GreaterThan: + case Instruction::Common::CompareOpType::GreaterThan: state.conditional_code[i] = (src1[i] > src2[i]); break; - case compare_op.GreaterEqual: + case Instruction::Common::CompareOpType::GreaterEqual: state.conditional_code[i] = (src1[i] >= src2[i]); break; @@ -329,6 +331,7 @@ static void ProcessShaderCode(VertexShaderState& state) { break; } } + Record<DebugDataRecord::CMP_RESULT>(state.debug, iteration, state.conditional_code); break; default: @@ -394,16 +397,21 @@ static void ProcessShaderCode(VertexShaderState& state) { src3[3] = src3[3] * float24::FromFloat32(-1); } - float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers[instr.mad.dest.Value().GetIndex()][0] - : (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0] + float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] + : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] : dummy_vec4_float24; + Record<DebugDataRecord::SRC1>(state.debug, iteration, src1); + Record<DebugDataRecord::SRC2>(state.debug, iteration, src2); + Record<DebugDataRecord::SRC3>(state.debug, iteration, src3); + Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest); for (int i = 0; i < 4; ++i) { if (!swizzle.DestComponentEnabled(i)) continue; dest[i] = src1[i] * src2[i] + src3[i]; } + Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); } else { LOG_ERROR(HW_GPU, "Unhandled multiply-add instruction: 0x%02x (%s): 0x%08x", (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex); @@ -413,7 +421,7 @@ static void ProcessShaderCode(VertexShaderState& state) { default: { - static auto evaluate_condition = [](const VertexShaderState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { + static auto evaluate_condition = [](const UnitState<Debug>& state, bool refx, bool refy, Instruction::FlowControlType flow_control) { bool results[2] = { refx == state.conditional_code[0], refy == state.conditional_code[1] }; @@ -439,12 +447,14 @@ static void ProcessShaderCode(VertexShaderState& state) { break; case OpCode::Id::JMPC: + Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { state.program_counter = instr.flow_control.dest_offset - 1; } break; case OpCode::Id::JMPU: + Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id]) { state.program_counter = instr.flow_control.dest_offset - 1; } @@ -458,6 +468,7 @@ static void ProcessShaderCode(VertexShaderState& state) { break; case OpCode::Id::CALLU: + Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id]) { call(state, instr.flow_control.dest_offset, @@ -467,6 +478,7 @@ static void ProcessShaderCode(VertexShaderState& state) { break; case OpCode::Id::CALLC: + Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { call(state, instr.flow_control.dest_offset, @@ -479,6 +491,7 @@ static void ProcessShaderCode(VertexShaderState& state) { break; case OpCode::Id::IFU: + Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); if (uniforms.b[instr.flow_control.bool_uniform_id]) { call(state, state.program_counter + 1, @@ -497,6 +510,7 @@ static void ProcessShaderCode(VertexShaderState& state) { { // TODO: Do we need to consider swizzlers here? + Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code); if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) { call(state, state.program_counter + 1, @@ -514,14 +528,19 @@ static void ProcessShaderCode(VertexShaderState& state) { case OpCode::Id::LOOP: { - state.address_registers[2] = uniforms.i[instr.flow_control.int_uniform_id].y; + Math::Vec4<u8> loop_param(uniforms.i[instr.flow_control.int_uniform_id].x, + uniforms.i[instr.flow_control.int_uniform_id].y, + uniforms.i[instr.flow_control.int_uniform_id].z, + uniforms.i[instr.flow_control.int_uniform_id].w); + state.address_registers[2] = loop_param.y; + Record<DebugDataRecord::LOOP_INT_IN>(state.debug, iteration, loop_param); call(state, state.program_counter + 1, instr.flow_control.dest_offset - state.program_counter + 1, instr.flow_control.dest_offset + 1, - uniforms.i[instr.flow_control.int_uniform_id].x, - uniforms.i[instr.flow_control.int_uniform_id].z); + loop_param.x, + loop_param.z); break; } @@ -536,85 +555,13 @@ static void ProcessShaderCode(VertexShaderState& state) { } ++state.program_counter; - - if (exit_loop) - break; + ++iteration; } } -static Common::Profiling::TimingCategory shader_category("Vertex Shader"); - -OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) { - Common::Profiling::ScopeTimer timer(shader_category); - - VertexShaderState state; - - state.program_counter = config.main_offset; - state.debug.max_offset = 0; - state.debug.max_opdesc_id = 0; - - // Setup input register table - const auto& attribute_register_map = config.input_register_map; - float24 dummy_register; - boost::fill(state.input_register_table, &dummy_register); - - if (num_attributes > 0) state.input_register_table[attribute_register_map.attribute0_register] = &input.attr[0].x; - if (num_attributes > 1) state.input_register_table[attribute_register_map.attribute1_register] = &input.attr[1].x; - if (num_attributes > 2) state.input_register_table[attribute_register_map.attribute2_register] = &input.attr[2].x; - if (num_attributes > 3) state.input_register_table[attribute_register_map.attribute3_register] = &input.attr[3].x; - if (num_attributes > 4) state.input_register_table[attribute_register_map.attribute4_register] = &input.attr[4].x; - if (num_attributes > 5) state.input_register_table[attribute_register_map.attribute5_register] = &input.attr[5].x; - if (num_attributes > 6) state.input_register_table[attribute_register_map.attribute6_register] = &input.attr[6].x; - if (num_attributes > 7) state.input_register_table[attribute_register_map.attribute7_register] = &input.attr[7].x; - if (num_attributes > 8) state.input_register_table[attribute_register_map.attribute8_register] = &input.attr[8].x; - if (num_attributes > 9) state.input_register_table[attribute_register_map.attribute9_register] = &input.attr[9].x; - if (num_attributes > 10) state.input_register_table[attribute_register_map.attribute10_register] = &input.attr[10].x; - if (num_attributes > 11) state.input_register_table[attribute_register_map.attribute11_register] = &input.attr[11].x; - if (num_attributes > 12) state.input_register_table[attribute_register_map.attribute12_register] = &input.attr[12].x; - if (num_attributes > 13) state.input_register_table[attribute_register_map.attribute13_register] = &input.attr[13].x; - if (num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; - if (num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; - - state.conditional_code[0] = false; - state.conditional_code[1] = false; - - ProcessShaderCode(state); - DebugUtils::DumpShader(setup.program_code.data(), state.debug.max_offset, setup.swizzle_data.data(), - state.debug.max_opdesc_id, config.main_offset, - g_state.regs.vs_output_attributes); // TODO: Don't hardcode VS here - - // Setup output data - OutputVertex ret; - // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to - // figure out what those circumstances are and enable the remaining outputs then. - for (int i = 0; i < 7; ++i) { - const auto& output_register_map = g_state.regs.vs_output_attributes[i]; // TODO: Don't hardcode VS here - - u32 semantics[4] = { - output_register_map.map_x, output_register_map.map_y, - output_register_map.map_z, output_register_map.map_w - }; - - for (int comp = 0; comp < 4; ++comp) { - float24* out = ((float24*)&ret) + semantics[comp]; - if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { - *out = state.output_registers[i][comp]; - } else { - // Zero output so that attributes which aren't output won't have denormals in them, - // which would slow us down later. - memset(out, 0, sizeof(*out)); - } - } - } - - LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", - ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), - ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), - ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32()); - - return ret; -} - +// Explicit instantiation +template void RunInterpreter(UnitState<false>& state); +template void RunInterpreter(UnitState<true>& state); } // namespace diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h new file mode 100644 index 000000000..71bcad5ac --- /dev/null +++ b/src/video_core/shader/shader_interpreter.h @@ -0,0 +1,20 @@ +// Copyright 2014 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "video_core/pica.h" + +#include "shader.h" + +namespace Pica { + +namespace Shader { + +template<bool Debug> +void RunInterpreter(UnitState<Debug>& state); + +} // namespace + +} // namespace diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp new file mode 100644 index 000000000..836942c6b --- /dev/null +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -0,0 +1,675 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <smmintrin.h> + +#include "common/x64/abi.h" +#include "common/x64/cpu_detect.h" +#include "common/x64/emitter.h" + +#include "shader.h" +#include "shader_jit_x64.h" + +namespace Pica { + +namespace Shader { + +using namespace Gen; + +typedef void (JitCompiler::*JitFunction)(Instruction instr); + +const JitFunction instr_table[64] = { + &JitCompiler::Compile_ADD, // add + &JitCompiler::Compile_DP3, // dp3 + &JitCompiler::Compile_DP4, // dp4 + nullptr, // dph + nullptr, // unknown + nullptr, // ex2 + nullptr, // lg2 + nullptr, // unknown + &JitCompiler::Compile_MUL, // mul + nullptr, // lge + nullptr, // slt + &JitCompiler::Compile_FLR, // flr + &JitCompiler::Compile_MAX, // max + &JitCompiler::Compile_MIN, // min + &JitCompiler::Compile_RCP, // rcp + &JitCompiler::Compile_RSQ, // rsq + nullptr, // unknown + nullptr, // unknown + &JitCompiler::Compile_MOVA, // mova + &JitCompiler::Compile_MOV, // mov + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // dphi + nullptr, // unknown + nullptr, // sgei + &JitCompiler::Compile_SLTI, // slti + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + nullptr, // unknown + &JitCompiler::Compile_NOP, // nop + &JitCompiler::Compile_END, // end + nullptr, // break + &JitCompiler::Compile_CALL, // call + &JitCompiler::Compile_CALLC, // callc + &JitCompiler::Compile_CALLU, // callu + &JitCompiler::Compile_IF, // ifu + &JitCompiler::Compile_IF, // ifc + &JitCompiler::Compile_LOOP, // loop + nullptr, // emit + nullptr, // sete + &JitCompiler::Compile_JMP, // jmpc + &JitCompiler::Compile_JMP, // jmpu + &JitCompiler::Compile_CMP, // cmp + &JitCompiler::Compile_CMP, // cmp + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // madi + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad + &JitCompiler::Compile_MAD, // mad +}; + +// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can +// be used as scratch registers within a compiler function. The other registers have designated +// purposes, as documented below: + +/// Pointer to the uniform memory +static const X64Reg UNIFORMS = R9; +/// The two 32-bit VS address offset registers set by the MOVA instruction +static const X64Reg ADDROFFS_REG_0 = R10; +static const X64Reg ADDROFFS_REG_1 = R11; +/// VS loop count register +static const X64Reg LOOPCOUNT_REG = R12; +/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker) +static const X64Reg LOOPCOUNT = RSI; +/// Number to increment LOOPCOUNT_REG by on each loop iteration +static const X64Reg LOOPINC = RDI; +/// Result of the previous CMP instruction for the X-component comparison +static const X64Reg COND0 = R13; +/// Result of the previous CMP instruction for the Y-component comparison +static const X64Reg COND1 = R14; +/// Pointer to the UnitState instance for the current VS unit +static const X64Reg REGISTERS = R15; +/// SIMD scratch register +static const X64Reg SCRATCH = XMM0; +/// Loaded with the first swizzled source register, otherwise can be used as a scratch register +static const X64Reg SRC1 = XMM1; +/// Loaded with the second swizzled source register, otherwise can be used as a scratch register +static const X64Reg SRC2 = XMM2; +/// Loaded with the third swizzled source register, otherwise can be used as a scratch register +static const X64Reg SRC3 = XMM3; +/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one +static const X64Reg ONE = XMM14; +/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR +static const X64Reg NEGBIT = XMM15; + +/// Raw constant for the source register selector that indicates no swizzling is performed +static const u8 NO_SRC_REG_SWIZZLE = 0x1b; +/// Raw constant for the destination register enable mask that indicates all components are enabled +static const u8 NO_DEST_REG_MASK = 0xf; + +/** + * Loads and swizzles a source register into the specified XMM register. + * @param instr VS instruction, used for determining how to load the source register + * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) + * @param src_reg SourceRegister object corresponding to the source register to load + * @param dest Destination XMM register to store the loaded, swizzled source register + */ +void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) { + X64Reg src_ptr; + int src_offset; + + if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { + src_ptr = UNIFORMS; + src_offset = src_reg.GetIndex() * sizeof(float24) * 4; + } else { + src_ptr = REGISTERS; + src_offset = UnitState<false>::InputOffset(src_reg); + } + + unsigned operand_desc_id; + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || + instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + // The MAD and MADI instructions do not use the address offset registers, so loading the + // source is a bit simpler here + + operand_desc_id = instr.mad.operand_desc_id; + + // Load the source + MOVAPS(dest, MDisp(src_ptr, src_offset)); + } else { + operand_desc_id = instr.common.operand_desc_id; + + const bool is_inverted = (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); + unsigned offset_src = is_inverted ? 2 : 1; + + if (src_num == offset_src && instr.common.address_register_index != 0) { + switch (instr.common.address_register_index) { + case 1: // address offset 1 + MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_0, 1, src_offset)); + break; + case 2: // address offset 2 + MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_1, 1, src_offset)); + break; + case 3: // adddress offet 3 + MOVAPS(dest, MComplex(src_ptr, LOOPCOUNT_REG, 1, src_offset)); + break; + default: + UNREACHABLE(); + break; + } + } else { + // Load the source + MOVAPS(dest, MDisp(src_ptr, src_offset)); + } + } + + SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; + + // Generate instructions for source register swizzling as needed + u8 sel = swiz.GetRawSelector(src_num); + if (sel != NO_SRC_REG_SWIZZLE) { + // Selector component order needs to be reversed for the SHUFPS instruction + sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); + + // Shuffle inputs for swizzle + SHUFPS(dest, R(dest), sel); + } + + // If the source register should be negated, flip the negative bit using XOR + const bool negate[] = { swiz.negate_src1, swiz.negate_src2, swiz.negate_src3 }; + if (negate[src_num - 1]) { + XORPS(dest, R(NEGBIT)); + } +} + +void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { + DestRegister dest; + unsigned operand_desc_id; + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || + instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + operand_desc_id = instr.mad.operand_desc_id; + dest = instr.mad.dest.Value(); + } else { + operand_desc_id = instr.common.operand_desc_id; + dest = instr.common.dest.Value(); + } + + SwizzlePattern swiz = { g_state.vs.swizzle_data[operand_desc_id] }; + + // If all components are enabled, write the result to the destination register + if (swiz.dest_mask == NO_DEST_REG_MASK) { + // Store dest back to memory + MOVAPS(MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)), src); + + } else { + // Not all components are enabled, so mask the result when storing to the destination register... + MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState<false>::OutputOffset(dest))); + + if (Common::GetCPUCaps().sse4_1) { + u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); + BLENDPS(SCRATCH, R(src), mask); + } else { + MOVAPS(XMM4, R(src)); + UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination + UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination + + // Compute selector to selectively copy source components to destination for SHUFPS instruction + u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | + ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | + ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | + ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); + SHUFPS(SCRATCH, R(XMM4), sel); + } + + // Store dest back to memory + MOVAPS(MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)), SCRATCH); + } +} + +void JitCompiler::Compile_EvaluateCondition(Instruction instr) { + // Note: NXOR is used below to check for equality + switch (instr.flow_control.op) { + case Instruction::FlowControlType::Or: + MOV(32, R(RAX), R(COND0)); + MOV(32, R(RBX), R(COND1)); + XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); + XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); + OR(32, R(RAX), R(RBX)); + break; + + case Instruction::FlowControlType::And: + MOV(32, R(RAX), R(COND0)); + MOV(32, R(RBX), R(COND1)); + XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); + XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); + AND(32, R(RAX), R(RBX)); + break; + + case Instruction::FlowControlType::JustX: + MOV(32, R(RAX), R(COND0)); + XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); + break; + + case Instruction::FlowControlType::JustY: + MOV(32, R(RAX), R(COND1)); + XOR(32, R(RAX), Imm32(instr.flow_control.refy.Value() ^ 1)); + break; + } +} + +void JitCompiler::Compile_UniformCondition(Instruction instr) { + int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool)); + CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); +} + +void JitCompiler::Compile_ADD(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + ADDPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_DP3(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + if (Common::GetCPUCaps().sse4_1) { + DPPS(SRC1, R(SRC2), 0x7f); + } else { + MULPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); + + MOVAPS(SRC3, R(SRC1)); + SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); + + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); + ADDPS(SRC1, R(SRC2)); + ADDPS(SRC1, R(SRC3)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_DP4(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + if (Common::GetCPUCaps().sse4_1) { + DPPS(SRC1, R(SRC2), 0xff); + } else { + MULPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY + ADDPS(SRC1, R(SRC2)); + + MOVAPS(SRC2, R(SRC1)); + SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX + ADDPS(SRC1, R(SRC2)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MUL(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + MULPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_FLR(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + if (Common::GetCPUCaps().sse4_1) { + ROUNDFLOORPS(SRC1, R(SRC1)); + } else { + CVTPS2DQ(SRC1, R(SRC1)); + CVTDQ2PS(SRC1, R(SRC1)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MAX(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + MAXPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MIN(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + MINPS(SRC1, R(SRC2)); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_MOVA(Instruction instr) { + SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] }; + + if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { + return; // NoOp + } + + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // Convert floats to integers (only care about X and Y components) + CVTPS2DQ(SRC1, R(SRC1)); + + // Get result + MOVQ_xmm(R(RAX), SRC1); + + // Handle destination enable + if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { + // Move and sign-extend low 32 bits + MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); + + // Move and sign-extend high 32 bits + SHR(64, R(RAX), Imm8(32)); + MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); + + // Multiply by 16 to be used as an offset later + SHL(64, R(ADDROFFS_REG_0), Imm8(4)); + SHL(64, R(ADDROFFS_REG_1), Imm8(4)); + } else { + if (swiz.DestComponentEnabled(0)) { + // Move and sign-extend low 32 bits + MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); + + // Multiply by 16 to be used as an offset later + SHL(64, R(ADDROFFS_REG_0), Imm8(4)); + } else if (swiz.DestComponentEnabled(1)) { + // Move and sign-extend high 32 bits + SHR(64, R(RAX), Imm8(32)); + MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); + + // Multiply by 16 to be used as an offset later + SHL(64, R(ADDROFFS_REG_1), Imm8(4)); + } + } +} + +void JitCompiler::Compile_MOV(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_SLTI(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); + Compile_SwizzleSrc(instr, 1, instr.common.src2i, SRC2); + + CMPSS(SRC1, R(SRC2), CMP_LT); + ANDPS(SRC1, R(ONE)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_RCP(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // TODO(bunnei): RCPPS is a pretty rough approximation, this might cause problems if Pica + // performs this operation more accurately. This should be checked on hardware. + RCPPS(SRC1, R(SRC1)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_RSQ(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + + // TODO(bunnei): RSQRTPS is a pretty rough approximation, this might cause problems if Pica + // performs this operation more accurately. This should be checked on hardware. + RSQRTPS(SRC1, R(SRC1)); + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_NOP(Instruction instr) { +} + +void JitCompiler::Compile_END(Instruction instr) { + ABI_PopAllCalleeSavedRegsAndAdjustStack(); + RET(); +} + +void JitCompiler::Compile_CALL(Instruction instr) { + unsigned offset = instr.flow_control.dest_offset; + while (offset < (instr.flow_control.dest_offset + instr.flow_control.num_instructions)) { + Compile_NextInstr(&offset); + } +} + +void JitCompiler::Compile_CALLC(Instruction instr) { + Compile_EvaluateCondition(instr); + FixupBranch b = J_CC(CC_Z, true); + Compile_CALL(instr); + SetJumpTarget(b); +} + +void JitCompiler::Compile_CALLU(Instruction instr) { + Compile_UniformCondition(instr); + FixupBranch b = J_CC(CC_Z, true); + Compile_CALL(instr); + SetJumpTarget(b); +} + +void JitCompiler::Compile_CMP(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); + Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); + + static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; + + if (instr.common.compare_op.x == instr.common.compare_op.y) { + // Compare X-component and Y-component together + CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); + + MOVQ_xmm(R(COND0), SRC1); + MOV(64, R(COND1), R(COND0)); + } else { + // Compare X-component + MOVAPS(SCRATCH, R(SRC1)); + CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); + + // Compare Y-component + CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); + + MOVQ_xmm(R(COND0), SCRATCH); + MOVQ_xmm(R(COND1), SRC1); + } + + SHR(32, R(COND0), Imm8(31)); + SHR(64, R(COND1), Imm8(63)); +} + +void JitCompiler::Compile_MAD(Instruction instr) { + Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); + + if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { + Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); + Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); + } else { + Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); + Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); + } + + if (Common::GetCPUCaps().fma) { + VFMADD213PS(SRC1, SRC2, R(SRC3)); + } else { + MULPS(SRC1, R(SRC2)); + ADDPS(SRC1, R(SRC3)); + } + + Compile_DestEnable(instr, SRC1); +} + +void JitCompiler::Compile_IF(Instruction instr) { + ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements not supported"); + + // Evaluate the "IF" condition + if (instr.opcode.Value() == OpCode::Id::IFU) { + Compile_UniformCondition(instr); + } else if (instr.opcode.Value() == OpCode::Id::IFC) { + Compile_EvaluateCondition(instr); + } + FixupBranch b = J_CC(CC_Z, true); + + // Compile the code that corresponds to the condition evaluating as true + Compile_Block(instr.flow_control.dest_offset - 1); + + // If there isn't an "ELSE" condition, we are done here + if (instr.flow_control.num_instructions == 0) { + SetJumpTarget(b); + return; + } + + FixupBranch b2 = J(true); + + SetJumpTarget(b); + + // This code corresponds to the "ELSE" condition + // Comple the code that corresponds to the condition evaluating as false + Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions - 1); + + SetJumpTarget(b2); +} + +void JitCompiler::Compile_LOOP(Instruction instr) { + ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops not supported"); + ASSERT_MSG(!looping, "Nested loops not supported"); + + looping = true; + + int offset = offsetof(decltype(g_state.vs.uniforms), i) + (instr.flow_control.int_uniform_id * sizeof(Math::Vec4<u8>)); + MOV(32, R(LOOPCOUNT), MDisp(UNIFORMS, offset)); + MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT)); + SHR(32, R(LOOPCOUNT_REG), Imm8(8)); + AND(32, R(LOOPCOUNT_REG), Imm32(0xff)); // Y-component is the start + MOV(32, R(LOOPINC), R(LOOPCOUNT)); + SHR(32, R(LOOPINC), Imm8(16)); + MOVZX(32, 8, LOOPINC, R(LOOPINC)); // Z-component is the incrementer + MOVZX(32, 8, LOOPCOUNT, R(LOOPCOUNT)); // X-component is iteration count + ADD(32, R(LOOPCOUNT), Imm8(1)); // Iteration count is X-component + 1 + + auto loop_start = GetCodePtr(); + + Compile_Block(instr.flow_control.dest_offset); + + ADD(32, R(LOOPCOUNT_REG), R(LOOPINC)); // Increment LOOPCOUNT_REG by Z-component + SUB(32, R(LOOPCOUNT), Imm8(1)); // Increment loop count by 1 + J_CC(CC_NZ, loop_start); // Loop if not equal + + looping = false; +} + +void JitCompiler::Compile_JMP(Instruction instr) { + ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps not supported"); + + if (instr.opcode.Value() == OpCode::Id::JMPC) + Compile_EvaluateCondition(instr); + else if (instr.opcode.Value() == OpCode::Id::JMPU) + Compile_UniformCondition(instr); + else + UNREACHABLE(); + + FixupBranch b = J_CC(CC_NZ, true); + + Compile_Block(instr.flow_control.dest_offset); + + SetJumpTarget(b); +} + +void JitCompiler::Compile_Block(unsigned stop) { + // Save current offset pointer + unsigned* prev_offset_ptr = offset_ptr; + unsigned offset = *prev_offset_ptr; + + while (offset <= stop) + Compile_NextInstr(&offset); + + // Restore current offset pointer + offset_ptr = prev_offset_ptr; + *offset_ptr = offset; +} + +void JitCompiler::Compile_NextInstr(unsigned* offset) { + offset_ptr = offset; + + Instruction instr = *(Instruction*)&g_state.vs.program_code[(*offset_ptr)++]; + OpCode::Id opcode = instr.opcode.Value(); + auto instr_func = instr_table[static_cast<unsigned>(opcode)]; + + if (instr_func) { + // JIT the instruction! + ((*this).*instr_func)(instr); + } else { + // Unhandled instruction + LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", instr.opcode.Value(), instr.hex); + } +} + +CompiledShader* JitCompiler::Compile() { + const u8* start = GetCodePtr(); + const auto& code = g_state.vs.program_code; + unsigned offset = g_state.regs.vs.main_offset; + + ABI_PushAllCalleeSavedRegsAndAdjustStack(); + + MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); + MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); + + // Zero address/loop registers + XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0)); + XOR(64, R(ADDROFFS_REG_1), R(ADDROFFS_REG_1)); + XOR(64, R(LOOPCOUNT_REG), R(LOOPCOUNT_REG)); + + // Used to set a register to one + static const __m128 one = { 1.f, 1.f, 1.f, 1.f }; + MOV(PTRBITS, R(RAX), ImmPtr(&one)); + MOVAPS(ONE, MDisp(RAX, 0)); + + // Used to negate registers + static const __m128 neg = { -0.f, -0.f, -0.f, -0.f }; + MOV(PTRBITS, R(RAX), ImmPtr(&neg)); + MOVAPS(NEGBIT, MDisp(RAX, 0)); + + looping = false; + + while (offset < g_state.vs.program_code.size()) { + Compile_NextInstr(&offset); + } + + return (CompiledShader*)start; +} + +JitCompiler::JitCompiler() { + AllocCodeSpace(1024 * 1024 * 4); +} + +void JitCompiler::Clear() { + ClearCodeSpace(); +} + +} // namespace Shader + +} // namespace Pica diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h new file mode 100644 index 000000000..b88f2a0d2 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64.h @@ -0,0 +1,79 @@ +// Copyright 2015 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <nihstro/shader_bytecode.h> + +#include "common/x64/emitter.h" + +#include "video_core/pica.h" + +#include "shader.h" + +using nihstro::Instruction; +using nihstro::OpCode; +using nihstro::SwizzlePattern; + +namespace Pica { + +namespace Shader { + +using CompiledShader = void(void* registers); + +/** + * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 + * code that can be executed on the host machine directly. + */ +class JitCompiler : public Gen::XCodeBlock { +public: + JitCompiler(); + + CompiledShader* Compile(); + + void Clear(); + + void Compile_ADD(Instruction instr); + void Compile_DP3(Instruction instr); + void Compile_DP4(Instruction instr); + void Compile_MUL(Instruction instr); + void Compile_FLR(Instruction instr); + void Compile_MAX(Instruction instr); + void Compile_MIN(Instruction instr); + void Compile_RCP(Instruction instr); + void Compile_RSQ(Instruction instr); + void Compile_MOVA(Instruction instr); + void Compile_MOV(Instruction instr); + void Compile_SLTI(Instruction instr); + void Compile_NOP(Instruction instr); + void Compile_END(Instruction instr); + void Compile_CALL(Instruction instr); + void Compile_CALLC(Instruction instr); + void Compile_CALLU(Instruction instr); + void Compile_IF(Instruction instr); + void Compile_LOOP(Instruction instr); + void Compile_JMP(Instruction instr); + void Compile_CMP(Instruction instr); + void Compile_MAD(Instruction instr); + +private: + void Compile_Block(unsigned stop); + void Compile_NextInstr(unsigned* offset); + + void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); + void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); + + void Compile_EvaluateCondition(Instruction instr); + void Compile_UniformCondition(Instruction instr); + + /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks. + unsigned* offset_ptr = nullptr; + + /// Set to true if currently in a loop, used to check for the existence of nested loops + bool looping = false; +}; + +} // Shader + +} // Pica diff --git a/src/video_core/vertex_shader.h b/src/video_core/vertex_shader.h deleted file mode 100644 index 97f9250dd..000000000 --- a/src/video_core/vertex_shader.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2014 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <type_traits> - -#include "common/vector_math.h" - -#include "pica.h" - -namespace Pica { - -namespace VertexShader { - -struct InputVertex { - Math::Vec4<float24> attr[16]; -}; - -struct OutputVertex { - OutputVertex() = default; - - // VS output attributes - Math::Vec4<float24> pos; - Math::Vec4<float24> dummy; // quaternions (not implemented, yet) - Math::Vec4<float24> color; - Math::Vec2<float24> tc0; - Math::Vec2<float24> tc1; - float24 pad[6]; - Math::Vec2<float24> tc2; - - // Padding for optimal alignment - float24 pad2[4]; - - // Attributes used to store intermediate results - - // position after perspective divide - Math::Vec3<float24> screenpos; - float24 pad3; - - // Linear interpolation - // factor: 0=this, 1=vtx - void Lerp(float24 factor, const OutputVertex& vtx) { - pos = pos * factor + vtx.pos * (float24::FromFloat32(1) - factor); - - // TODO: Should perform perspective correct interpolation here... - tc0 = tc0 * factor + vtx.tc0 * (float24::FromFloat32(1) - factor); - tc1 = tc1 * factor + vtx.tc1 * (float24::FromFloat32(1) - factor); - tc2 = tc2 * factor + vtx.tc2 * (float24::FromFloat32(1) - factor); - - screenpos = screenpos * factor + vtx.screenpos * (float24::FromFloat32(1) - factor); - - color = color * factor + vtx.color * (float24::FromFloat32(1) - factor); - } - - // Linear interpolation - // factor: 0=v0, 1=v1 - static OutputVertex Lerp(float24 factor, const OutputVertex& v0, const OutputVertex& v1) { - OutputVertex ret = v0; - ret.Lerp(factor, v1); - return ret; - } -}; -static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); -static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); - -OutputVertex RunShader(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup); - -} // namespace - -} // namespace - diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index 3becc4261..943fde5ee 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -23,6 +23,7 @@ EmuWindow* g_emu_window = nullptr; ///< Frontend emulator window RendererBase* g_renderer = nullptr; ///< Renderer plugin std::atomic<bool> g_hw_renderer_enabled; +std::atomic<bool> g_shader_jit_enabled; /// Initialize the video core void Init(EmuWindow* emu_window) { diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h index 14b33c9dd..2867bf03e 100644 --- a/src/video_core/video_core.h +++ b/src/video_core/video_core.h @@ -32,8 +32,9 @@ static const int kScreenBottomHeight = 240; ///< 3DS bottom screen height extern RendererBase* g_renderer; ///< Renderer plugin extern EmuWindow* g_emu_window; ///< Emu window -// TODO: Wrap this in a user settings struct along with any other graphics settings (often set from qt ui) +// TODO: Wrap these in a user settings struct along with any other graphics settings (often set from qt ui) extern std::atomic<bool> g_hw_renderer_enabled; +extern std::atomic<bool> g_shader_jit_enabled; /// Start the video core void Start(); |
