From 2985e5e94c82febcf215feb0023f4184b38bb24a Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 13 Feb 2021 15:50:12 -0500 Subject: renderer_opengl: Accelerate ASTC texture decoding with a compute shader ASTC texture decoding is currently handled by a CPU decoder for GPUs without native ASTC decoding support (most desktop GPUs). This is the cause for noticeable performance degradation in titles which use the format extensively. This commit adds support to accelerate ASTC decoding using a compute shader on OpenGL for GPUs without native support. --- src/video_core/renderer_opengl/util_shaders.cpp | 99 ++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) (limited to 'src/video_core/renderer_opengl/util_shaders.cpp') diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 2fe4799bc..2a4220661 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -3,7 +3,10 @@ // Refer to the license.txt file included. 
#include +#include #include +#include +#include #include #include @@ -24,11 +27,13 @@ #include "video_core/texture_cache/accelerated_swizzle.h" #include "video_core/texture_cache/types.h" #include "video_core/texture_cache/util.h" +#include "video_core/textures/astc.h" #include "video_core/textures/decoders.h" namespace OpenGL { using namespace HostShaders; +using namespace Tegra::Texture::ASTC; using VideoCommon::Extent3D; using VideoCommon::ImageCopy; @@ -63,12 +68,104 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)), copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { + // TODO: Load shader string as a header + std::string astc_path = "astc_decoder.comp"; + std::ifstream t(astc_path); + std::string str((std::istreambuf_iterator(t)), std::istreambuf_iterator()); + astc_decoder_program = MakeProgram(str); + MakeBuffers(); +} + +UtilShaders::~UtilShaders() = default; + +void UtilShaders::MakeBuffers() { const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); swizzle_table_buffer.Create(); glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); + + astc_encodings_buffer.Create(); + glNamedBufferStorage(astc_encodings_buffer.handle, sizeof(EncodingsValues), &EncodingsValues, + 0); + replicate_6_to_8_buffer.Create(); + glNamedBufferStorage(replicate_6_to_8_buffer.handle, sizeof(REPLICATE_6_BIT_TO_8_TABLE), + &REPLICATE_6_BIT_TO_8_TABLE, 0); + replicate_7_to_8_buffer.Create(); + glNamedBufferStorage(replicate_7_to_8_buffer.handle, sizeof(REPLICATE_7_BIT_TO_8_TABLE), + &REPLICATE_7_BIT_TO_8_TABLE, 0); + replicate_8_to_8_buffer.Create(); + glNamedBufferStorage(replicate_8_to_8_buffer.handle, sizeof(REPLICATE_8_BIT_TO_8_TABLE), + &REPLICATE_8_BIT_TO_8_TABLE, 0); + replicate_byte_to_16_buffer.Create(); + glNamedBufferStorage(replicate_byte_to_16_buffer.handle, sizeof(REPLICATE_BYTE_TO_16_TABLE), + 
&REPLICATE_BYTE_TO_16_TABLE, 0); } -UtilShaders::~UtilShaders() = default; +void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, + std::span swizzles) { + static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; + static constexpr GLuint BINDING_INPUT_BUFFER = 1; + static constexpr GLuint BINDING_ENC_BUFFER = 2; + + static constexpr GLuint BINDING_6_TO_8_BUFFER = 3; + static constexpr GLuint BINDING_7_TO_8_BUFFER = 4; + static constexpr GLuint BINDING_8_TO_8_BUFFER = 5; + static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6; + + static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; + static constexpr GLuint LOC_NUM_IMAGE_BLOCKS = 0; + static constexpr GLuint LOC_BLOCK_DIMS = 1; + static constexpr GLuint LOC_LAYER = 2; + + const Extent3D tile_size = { + VideoCore::Surface::DefaultBlockWidth(image.info.format), + VideoCore::Surface::DefaultBlockHeight(image.info.format), + }; + program_manager.BindHostCompute(astc_decoder_program.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_encodings_buffer.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, + replicate_6_to_8_buffer.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, + replicate_7_to_8_buffer.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, + replicate_8_to_8_buffer.handle); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, + replicate_byte_to_16_buffer.handle); + + glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); + glUniform2ui(LOC_BLOCK_DIMS, tile_size.width, tile_size.height); + + for (u32 layer = 0; layer < image.info.resources.layers; layer++) { + for (const SwizzleParameters& swizzle : swizzles) { + glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_FALSE, + layer, GL_WRITE_ONLY, GL_RGBA8); + const size_t 
input_offset = swizzle.buffer_offset + map.offset; + const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); + const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); + + glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height); + glUniform1ui(LOC_LAYER, layer); + + // To unswizzle the ASTC data + const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); + glUniform3uiv(3, 1, params.origin.data()); + glUniform3iv(4, 1, params.destination.data()); + glUniform1ui(5, params.bytes_per_block_log2); + glUniform1ui(6, params.layer_stride); + glUniform1ui(7, params.block_size); + glUniform1ui(8, params.x_shift); + glUniform1ui(9, params.block_height); + glUniform1ui(10, params.block_height_mask); + + // ASTC texture data + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, + input_offset, image.guest_size_bytes - swizzle.buffer_offset); + + glDispatchCompute(num_dispatches_x, num_dispatches_y, 1); + } + } + program_manager.RestoreGuestCompute(); +} void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, std::span swizzles) { -- cgit v1.2.3 From f6566338ebd6559b0fbe61e1557ee735bf58dcdd Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 13 Feb 2021 15:52:21 -0500 Subject: host_shaders: Modify shader cmake integration to allow for larger shaders using a raw string to encapsulate the entire shader code limits us to shaders of size less than 2KB. This change overcomes this limitation. 
--- src/video_core/renderer_opengl/util_shaders.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'src/video_core/renderer_opengl/util_shaders.cpp') diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 2a4220661..d0979dab1 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -14,6 +14,7 @@ #include "common/assert.h" #include "common/common_types.h" #include "common/div_ceil.h" +#include "video_core/host_shaders/astc_decoder_comp.h" #include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h" #include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h" #include "video_core/host_shaders/opengl_copy_bc4_comp.h" @@ -62,17 +63,12 @@ size_t NumPixelsInCopy(const VideoCommon::ImageCopy& copy) { } // Anonymous namespace UtilShaders::UtilShaders(ProgramManager& program_manager_) - : program_manager{program_manager_}, + : program_manager{program_manager_}, astc_decoder_program(MakeProgram(ASTC_DECODER_COMP)), block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)), block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)), pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)), copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { - // TODO: Load shader string as a header - std::string astc_path = "astc_decoder.comp"; - std::ifstream t(astc_path); - std::string str((std::istreambuf_iterator(t)), std::istreambuf_iterator()); - astc_decoder_program = MakeProgram(str); MakeBuffers(); } -- cgit v1.2.3 From 2f30c105849c214345e2201f4bd6f9b4b76ab4a1 Mon Sep 17 00:00:00 2001 From: Rodrigo Locatti Date: Sat, 13 Feb 2021 16:08:50 -0500 Subject: astc_decoder: Reimplement Layers Reimplements the approach to decoding layers in the compute shader. Fixes multilayer astc decoding when using Vulkan. 
--- src/video_core/renderer_opengl/util_shaders.cpp | 53 +++++++++++-------------- 1 file changed, 24 insertions(+), 29 deletions(-) (limited to 'src/video_core/renderer_opengl/util_shaders.cpp') diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index d0979dab1..85722c54a 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -110,7 +110,6 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; static constexpr GLuint LOC_NUM_IMAGE_BLOCKS = 0; static constexpr GLuint LOC_BLOCK_DIMS = 1; - static constexpr GLuint LOC_LAYER = 2; const Extent3D tile_size = { VideoCore::Surface::DefaultBlockWidth(image.info.format), @@ -130,35 +129,31 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glUniform2ui(LOC_BLOCK_DIMS, tile_size.width, tile_size.height); + for (const SwizzleParameters& swizzle : swizzles) { + glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, + GL_WRITE_ONLY, GL_RGBA8); + const size_t input_offset = swizzle.buffer_offset + map.offset; + const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); + const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); - for (u32 layer = 0; layer < image.info.resources.layers; layer++) { - for (const SwizzleParameters& swizzle : swizzles) { - glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_FALSE, - layer, GL_WRITE_ONLY, GL_RGBA8); - const size_t input_offset = swizzle.buffer_offset + map.offset; - const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); - const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); - - glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, 
swizzle.num_tiles.height); - glUniform1ui(LOC_LAYER, layer); - - // To unswizzle the ASTC data - const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); - glUniform3uiv(3, 1, params.origin.data()); - glUniform3iv(4, 1, params.destination.data()); - glUniform1ui(5, params.bytes_per_block_log2); - glUniform1ui(6, params.layer_stride); - glUniform1ui(7, params.block_size); - glUniform1ui(8, params.x_shift); - glUniform1ui(9, params.block_height); - glUniform1ui(10, params.block_height_mask); - - // ASTC texture data - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, - input_offset, image.guest_size_bytes - swizzle.buffer_offset); - - glDispatchCompute(num_dispatches_x, num_dispatches_y, 1); - } + glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height); + + // To unswizzle the ASTC data + const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); + glUniform3uiv(2, 1, params.origin.data()); + glUniform3iv(3, 1, params.destination.data()); + glUniform1ui(4, params.bytes_per_block_log2); + glUniform1ui(5, params.layer_stride); + glUniform1ui(6, params.block_size); + glUniform1ui(7, params.x_shift); + glUniform1ui(8, params.block_height); + glUniform1ui(9, params.block_height_mask); + + // ASTC texture data + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); + + glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); } program_manager.RestoreGuestCompute(); } -- cgit v1.2.3 From 2f83d9a61bca42d9ef24074beb2b11b19bd4cecd Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 25 Mar 2021 16:53:51 -0400 Subject: astc_decoder: Refactor for style and more efficient memory use --- src/video_core/renderer_opengl/util_shaders.cpp | 96 ++++++++++--------------- 1 file changed, 39 insertions(+), 57 deletions(-) (limited to 
'src/video_core/renderer_opengl/util_shaders.cpp') diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 85722c54a..47fddcb6e 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -2,11 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include -#include #include -#include -#include #include #include @@ -24,7 +20,6 @@ #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/util_shaders.h" -#include "video_core/surface.h" #include "video_core/texture_cache/accelerated_swizzle.h" #include "video_core/texture_cache/types.h" #include "video_core/texture_cache/util.h" @@ -36,6 +31,7 @@ namespace OpenGL { using namespace HostShaders; using namespace Tegra::Texture::ASTC; +using VideoCommon::Extent2D; using VideoCommon::Extent3D; using VideoCommon::ImageCopy; using VideoCommon::ImageType; @@ -69,33 +65,15 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)), copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { - MakeBuffers(); -} - -UtilShaders::~UtilShaders() = default; - -void UtilShaders::MakeBuffers() { const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); swizzle_table_buffer.Create(); + astc_buffer.Create(); glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); - - astc_encodings_buffer.Create(); - glNamedBufferStorage(astc_encodings_buffer.handle, sizeof(EncodingsValues), &EncodingsValues, - 0); - replicate_6_to_8_buffer.Create(); - glNamedBufferStorage(replicate_6_to_8_buffer.handle, sizeof(REPLICATE_6_BIT_TO_8_TABLE), - &REPLICATE_6_BIT_TO_8_TABLE, 0); - replicate_7_to_8_buffer.Create(); - 
glNamedBufferStorage(replicate_7_to_8_buffer.handle, sizeof(REPLICATE_7_BIT_TO_8_TABLE), - &REPLICATE_7_BIT_TO_8_TABLE, 0); - replicate_8_to_8_buffer.Create(); - glNamedBufferStorage(replicate_8_to_8_buffer.handle, sizeof(REPLICATE_8_BIT_TO_8_TABLE), - &REPLICATE_8_BIT_TO_8_TABLE, 0); - replicate_byte_to_16_buffer.Create(); - glNamedBufferStorage(replicate_byte_to_16_buffer.handle, sizeof(REPLICATE_BYTE_TO_16_TABLE), - &REPLICATE_BYTE_TO_16_TABLE, 0); + glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_BUFFER_DATA), &ASTC_BUFFER_DATA, 0); } +UtilShaders::~UtilShaders() = default; + void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, std::span swizzles) { static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; @@ -108,47 +86,51 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, static constexpr GLuint BINDING_BYTE_TO_16_BUFFER = 6; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; - static constexpr GLuint LOC_NUM_IMAGE_BLOCKS = 0; - static constexpr GLuint LOC_BLOCK_DIMS = 1; - const Extent3D tile_size = { - VideoCore::Surface::DefaultBlockWidth(image.info.format), - VideoCore::Surface::DefaultBlockHeight(image.info.format), + const Extent2D tile_size{ + .width = VideoCore::Surface::DefaultBlockWidth(image.info.format), + .height = VideoCore::Surface::DefaultBlockHeight(image.info.format), }; program_manager.BindHostCompute(astc_decoder_program.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_encodings_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, - replicate_6_to_8_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, - replicate_7_to_8_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, - replicate_8_to_8_buffer.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, - 
replicate_byte_to_16_buffer.handle); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle, + offsetof(AstcBufferData, encoding_values), + sizeof(AstcBufferData::encoding_values)); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_6_TO_8_BUFFER, astc_buffer.handle, + offsetof(AstcBufferData, replicate_6_to_8), + sizeof(AstcBufferData::replicate_6_to_8)); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_7_TO_8_BUFFER, astc_buffer.handle, + offsetof(AstcBufferData, replicate_7_to_8), + sizeof(AstcBufferData::replicate_7_to_8)); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_8_TO_8_BUFFER, astc_buffer.handle, + offsetof(AstcBufferData, replicate_8_to_8), + sizeof(AstcBufferData::replicate_8_to_8)); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_BYTE_TO_16_BUFFER, astc_buffer.handle, + offsetof(AstcBufferData, replicate_byte_to_16), + sizeof(AstcBufferData::replicate_byte_to_16)); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); - glUniform2ui(LOC_BLOCK_DIMS, tile_size.width, tile_size.height); + glUniform2ui(1, tile_size.width, tile_size.height); + // Ensure buffer data is valid before dispatching + glFlush(); for (const SwizzleParameters& swizzle : swizzles) { - glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, - GL_WRITE_ONLY, GL_RGBA8); const size_t input_offset = swizzle.buffer_offset + map.offset; - const auto num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); - const auto num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); - - glUniform2ui(LOC_NUM_IMAGE_BLOCKS, swizzle.num_tiles.width, swizzle.num_tiles.height); + const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); + const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); - // To unswizzle the ASTC data const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); - glUniform3uiv(2, 1, 
params.origin.data()); - glUniform3iv(3, 1, params.destination.data()); - glUniform1ui(4, params.bytes_per_block_log2); - glUniform1ui(5, params.layer_stride); - glUniform1ui(6, params.block_size); - glUniform1ui(7, params.x_shift); - glUniform1ui(8, params.block_height); - glUniform1ui(9, params.block_height_mask); + ASSERT(params.origin == (std::array{0, 0, 0})); + ASSERT(params.destination == (std::array{0, 0, 0})); + glUniform1ui(2, params.bytes_per_block_log2); + glUniform1ui(3, params.layer_stride); + glUniform1ui(4, params.block_size); + glUniform1ui(5, params.x_shift); + glUniform1ui(6, params.block_height); + glUniform1ui(7, params.block_height_mask); + + glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, + GL_WRITE_ONLY, GL_RGBA8); // ASTC texture data glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); -- cgit v1.2.3