From 5665d055476fa793192523c3cb6fe06369d58674 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sun, 4 Jul 2021 22:48:41 -0400 Subject: astc_decoder: Optimize the use EncodingData This buffer was a list of EncodingData structures sorted by their bit length, with some duplication from the cpu decoder implementation. We can take advantage of its sorted property to optimize its usage in the shader. Thanks to wwylele for the optimization idea. --- src/video_core/host_shaders/astc_decoder.comp | 50 +++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'src/video_core/host_shaders') diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index c37f15bfd..7f4efa31a 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -10,18 +10,16 @@ #define END_PUSH_CONSTANTS }; #define UNIFORM(n) #define BINDING_INPUT_BUFFER 0 -#define BINDING_ENC_BUFFER 1 -#define BINDING_SWIZZLE_BUFFER 2 -#define BINDING_OUTPUT_IMAGE 3 +#define BINDING_SWIZZLE_BUFFER 1 +#define BINDING_OUTPUT_IMAGE 2 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv #define BEGIN_PUSH_CONSTANTS #define END_PUSH_CONSTANTS #define UNIFORM(n) layout(location = n) uniform -#define BINDING_SWIZZLE_BUFFER 0 -#define BINDING_INPUT_BUFFER 1 -#define BINDING_ENC_BUFFER 2 +#define BINDING_INPUT_BUFFER 0 +#define BINDING_SWIZZLE_BUFFER 1 #define BINDING_OUTPUT_IMAGE 0 #endif @@ -64,11 +62,6 @@ layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uint astc_data[]; }; -// ASTC Encodings data -layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues { - EncodingData encoding_values[]; -}; - layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; const uint GOB_SIZE_X = 64; @@ -94,6 +87,19 @@ const int JUST_BITS = 0; const int QUINT = 1; const int TRIT = 2; +// ASTC Encodings data, sorted in ascending order based on their BitLength value +// (see GetBitLength() function) +EncodingData encoding_values[22] = EncodingData[]( + EncodingData(JUST_BITS, 0, 0, 0), EncodingData(JUST_BITS, 1, 0, 0), EncodingData(TRIT, 0, 0, 0), + EncodingData(JUST_BITS, 2, 0, 0), EncodingData(QUINT, 0, 0, 0), EncodingData(TRIT, 1, 0, 0), + EncodingData(JUST_BITS, 3, 0, 0), EncodingData(QUINT, 1, 0, 0), EncodingData(TRIT, 2, 0, 0), + EncodingData(JUST_BITS, 4, 0, 0), EncodingData(QUINT, 2, 0, 0), EncodingData(TRIT, 3, 0, 0), + EncodingData(JUST_BITS, 5, 0, 0), EncodingData(QUINT, 3, 0, 0), EncodingData(TRIT, 4, 0, 0), + EncodingData(JUST_BITS, 6, 0, 0), EncodingData(QUINT, 4, 0, 0), EncodingData(TRIT, 5, 0, 0), + EncodingData(JUST_BITS, 7, 0, 0), EncodingData(QUINT, 5, 0, 0), EncodingData(TRIT, 6, 0, 0), + EncodingData(JUST_BITS, 8, 0, 0) +); + // The following constants are expanded variants of the Replicate() // function calls corresponding to the following arguments: // value: index into the generated table @@ -596,22 +602,16 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { for (uint i = 0; i < num_partitions; i++) { num_values += ((modes[i] >> 2) + 1) << 1; } - int range = 256; - while (--range > 0) { - EncodingData val = encoding_values[range]; + // Find the largest encoding that's within color_data_bits + // TODO(ameerj): profile with binary search + int range = 0; + while (++range < encoding_values.length()) { uint bit_length = GetBitLength(num_values, range); - if (bit_length <= color_data_bits) { - while (--range > 0) { - EncodingData newval = encoding_values[range]; - if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) { - break; - } - } - ++range; + if (bit_length > color_data_bits) { break; } } - DecodeIntegerSequence(range, num_values); + DecodeIntegerSequence(range - 1, num_values); uint out_index = 0; for (int itr = 0; itr < result_index; ++itr) { if (out_index >= num_values) { @@ -1110,10 +1110,10 @@ TexelWeightParams DecodeBlockInfo(uint block_index) { } weight_index -= 2; if ((mode_layout != 9) && ((mode & 0x200) != 0)) { - const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31); + const int max_weights[6] = int[6](7, 8, 9, 10, 11, 12); params.max_weight = max_weights[weight_index]; } else { - const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7); + const int max_weights[6] = int[6](1, 2, 3, 4, 5, 6); params.max_weight = max_weights[weight_index]; } return params; -- cgit v1.2.3 From a75d70fa9025baad7a80b700903148d1152b1b84 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Mon, 5 Jul 2021 18:51:51 -0400 Subject: astc_decoder: Simplify Select2DPartition --- src/video_core/host_shaders/astc_decoder.comp | 57 +++++++++------------------ 1 file changed, 19 insertions(+), 38 deletions(-) (limited to 'src/video_core/host_shaders') diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 7f4efa31a..8d8b64fbd 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -284,14 +284,10 @@ uint Hash52(uint p) { return p; } -uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) { - if (partition_count == 1) { - return 0; - } +uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { if (small_block) { x <<= 1; y <<= 1; - z <<= 1; } seed += (partition_count - 1) * 1024; @@ -305,10 +301,6 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo uint seed6 = uint((rnum >> 20) & 0xF); uint seed7 = uint((rnum >> 24) & 0xF); uint seed8 = uint((rnum >> 28) & 0xF); - uint seed9 = uint((rnum >> 18) & 0xF); - uint seed10 = uint((rnum >> 22) & 0xF); - uint seed11 = uint((rnum >> 26) & 0xF); - uint seed12 = uint(((rnum >> 30) | (rnum << 2)) & 0xF); seed1 = (seed1 * seed1); seed2 = (seed2 * seed2); @@ -318,12 +310,8 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo seed6 = (seed6 * seed6); seed7 = (seed7 * seed7); seed8 = (seed8 * seed8); - seed9 = (seed9 * seed9); - seed10 = (seed10 * seed10); - seed11 = (seed11 * seed11); - seed12 = (seed12 * seed12); - int sh1, sh2, sh3; + uint sh1, sh2; if ((seed & 1) > 0) { sh1 = (seed & 2) > 0 ? 4 : 5; sh2 = (partition_count == 3) ? 6 : 5; @@ -331,25 +319,19 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo sh1 = (partition_count == 3) ? 6 : 5; sh2 = (seed & 2) > 0 ? 4 : 5; } - sh3 = (seed & 0x10) > 0 ? sh1 : sh2; - - seed1 = (seed1 >> sh1); - seed2 = (seed2 >> sh2); - seed3 = (seed3 >> sh1); - seed4 = (seed4 >> sh2); - seed5 = (seed5 >> sh1); - seed6 = (seed6 >> sh2); - seed7 = (seed7 >> sh1); - seed8 = (seed8 >> sh2); - seed9 = (seed9 >> sh3); - seed10 = (seed10 >> sh3); - seed11 = (seed11 >> sh3); - seed12 = (seed12 >> sh3); - - uint a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); - uint b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); - uint c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); - uint d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); + seed1 >>= sh1; + seed2 >>= sh2; + seed3 >>= sh1; + seed4 >>= sh2; + seed5 >>= sh1; + seed6 >>= sh2; + seed7 >>= sh1; + seed8 >>= sh2; + + uint a = seed1 * x + seed2 * y + (rnum >> 14); + uint b = seed3 * x + seed4 * y + (rnum >> 10); + uint c = seed5 * x + seed6 * y + (rnum >> 6); + uint d = seed7 * x + seed8 * y + (rnum >> 2); a &= 0x3F; b &= 0x3F; @@ -374,10 +356,6 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo } } -uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { - return SelectPartition(seed, x, y, 0, partition_count, small_block); -} - uint ReadBit() { if (current_index >= local_buff.length()) { return 0; @@ -1281,8 +1259,11 @@ void DecompressBlock(ivec3 coord, uint block_index) { for (uint j = 0; j < block_dims.y; j++) { for (uint i = 0; i < block_dims.x; i++) { - uint local_partition = Select2DPartition(partition_index, i, j, num_partitions, + uint local_partition = 0; + if (num_partitions > 1) { + local_partition = Select2DPartition(partition_index, i, j, num_partitions, (block_dims.y * block_dims.x) < 32); + } vec4 p; uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]); uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]); -- cgit v1.2.3 From b2862e4772489f0ade41f79765d33ec4fc33712c Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Thu, 8 Jul 2021 00:31:35 -0400 Subject: astc_decoder: Make use of uvec4 for payload data --- src/video_core/host_shaders/astc_decoder.comp | 122 +++++++++----------------- 1 file changed, 43 insertions(+), 79 deletions(-) (limited to 'src/video_core/host_shaders') diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 8d8b64fbd..392f09c68 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -59,7 +59,7 @@ layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { }; layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { - uint astc_data[]; + uvec4 astc_data[]; }; layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; @@ -141,32 +141,28 @@ const uint REPLICATE_7_BIT_TO_8_TABLE[128] = // Input ASTC texture globals uint current_index = 0; int bitsread = 0; -uint total_bitsread = 0; -uint local_buff[16]; +int total_bitsread = 0; +uvec4 local_buff; // Color data globals -uint color_endpoint_data[16]; +uvec4 color_endpoint_data; int color_bitsread = 0; -uint total_color_bitsread = 0; -int color_index = 0; // Four values, two endpoints, four maximum paritions uint color_values[32]; int colvals_index = 0; // Weight data globals -uint texel_weight_data[16]; +uvec4 texel_weight_data; int texel_bitsread = 0; -uint total_texel_bitsread = 0; -int texel_index = 0; bool texel_flag = false; // Global "vectors" to be pushed into when decoding -EncodingData result_vector[100]; +EncodingData result_vector[144]; int result_index = 0; -EncodingData texel_vector[100]; +EncodingData texel_vector[144]; int texel_vector_index = 0; uint unquantized_texel_weights[2][144]; @@ -176,11 +172,6 @@ uint SwizzleOffset(uvec2 pos) { return swizzle_table[pos.y * 64 + pos.x]; } -uint ReadTexel(uint offset) { - // extract the 8-bit value from the 32-bit packed data. - return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8); -} - // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] // is the same as [(num_bits - 1):0] and repeats all the way down. uint Replicate(uint val, uint num_bits, uint to_bit) { @@ -356,54 +347,37 @@ uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool sma } } -uint ReadBit() { - if (current_index >= local_buff.length()) { +uint ExtractBits(uvec4 payload, int offset, int bits) { + if (bits <= 0) { return 0; } - uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1); - ++bitsread; - ++total_bitsread; - if (bitsread == 8) { - ++current_index; - bitsread = 0; + int last_offset = offset + bits - 1; + int shifted_offset = offset >> 5; + if ((last_offset >> 5) == shifted_offset) { + return bitfieldExtract(payload[shifted_offset], offset & 31, bits); } - return bit; + int first_bits = 32 - (offset & 31); + int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits)); + int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits)); + return result_first | (result_second << first_bits); } uint StreamBits(uint num_bits) { - uint ret = 0; - for (uint i = 0; i < num_bits; i++) { - ret |= ((ReadBit() & 1) << i); - } + int int_bits = int(num_bits); + uint ret = ExtractBits(local_buff, total_bitsread, int_bits); + total_bitsread += int_bits; return ret; } -uint ReadColorBit() { - uint bit = 0; - if (texel_flag) { - bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1); - ++texel_bitsread; - ++total_texel_bitsread; - if (texel_bitsread == 8) { - ++texel_index; - texel_bitsread = 0; - } - } else { - bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1); - ++color_bitsread; - ++total_color_bitsread; - if (color_bitsread == 8) { - ++color_index; - color_bitsread = 0; - } - } - return bit; -} - uint StreamColorBits(uint num_bits) { uint ret = 0; - for (uint i = 0; i < num_bits; i++) { - ret |= ((ReadColorBit() & 1) << i); + int int_bits = int(num_bits); + if (texel_flag) { + ret = ExtractBits(texel_weight_data, texel_bitsread, int_bits); + texel_bitsread += int_bits; + } else { + ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits); + color_bitsread += int_bits; } return ret; } @@ -1006,7 +980,7 @@ int FindLayout(uint mode) { return 5; } -TexelWeightParams DecodeBlockInfo(uint block_index) { +TexelWeightParams DecodeBlockInfo() { TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false); uint mode = StreamBits(11); if ((mode & 0x1ff) == 0x1fc) { @@ -1122,8 +1096,8 @@ void FillVoidExtentLDR(ivec3 coord) { } } -void DecompressBlock(ivec3 coord, uint block_index) { - TexelWeightParams params = DecodeBlockInfo(block_index); +void DecompressBlock(ivec3 coord) { + TexelWeightParams params = DecodeBlockInfo(); if (params.error_state) { FillError(coord); return; @@ -1190,7 +1164,7 @@ void DecompressBlock(ivec3 coord, uint block_index) { // Read color data... uint color_data_bits = remaining_bits; while (remaining_bits > 0) { - int nb = int(min(remaining_bits, 8U)); + int nb = int(min(remaining_bits, 32U)); uint b = StreamBits(nb); color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); ++ced_pointer; @@ -1232,25 +1206,20 @@ void DecompressBlock(ivec3 coord, uint block_index) { ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]); } - for (uint i = 0; i < 16; i++) { - texel_weight_data[i] = local_buff[i]; - } - for (uint i = 0; i < 8; i++) { -#define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16 - uint a = REVERSE_BYTE(texel_weight_data[i]); - uint b = REVERSE_BYTE(texel_weight_data[15 - i]); -#undef REVERSE_BYTE - texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8)); - texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8)); - } + texel_weight_data = local_buff; + texel_weight_data = bitfieldReverse(texel_weight_data).wzyx; uint clear_byte_start = (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1; - texel_weight_data[clear_byte_start - 1] = - texel_weight_data[clear_byte_start - 1] & + + uint byte_insert = ExtractBits(texel_weight_data, int(clear_byte_start - 1) * 8, 8) & uint( ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); - for (uint i = 0; i < 16 - clear_byte_start; i++) { - texel_weight_data[clear_byte_start + i] = 0U; + uint vec_index = (clear_byte_start - 1) >> 2; + texel_weight_data[vec_index] = + bitfieldInsert(texel_weight_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8); + for (uint i = clear_byte_start; i < 16; ++i) { + uint idx = i >> 2; + texel_weight_data[idx] = bitfieldInsert(texel_weight_data[idx], 0, int(i % 4) * 8, 8); } texel_flag = true; // use texel "vector" and bit stream in integer decoding DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); @@ -1302,13 +1271,8 @@ void main() { if (any(greaterThanEqual(coord, imageSize(dest_image)))) { return; } - uint block_index = - pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x; - current_index = 0; bitsread = 0; - for (int i = 0; i < 16; i++) { - local_buff[i] = ReadTexel(offset + i); - } - DecompressBlock(coord, block_index); + local_buff = astc_data[offset / 16]; + DecompressBlock(coord); } -- cgit v1.2.3 From 5ab80535118e593ef3add3ce2b5935437e1dc1d3 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 31 Jul 2021 22:24:15 -0400 Subject: astc_decoder: Compute offset swizzles in-shader Alleviates the dependency on the swizzle table and a uniform which is constant for all ASTC texture sizes. --- src/video_core/host_shaders/astc_decoder.comp | 46 +++++---------- src/video_core/renderer_opengl/util_shaders.cpp | 16 +++--- src/video_core/renderer_vulkan/vk_compute_pass.cpp | 67 ++-------------------- src/video_core/renderer_vulkan/vk_compute_pass.h | 5 -- 4 files changed, 25 insertions(+), 109 deletions(-) (limited to 'src/video_core/host_shaders') diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 392f09c68..74ce058a9 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -10,8 +10,7 @@ #define END_PUSH_CONSTANTS }; #define UNIFORM(n) #define BINDING_INPUT_BUFFER 0 -#define BINDING_SWIZZLE_BUFFER 1 -#define BINDING_OUTPUT_IMAGE 2 +#define BINDING_OUTPUT_IMAGE 1 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv @@ -19,7 +18,6 @@ #define END_PUSH_CONSTANTS #define UNIFORM(n) layout(location = n) uniform #define BINDING_INPUT_BUFFER 0 -#define BINDING_SWIZZLE_BUFFER 1 #define BINDING_OUTPUT_IMAGE 0 #endif @@ -28,13 +26,11 @@ layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; BEGIN_PUSH_CONSTANTS UNIFORM(1) uvec2 block_dims; - -UNIFORM(2) uint bytes_per_block_log2; -UNIFORM(3) uint layer_stride; -UNIFORM(4) uint block_size; -UNIFORM(5) uint x_shift; -UNIFORM(6) uint block_height; -UNIFORM(7) uint block_height_mask; +UNIFORM(2) uint layer_stride; +UNIFORM(3) uint block_size; +UNIFORM(4) uint x_shift; +UNIFORM(5) uint block_height; +UNIFORM(6) uint block_height_mask; END_PUSH_CONSTANTS struct EncodingData { @@ -53,35 +49,17 @@ struct TexelWeightParams { bool void_extent_hdr; }; -// Swizzle data -layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { - uint swizzle_table[]; -}; - layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uvec4 astc_data[]; }; layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; -const uint GOB_SIZE_X = 64; -const uint GOB_SIZE_Y = 8; -const uint GOB_SIZE_Z = 1; -const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; - const uint GOB_SIZE_X_SHIFT = 6; const uint GOB_SIZE_Y_SHIFT = 3; -const uint GOB_SIZE_Z_SHIFT = 0; -const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; - -const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); - -const int BLOCK_SIZE_IN_BYTES = 16; +const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT; -const int BLOCK_INFO_ERROR = 0; -const int BLOCK_INFO_VOID_EXTENT_HDR = 1; -const int BLOCK_INFO_VOID_EXTENT_LDR = 2; -const int BLOCK_INFO_NORMAL = 3; +const uint BYTES_PER_BLOCK_LOG2 = 4; const int JUST_BITS = 0; const int QUINT = 1; @@ -168,8 +146,10 @@ int texel_vector_index = 0; uint unquantized_texel_weights[2][144]; uint SwizzleOffset(uvec2 pos) { - pos = pos & SWIZZLE_MASK; - return swizzle_table[pos.y * 64 + pos.x]; + uint x = pos.x; + uint y = pos.y; + return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + + (y % 2) * 16 + (x % 16); } // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] @@ -1253,7 +1233,7 @@ void DecompressBlock(ivec3 coord) { void main() { uvec3 pos = gl_GlobalInvocationID; - pos.x <<= bytes_per_block_log2; + pos.x <<= BYTES_PER_BLOCK_LOG2; // Read as soon as possible due to its latency const uint swizzle = SwizzleOffset(pos.xy); diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index a2b264700..4e6f7cb00 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -68,7 +68,6 @@ UtilShaders::~UtilShaders() = default; void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, std::span swizzles) { static constexpr GLuint BINDING_INPUT_BUFFER = 0; - static constexpr GLuint BINDING_SWIZZLE_BUFFER = 1; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; const Extent2D tile_size{ @@ -76,10 +75,9 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, .height = VideoCore::Surface::DefaultBlockHeight(image.info.format), }; program_manager.BindComputeProgram(astc_decoder_program.handle); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); - glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); glUniform2ui(1, tile_size.width, tile_size.height); + // Ensure buffer data is valid before dispatching glFlush(); for (const SwizzleParameters& swizzle : swizzles) { @@ -90,13 +88,13 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); ASSERT(params.origin == (std::array{0, 0, 0})); ASSERT(params.destination == (std::array{0, 0, 0})); + ASSERT(params.bytes_per_block_log2 == 4); - glUniform1ui(2, params.bytes_per_block_log2); - glUniform1ui(3, params.layer_stride); - glUniform1ui(4, params.block_size); - glUniform1ui(5, params.x_shift); - glUniform1ui(6, params.block_height); - glUniform1ui(7, params.block_height_mask); + glUniform1ui(2, params.layer_stride); + glUniform1ui(3, params.block_size); + glUniform1ui(4, params.x_shift); + glUniform1ui(5, params.block_height); + glUniform1ui(6, params.block_height_mask); // ASTC texture data glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 328813a57..d13d58e8c 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -34,9 +34,8 @@ using Tegra::Texture::SWIZZLE_TABLE; namespace { constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; -constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 1; -constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 2; -constexpr size_t ASTC_NUM_BINDINGS = 3; +constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 1; +constexpr size_t ASTC_NUM_BINDINGS = 2; template inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{ @@ -80,13 +79,6 @@ constexpr std::array ASTC_DESCR .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .pImmutableSamplers = nullptr, }, - { - .binding = ASTC_BINDING_SWIZZLE_BUFFER, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }, { .binding = ASTC_BINDING_OUTPUT_IMAGE, .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, @@ -98,12 +90,12 @@ constexpr std::array ASTC_DESCR constexpr DescriptorBankInfo ASTC_BANK_INFO{ .uniform_buffers = 0, - .storage_buffers = 2, + .storage_buffers = 1, .texture_buffers = 0, .image_buffers = 0, .textures = 0, .images = 1, - .score = 3, + .score = 2, }; constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{ @@ -125,14 +117,6 @@ constexpr std::array .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry), }, - { - .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry), - .stride = sizeof(DescriptorUpdateEntry), - }, { .dstBinding = ASTC_BINDING_OUTPUT_IMAGE, .dstArrayElement = 0, @@ -145,7 +129,6 @@ constexpr std::array struct AstcPushConstants { std::array blocks_dims; - u32 bytes_per_block_log2; u32 layer_stride; u32 block_size; u32 x_shift; @@ -336,42 +319,6 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, ASTCDecoderPass::~ASTCDecoderPass() = default; -void ASTCDecoderPass::MakeDataBuffer() { - constexpr size_t TOTAL_BUFFER_SIZE = sizeof(SWIZZLE_TABLE); - data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .size = TOTAL_BUFFER_SIZE, - .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr, - }); - data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); - - const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); - std::memcpy(staging_ref.mapped_span.data(), &SWIZZLE_TABLE, sizeof(SWIZZLE_TABLE)); - - scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, - TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) { - static constexpr VkMemoryBarrier write_barrier{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, - }; - const VkBufferCopy copy{ - .srcOffset = offset, - .dstOffset = 0, - .size = TOTAL_BUFFER_SIZE, - }; - cmdbuf.CopyBuffer(src, dst, copy); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - 0, write_barrier); - }); -} - void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, std::span swizzles) { using namespace VideoCommon::Accelerated; @@ -380,9 +327,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, VideoCore::Surface::DefaultBlockHeight(image.info.format), }; scheduler.RequestOutsideRenderPassOperationContext(); - if (!data_buffer) { - MakeDataBuffer(); - } const VkPipeline vk_pipeline = *pipeline; const VkImageAspectFlags aspect_mask = image.AspectMask(); const VkImage vk_image = image.Handle(); @@ -421,7 +365,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(map.buffer, input_offset, image.guest_size_bytes - swizzle.buffer_offset); - update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(SWIZZLE_TABLE)); update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); const void* const descriptor_data{update_descriptor_queue.UpdateData()}; @@ -429,11 +372,11 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); ASSERT(params.origin == (std::array{0, 0, 0})); ASSERT(params.destination == (std::array{0, 0, 0})); + ASSERT(params.bytes_per_block_log2 == 4); scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims, params, descriptor_data](vk::CommandBuffer cmdbuf) { const AstcPushConstants uniforms{ .blocks_dims = block_dims, - .bytes_per_block_log2 = params.bytes_per_block_log2, .layer_stride = params.layer_stride, .block_size = params.block_size, .x_shift = params.x_shift, diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index 114aef2bd..c7b92cce0 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -96,15 +96,10 @@ public: std::span swizzles); private: - void MakeDataBuffer(); - VKScheduler& scheduler; StagingBufferPool& staging_buffer_pool; VKUpdateDescriptorQueue& update_descriptor_queue; MemoryAllocator& memory_allocator; - - vk::Buffer data_buffer; - MemoryCommit data_buffer_commit; }; } // namespace Vulkan -- cgit v1.2.3 From c439fc9be994583801418743ab202fb63d1c83a0 Mon Sep 17 00:00:00 2001 From: ameerj <52414509+ameerj@users.noreply.github.com> Date: Sat, 31 Jul 2021 23:55:20 -0400 Subject: astc_decoder: Reduce workgroup size This reduces the amount of over dispatching when there are odd dimensions (i.e. ASTC 8x5), which rarely evenly divide into 32x32. --- src/video_core/host_shaders/astc_decoder.comp | 2 +- src/video_core/renderer_opengl/util_shaders.cpp | 4 ++-- src/video_core/renderer_vulkan/vk_compute_pass.cpp | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'src/video_core/host_shaders') diff --git a/src/video_core/host_shaders/astc_decoder.comp b/src/video_core/host_shaders/astc_decoder.comp index 74ce058a9..f34c5f5d9 100644 --- a/src/video_core/host_shaders/astc_decoder.comp +++ b/src/video_core/host_shaders/astc_decoder.comp @@ -22,7 +22,7 @@ #endif -layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; +layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; BEGIN_PUSH_CONSTANTS UNIFORM(1) uvec2 block_dims; diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 4e6f7cb00..333f35a1c 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -82,8 +82,8 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, glFlush(); for (const SwizzleParameters& swizzle : swizzles) { const size_t input_offset = swizzle.buffer_offset + map.offset; - const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); - const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); + const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U); + const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U); const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); ASSERT(params.origin == (std::array{0, 0, 0})); diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index d13d58e8c..3e96c0f60 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -358,8 +358,8 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, }); for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { const size_t input_offset = swizzle.buffer_offset + map.offset; - const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); - const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); + const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U); + const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U); const u32 num_dispatches_z = image.info.resources.layers; update_descriptor_queue.Acquire(); -- cgit v1.2.3