diff options
Diffstat (limited to 'src/video_core/host_shaders')
7 files changed, 361 insertions, 5 deletions
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index c4d459077..8bb429578 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -19,6 +19,7 @@ set(SHADER_FILES block_linear_unswizzle_2d.comp block_linear_unswizzle_3d.comp convert_abgr8_to_d24s8.frag + convert_d32f_to_abgr8.frag convert_d24s8_to_abgr8.frag convert_depth_to_float.frag convert_float_to_depth.frag @@ -41,6 +42,9 @@ set(SHADER_FILES pitch_unswizzle.comp present_bicubic.frag present_gaussian.frag + queries_prefix_scan_sum.comp + queries_prefix_scan_sum_nosubgroups.comp + resolve_conditional_render.comp smaa_edge_detection.vert smaa_edge_detection.frag smaa_blending_weight_calculation.vert @@ -70,6 +74,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND") endif() set(GLSL_FLAGS "") +set(SPIR_V_VERSION "spirv1.3") set(QUIET_FLAG "--quiet") set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) @@ -123,7 +128,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES}) OUTPUT ${SPIRV_HEADER_FILE} COMMAND - ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} + ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION} MAIN_DEPENDENCY ${SOURCE_FILE} ) diff --git a/src/video_core/host_shaders/convert_d32f_to_abgr8.frag b/src/video_core/host_shaders/convert_d32f_to_abgr8.frag new file mode 100644 index 000000000..04cfef8b5 --- /dev/null +++ b/src/video_core/host_shaders/convert_d32f_to_abgr8.frag @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#version 450 + +layout(binding = 0) uniform sampler2D depth_tex; + +layout(location = 0) out vec4 color; + +void main() { + ivec2 coord = ivec2(gl_FragCoord.xy); + float depth = textureLod(depth_tex, coord, 0).r; + color = vec4(depth, depth, depth, 1.0); +} diff --git a/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp b/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp index fc3854d18..66f2ad483 100644 --- a/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp +++ b/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp @@ -15,11 +15,14 @@ void main() { // TODO: Specialization constants for num_samples? const int num_samples = imageSamples(msaa_in); + const ivec3 msaa_size = imageSize(msaa_in); + const ivec3 out_size = imageSize(output_img); + const ivec3 scale = out_size / msaa_size; for (int curr_sample = 0; curr_sample < num_samples; ++curr_sample) { const vec4 pixel = imageLoad(msaa_in, coords, curr_sample); - const int single_sample_x = 2 * coords.x + (curr_sample & 1); - const int single_sample_y = 2 * coords.y + ((curr_sample / 2) & 1); + const int single_sample_x = scale.x * coords.x + (curr_sample & 1); + const int single_sample_y = scale.y * coords.y + ((curr_sample / 2) & 1); const ivec3 dest_coords = ivec3(single_sample_x, single_sample_y, coords.z); if (any(greaterThanEqual(dest_coords, imageSize(output_img)))) { diff --git a/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp b/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp index dedd962f1..c7ce38efa 100644 --- a/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp +++ b/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp @@ -15,9 +15,12 @@ void main() { // TODO: Specialization constants for num_samples? const int num_samples = imageSamples(output_msaa); + const ivec3 msaa_size = imageSize(output_msaa); + const ivec3 out_size = imageSize(img_in); + const ivec3 scale = out_size / msaa_size; for (int curr_sample = 0; curr_sample < num_samples; ++curr_sample) { - const int single_sample_x = 2 * coords.x + (curr_sample & 1); - const int single_sample_y = 2 * coords.y + ((curr_sample / 2) & 1); + const int single_sample_x = scale.x * coords.x + (curr_sample & 1); + const int single_sample_y = scale.y * coords.y + ((curr_sample / 2) & 1); const ivec3 single_coords = ivec3(single_sample_x, single_sample_y, coords.z); if (any(greaterThanEqual(single_coords, imageSize(img_in)))) { diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum.comp b/src/video_core/host_shaders/queries_prefix_scan_sum.comp new file mode 100644 index 000000000..6faa8981f --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum.comp @@ -0,0 +1,173 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#version 460 core + +#extension GL_KHR_shader_subgroup_basic : require +#extension GL_KHR_shader_subgroup_shuffle : require +#extension GL_KHR_shader_subgroup_shuffle_relative : require +#extension GL_KHR_shader_subgroup_arithmetic : require + +#ifdef VULKAN + +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout(location = n) uniform +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uint min_accumulation_base; +UNIFORM(1) uint max_accumulation_base; +UNIFORM(2) uint accumulation_limit; +UNIFORM(3) uint buffer_offset; +END_PUSH_CONSTANTS + +#define LOCAL_RESULTS 8 +#define QUERIES_PER_INVOC 2048 + +layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in; + +layout(std430, binding = 0) readonly buffer block1 { + uvec2 input_data[]; +}; + +layout(std430, binding = 1) coherent buffer block2 { + uvec2 output_data[]; +}; + +layout(std430, binding = 2) coherent buffer block3 { + uvec2 accumulated_data; +}; + +shared uvec2 shared_data[128]; + +// Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64 +uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { + uint carry = 0; + uvec2 result; + result.x = uaddCarry(value_1.x, value_2.x, carry); + result.y = value_1.y + value_2.y + carry; + return result; +} + +// do subgroup Prefix Sum using Hillis and Steele's algorithm +uvec2 subgroupInclusiveAddUint64(uvec2 value) { + uvec2 result = value; + for (uint i = 1; i < gl_SubgroupSize; i *= 2) { + uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i; + if (i <= gl_SubgroupInvocationID) { + result = AddUint64(result, other); + } + } + return result; +} + +// Writes down the results to the output buffer and to the accumulation buffer +void WriteResults(uvec2 results[LOCAL_RESULTS]) { + const uint current_id = gl_LocalInvocationID.x; + const uvec2 accum = accumulated_data; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? accum : uvec2(0, 0); + AddUint64(results[i], base_data); + } + for (uint i = 0; i < LOCAL_RESULTS; i++) { + output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i]; + } + uint index = accumulation_limit % LOCAL_RESULTS; + uint base_id = accumulation_limit / LOCAL_RESULTS; + if (min_accumulation_base >= accumulation_limit + 1) { + if (current_id == base_id) { + accumulated_data = results[index]; + } + return; + } + // We have that ugly case in which the accumulation data is reset in the middle somewhere. + barrier(); + groupMemoryBarrier(); + + if (current_id == base_id) { + uvec2 reset_value = output_data[max_accumulation_base - 1]; + // Calculate two complement / negate manually + reset_value = AddUint64(uvec2(1,0), ~reset_value); + accumulated_data = AddUint64(results[index], reset_value); + } +} + +void main() { + const uint subgroup_inv_id = gl_SubgroupInvocationID; + const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups; + const uint last_subgroup_id = subgroupMax(subgroup_inv_id); + const uint current_id = gl_LocalInvocationID.x; + const uint total_work = accumulation_limit; + const uint last_result_id = LOCAL_RESULTS - 1; + uvec2 data[LOCAL_RESULTS]; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i]; + } + uvec2 results[LOCAL_RESULTS]; + results[0] = data[0]; + for (uint i = 1; i < LOCAL_RESULTS; i++) { + results[i] = AddUint64(data[i], results[i - 1]); + } + // make sure all input data has been loaded + subgroupBarrier(); + subgroupMemoryBarrier(); + + // on the last local result, do a subgroup inclusive scan sum + results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]); + // get the last local result from the subgroup behind the current + uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1); + if (subgroup_inv_id != 0) { + for (uint i = 1; i < LOCAL_RESULTS; i++) { + results[i - 1] = AddUint64(results[i - 1], result_behind); + } + } + + // if we had less queries than our subgroup, just write down the results. + if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch. + WriteResults(results); + return; + } + + // We now have more, so lets write the last result into shared memory. + // Only pick the last subgroup. + if (subgroup_inv_id == last_subgroup_id) { + shared_data[subgroup_id] = results[last_result_id]; + } + // wait until everyone loaded their stuffs + barrier(); + memoryBarrierShared(); + + // only if it's not the first subgroup + if (subgroup_id != 0) { + // get the results from some previous invocation + uvec2 tmp = shared_data[subgroup_inv_id]; + subgroupBarrier(); + subgroupMemoryBarrierShared(); + tmp = subgroupInclusiveAddUint64(tmp); + // obtain the result that would be equivalent to the previous result + uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1); + for (uint i = 0; i < LOCAL_RESULTS; i++) { + results[i] = AddUint64(results[i], shuffled_result); + } + } + WriteResults(results); +}
\ No newline at end of file diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp new file mode 100644 index 000000000..559a213b9 --- /dev/null +++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp @@ -0,0 +1,138 @@ +// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel +// SPDX-License-Identifier: MIT + +// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and +// Nicholas Haemel. Modified to suit needs. + +#version 460 core + +#ifdef VULKAN + +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout(location = n) uniform +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uint min_accumulation_base; +UNIFORM(1) uint max_accumulation_base; +UNIFORM(2) uint accumulation_limit; +UNIFORM(3) uint buffer_offset; +END_PUSH_CONSTANTS + +#define LOCAL_RESULTS 4 +#define QUERIES_PER_INVOC 2048 + +layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in; + +layout(std430, binding = 0) readonly buffer block1 { + uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; +}; + +layout(std430, binding = 1) writeonly coherent buffer block2 { + uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; +}; + +layout(std430, binding = 2) coherent buffer block3 { + uvec2 accumulated_data; +}; + +shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; + +uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { + uint carry = 0; + uvec2 result; + result.x = uaddCarry(value_1.x, value_2.x, carry); + result.y = value_1.y + value_2.y + carry; + return result; +} + +void main(void) { + uint id = gl_LocalInvocationID.x; + uvec2 base_value[LOCAL_RESULTS]; + const uvec2 accum = accumulated_data; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base + ? accumulated_data + : uvec2(0); + } + uint work_size = gl_WorkGroupSize.x; + uint rd_id; + uint wr_id; + uint mask; + uvec2 inputs[LOCAL_RESULTS]; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i]; + } + // The number of steps is the log base 2 of the + // work group size, which should be a power of 2 + const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS)); + uint step = 0; + + // Each invocation is responsible for the content of + // two elements of the output array + for (uint i = 0; i < LOCAL_RESULTS; i++) { + shared_data[id * LOCAL_RESULTS + i] = inputs[i]; + } + // Synchronize to make sure that everyone has initialized + // their elements of shared_data[] with data loaded from + // the input arrays + barrier(); + memoryBarrierShared(); + // For each step... + for (step = 0; step < steps; step++) { + // Calculate the read and write index in the + // shared array + mask = (1 << step) - 1; + rd_id = ((id >> step) << (step + 1)) + mask; + wr_id = rd_id + 1 + (id & mask); + // Accumulate the read data into our element + + shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); + // Synchronize again to make sure that everyone + // has caught up with us + barrier(); + memoryBarrierShared(); + } + // Add the accumulation + for (uint i = 0; i < LOCAL_RESULTS; i++) { + shared_data[id * LOCAL_RESULTS + i] = + AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]); + } + barrier(); + memoryBarrierShared(); + + // Finally write our data back to the output buffer + for (uint i = 0; i < LOCAL_RESULTS; i++) { + output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i]; + } + if (id == 0) { + if (min_accumulation_base >= accumulation_limit + 1) { + accumulated_data = shared_data[accumulation_limit]; + return; + } + uvec2 reset_value = shared_data[max_accumulation_base - 1]; + uvec2 final_value = shared_data[accumulation_limit]; + // Two complements + reset_value = AddUint64(uvec2(1, 0), ~reset_value); + accumulated_data = AddUint64(final_value, reset_value); + } +}
\ No newline at end of file diff --git a/src/video_core/host_shaders/resolve_conditional_render.comp b/src/video_core/host_shaders/resolve_conditional_render.comp new file mode 100644 index 000000000..307e77d1a --- /dev/null +++ b/src/video_core/host_shaders/resolve_conditional_render.comp @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#version 450 + +layout(local_size_x = 1) in; + +layout(std430, binding = 0) buffer Query { + uvec2 initial; + uvec2 unknown; + uvec2 current; +}; + +layout(std430, binding = 1) buffer Result { + uint result; +}; + +void main() { + result = all(equal(initial, current)) ? 1 : 0; +} |
