From 8d6aefdcc452b602d94a84d13bbbc15f806b689c Mon Sep 17 00:00:00 2001 From: Liam Date: Wed, 14 Jun 2023 14:11:46 -0400 Subject: video_core: optionally skip barriers on feedback loops --- src/video_core/texture_cache/texture_cache.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src/video_core/texture_cache') diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c7f7448e9..43b7ac0a6 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -186,6 +186,10 @@ void TextureCache

::FillComputeImageViews(std::span views) { template void TextureCache

::CheckFeedbackLoop(std::span views) { + if (!Settings::values.barrier_feedback_loops.GetValue()) { + return; + } + const bool requires_barrier = [&] { for (const auto& view : views) { if (!view.id) { -- cgit v1.2.3 From 76a676883a17523fb12eeac6f2b9702e4916b2c2 Mon Sep 17 00:00:00 2001 From: FengChen Date: Sat, 17 Jun 2023 23:26:39 +0800 Subject: video_core: add samples check when find render target --- src/video_core/texture_cache/texture_cache.h | 22 ++++++++++------------ src/video_core/texture_cache/texture_cache_base.h | 10 ++++------ 2 files changed, 14 insertions(+), 18 deletions(-) (limited to 'src/video_core/texture_cache') diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index c7f7448e9..f11998e20 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -280,7 +280,7 @@ void TextureCache

::SynchronizeComputeDescriptors() { } template -bool TextureCache

::RescaleRenderTargets(bool is_clear) { +bool TextureCache

::RescaleRenderTargets() { auto& flags = maxwell3d->dirty.flags; u32 scale_rating = 0; bool rescaled = false; @@ -318,13 +318,13 @@ bool TextureCache

::RescaleRenderTargets(bool is_clear) { ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; if (flags[Dirty::ColorBuffer0 + index] || force) { flags[Dirty::ColorBuffer0 + index] = false; - BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear)); + BindRenderTarget(&color_buffer_id, FindColorBuffer(index)); } check_rescale(color_buffer_id, tmp_color_images[index]); } if (flags[Dirty::ZetaBuffer] || force) { flags[Dirty::ZetaBuffer] = false; - BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); + BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer()); } check_rescale(render_targets.depth_buffer_id, tmp_depth_image); @@ -389,7 +389,7 @@ void TextureCache

::UpdateRenderTargets(bool is_clear) { return; } - const bool rescaled = RescaleRenderTargets(is_clear); + const bool rescaled = RescaleRenderTargets(); if (is_rescaling != rescaled) { flags[Dirty::RescaleViewports] = true; flags[Dirty::RescaleScissors] = true; @@ -1658,7 +1658,7 @@ SamplerId TextureCache

::FindSampler(const TSCEntry& config) { } template -ImageViewId TextureCache

::FindColorBuffer(size_t index, bool is_clear) { +ImageViewId TextureCache

::FindColorBuffer(size_t index) { const auto& regs = maxwell3d->regs; if (index >= regs.rt_control.count) { return ImageViewId{}; @@ -1672,11 +1672,11 @@ ImageViewId TextureCache

::FindColorBuffer(size_t index, bool is_clear) { return ImageViewId{}; } const ImageInfo info(regs.rt[index], regs.anti_alias_samples_mode); - return FindRenderTargetView(info, gpu_addr, is_clear); + return FindRenderTargetView(info, gpu_addr); } template -ImageViewId TextureCache

::FindDepthBuffer(bool is_clear) { +ImageViewId TextureCache

::FindDepthBuffer() { const auto& regs = maxwell3d->regs; if (!regs.zeta_enable) { return ImageViewId{}; @@ -1686,18 +1686,16 @@ ImageViewId TextureCache

::FindDepthBuffer(bool is_clear) { return ImageViewId{}; } const ImageInfo info(regs.zeta, regs.zeta_size, regs.anti_alias_samples_mode); - return FindRenderTargetView(info, gpu_addr, is_clear); + return FindRenderTargetView(info, gpu_addr); } template -ImageViewId TextureCache

::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, - bool is_clear) { - const auto options = is_clear ? RelaxedOptions::Samples : RelaxedOptions{}; +ImageViewId TextureCache

::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr) { ImageId image_id{}; bool delete_state = has_deleted_images; do { has_deleted_images = false; - image_id = FindOrInsertImage(info, gpu_addr, options); + image_id = FindOrInsertImage(info, gpu_addr); delete_state |= has_deleted_images; } while (has_deleted_images); has_deleted_images = delete_state; diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 3bfa92154..c347eccd6 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -166,9 +166,8 @@ public: void SynchronizeComputeDescriptors(); /// Updates the Render Targets if they can be rescaled - /// @param is_clear True when the render targets are being used for clears /// @retval True if the Render Targets have been rescaled. - bool RescaleRenderTargets(bool is_clear); + bool RescaleRenderTargets(); /// Update bound render targets and upload memory if necessary /// @param is_clear True when the render targets are being used for clears @@ -324,14 +323,13 @@ private: [[nodiscard]] SamplerId FindSampler(const TSCEntry& config); /// Find or create an image view for the given color buffer index - [[nodiscard]] ImageViewId FindColorBuffer(size_t index, bool is_clear); + [[nodiscard]] ImageViewId FindColorBuffer(size_t index); /// Find or create an image view for the depth buffer - [[nodiscard]] ImageViewId FindDepthBuffer(bool is_clear); + [[nodiscard]] ImageViewId FindDepthBuffer(); /// Find or create a view for a render target with the given image parameters - [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, - bool is_clear); + [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr); /// Iterates over all the images in a region calling func template -- cgit v1.2.3 From 5da70f719703084482933e103e561cc98163f370 Mon Sep 17 00:00:00 2001 From: Kelebek1 Date: Tue, 23 May 2023 14:45:54 +0100 Subject: Remove memory allocations in some hot paths --- src/audio_core/device/audio_buffers.h | 8 +-- src/audio_core/device/device_session.cpp | 12 ++--- src/audio_core/device/device_session.h | 7 +-- src/audio_core/in/audio_in_system.cpp | 5 +- src/audio_core/out/audio_out_system.cpp | 4 +- .../renderer/command/data_source/decode.cpp | 23 ++++----- .../renderer/command/effect/compressor.cpp | 8 +-- src/audio_core/renderer/command/effect/delay.cpp | 14 ++--- .../renderer/command/effect/i3dl2_reverb.cpp | 4 +- .../renderer/command/effect/light_limiter.cpp | 12 ++--- src/audio_core/renderer/command/effect/reverb.cpp | 12 ++--- .../renderer/command/sink/circular_buffer.cpp | 4 +- src/audio_core/renderer/command/sink/device.cpp | 5 +- src/audio_core/renderer/mix/mix_context.cpp | 6 +-- src/audio_core/renderer/nodes/node_states.cpp | 4 +- src/audio_core/renderer/nodes/node_states.h | 2 +- src/audio_core/renderer/system.cpp | 1 + src/audio_core/sink/null_sink.h | 2 +- src/audio_core/sink/sink_stream.cpp | 15 +++--- src/audio_core/sink/sink_stream.h | 5 +- src/common/ring_buffer.h | 3 +- src/common/scratch_buffer.h | 9 ++++ src/core/hle/kernel/k_synchronization_object.cpp | 3 +- src/core/hle/kernel/k_thread.cpp | 8 ++- src/core/hle/kernel/k_thread.h | 3 +- src/core/hle/kernel/svc/svc_ipc.cpp | 7 +-- src/core/hle/kernel/svc/svc_synchronization.cpp | 10 ++-- src/core/hle/kernel/svc/svc_thread.cpp | 2 +- src/core/hle/service/audio/audin_u.cpp | 16 +++--- src/core/hle/service/audio/audout_u.cpp | 15 ++---- src/core/hle/service/audio/audren_u.cpp | 22 ++++---- src/core/hle/service/audio/audren_u.h | 1 + src/core/hle/service/audio/hwopus.cpp | 9 ++-- src/core/hle/service/nvdrv/devices/nvdevice.h | 6 +-- .../hle/service/nvdrv/devices/nvdisp_disp0.cpp | 6 +-- src/core/hle/service/nvdrv/devices/nvdisp_disp0.h | 8 +-- .../hle/service/nvdrv/devices/nvhost_as_gpu.cpp | 31 ++++++------ src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h | 30 ++++++----- src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp | 19 ++++--- src/core/hle/service/nvdrv/devices/nvhost_ctrl.h | 21 ++++---- .../hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp | 32 ++++++------ .../hle/service/nvdrv/devices/nvhost_ctrl_gpu.h | 38 +++++++------- src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp | 59 +++++++++++----------- src/core/hle/service/nvdrv/devices/nvhost_gpu.h | 36 ++++++------- .../hle/service/nvdrv/devices/nvhost_nvdec.cpp | 6 +-- src/core/hle/service/nvdrv/devices/nvhost_nvdec.h | 8 +-- .../service/nvdrv/devices/nvhost_nvdec_common.cpp | 15 +++--- .../service/nvdrv/devices/nvhost_nvdec_common.h | 12 ++--- .../hle/service/nvdrv/devices/nvhost_nvjpg.cpp | 8 +-- src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h | 10 ++-- src/core/hle/service/nvdrv/devices/nvhost_vic.cpp | 6 +-- src/core/hle/service/nvdrv/devices/nvhost_vic.h | 8 +-- src/core/hle/service/nvdrv/devices/nvmap.cpp | 20 ++++---- src/core/hle/service/nvdrv/devices/nvmap.h | 20 ++++---- src/core/hle/service/nvdrv/nvdrv.cpp | 8 +-- src/core/hle/service/nvdrv/nvdrv.h | 8 +-- src/core/hle/service/nvdrv/nvdrv_interface.cpp | 24 ++++----- src/core/hle/service/nvdrv/nvdrv_interface.h | 3 ++ src/core/hle/service/nvnflinger/parcel.h | 7 +-- .../backend/glsl/glsl_emit_context.cpp | 2 +- src/shader_recompiler/backend/spirv/emit_spirv.cpp | 2 +- .../backend/spirv/spirv_emit_context.cpp | 2 +- src/shader_recompiler/runtime_info.h | 3 +- src/video_core/buffer_cache/buffer_cache.h | 4 +- src/video_core/buffer_cache/buffer_cache_base.h | 4 +- src/video_core/cdma_pusher.h | 1 - src/video_core/dma_pusher.h | 8 +-- src/video_core/engines/maxwell_dma.cpp | 35 +++++++------ src/video_core/host1x/codecs/h264.cpp | 4 +- src/video_core/memory_manager.cpp | 13 ++--- src/video_core/memory_manager.h | 15 ++++-- src/video_core/renderer_opengl/gl_shader_cache.cpp | 4 +- src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 2 +- .../renderer_vulkan/vk_pipeline_cache.cpp | 10 +++- .../renderer_vulkan/vk_texture_cache.cpp | 27 +++++----- src/video_core/shader_cache.cpp | 4 +- src/video_core/texture_cache/image_base.h | 5 +- src/video_core/texture_cache/texture_cache.h | 14 ++--- src/video_core/texture_cache/texture_cache_base.h | 4 +- src/video_core/texture_cache/util.cpp | 48 ++++++++++-------- src/video_core/texture_cache/util.h | 31 ++++++------ src/video_core/transform_feedback.cpp | 8 +-- src/video_core/transform_feedback.h | 2 +- src/video_core/vulkan_common/vulkan_device.cpp | 1 + 84 files changed, 503 insertions(+), 460 deletions(-) (limited to 'src/video_core/texture_cache') diff --git a/src/audio_core/device/audio_buffers.h b/src/audio_core/device/audio_buffers.h index 15082f6c6..5d8ed0ef7 100644 --- a/src/audio_core/device/audio_buffers.h +++ b/src/audio_core/device/audio_buffers.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "audio_buffer.h" #include "audio_core/device/device_session.h" @@ -48,7 +49,7 @@ public: * * @param out_buffers - The buffers which were registered. */ - void RegisterBuffers(std::vector& out_buffers) { + void RegisterBuffers(boost::container::static_vector& out_buffers) { std::scoped_lock l{lock}; const s32 to_register{std::min(std::min(appended_count, BufferAppendLimit), BufferAppendLimit - registered_count)}; @@ -162,7 +163,8 @@ public: * @param max_buffers - Maximum number of buffers to released. * @return The number of buffers released. */ - u32 GetRegisteredAppendedBuffers(std::vector& buffers_flushed, u32 max_buffers) { + u32 GetRegisteredAppendedBuffers( + boost::container::static_vector& buffers_flushed, u32 max_buffers) { std::scoped_lock l{lock}; if (registered_count + appended_count == 0) { return 0; @@ -270,7 +272,7 @@ public: */ bool FlushBuffers(u32& buffers_released) { std::scoped_lock l{lock}; - std::vector buffers_flushed{}; + boost::container::static_vector buffers_flushed{}; buffers_released = GetRegisteredAppendedBuffers(buffers_flushed, append_limit); diff --git a/src/audio_core/device/device_session.cpp b/src/audio_core/device/device_session.cpp index b5c0ef0e6..86811fcb8 100644 --- a/src/audio_core/device/device_session.cpp +++ b/src/audio_core/device/device_session.cpp @@ -79,7 +79,7 @@ void DeviceSession::ClearBuffers() { } } -void DeviceSession::AppendBuffers(std::span buffers) const { +void DeviceSession::AppendBuffers(std::span buffers) { for (const auto& buffer : buffers) { Sink::SinkBuffer new_buffer{ .frames = buffer.size / (channel_count * sizeof(s16)), @@ -88,13 +88,13 @@ void DeviceSession::AppendBuffers(std::span buffers) const { .consumed = false, }; + tmp_samples.resize_destructive(buffer.size / sizeof(s16)); if (type == Sink::StreamType::In) { - std::vector samples{}; - stream->AppendBuffer(new_buffer, samples); + stream->AppendBuffer(new_buffer, tmp_samples); } else { - std::vector samples(buffer.size / sizeof(s16)); - system.ApplicationMemory().ReadBlockUnsafe(buffer.samples, samples.data(), buffer.size); - stream->AppendBuffer(new_buffer, samples); + system.ApplicationMemory().ReadBlockUnsafe(buffer.samples, tmp_samples.data(), + buffer.size); + stream->AppendBuffer(new_buffer, tmp_samples); } } } diff --git a/src/audio_core/device/device_session.h b/src/audio_core/device/device_session.h index 75f766c68..7d52f362d 100644 --- a/src/audio_core/device/device_session.h +++ b/src/audio_core/device/device_session.h @@ -10,6 +10,7 @@ #include "audio_core/common/common.h" #include "audio_core/sink/sink.h" +#include "common/scratch_buffer.h" #include "core/hle/service/audio/errors.h" namespace Core { @@ -62,7 +63,7 @@ public: * * @param buffers - The buffers to play. */ - void AppendBuffers(std::span buffers) const; + void AppendBuffers(std::span buffers); /** * (Audio In only) Pop samples from the backend, and write them back to this buffer's address. @@ -146,8 +147,8 @@ private: std::shared_ptr thread_event; /// Is this session initialised? bool initialized{}; - /// Buffer queue - std::vector buffer_queue{}; + /// Temporary sample buffer + Common::ScratchBuffer tmp_samples{}; }; } // namespace AudioCore diff --git a/src/audio_core/in/audio_in_system.cpp b/src/audio_core/in/audio_in_system.cpp index e23e51758..579129121 100644 --- a/src/audio_core/in/audio_in_system.cpp +++ b/src/audio_core/in/audio_in_system.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include + #include "audio_core/audio_event.h" #include "audio_core/audio_manager.h" #include "audio_core/in/audio_in_system.h" @@ -89,7 +90,7 @@ Result System::Start() { session->Start(); state = State::Started; - std::vector buffers_to_flush{}; + boost::container::static_vector buffers_to_flush{}; buffers.RegisterBuffers(buffers_to_flush); session->AppendBuffers(buffers_to_flush); session->SetRingSize(static_cast(buffers_to_flush.size())); @@ -134,7 +135,7 @@ bool System::AppendBuffer(const AudioInBuffer& buffer, const u64 tag) { void System::RegisterBuffers() { if (state == State::Started) { - std::vector registered_buffers{}; + boost::container::static_vector registered_buffers{}; buffers.RegisterBuffers(registered_buffers); session->AppendBuffers(registered_buffers); } diff --git a/src/audio_core/out/audio_out_system.cpp b/src/audio_core/out/audio_out_system.cpp index bd13f7219..0adf64bd3 100644 --- a/src/audio_core/out/audio_out_system.cpp +++ b/src/audio_core/out/audio_out_system.cpp @@ -89,7 +89,7 @@ Result System::Start() { session->Start(); state = State::Started; - std::vector buffers_to_flush{}; + boost::container::static_vector buffers_to_flush{}; buffers.RegisterBuffers(buffers_to_flush); session->AppendBuffers(buffers_to_flush); session->SetRingSize(static_cast(buffers_to_flush.size())); @@ -134,7 +134,7 @@ bool System::AppendBuffer(const AudioOutBuffer& buffer, u64 tag) { void System::RegisterBuffers() { if (state == State::Started) { - std::vector registered_buffers{}; + boost::container::static_vector registered_buffers{}; buffers.RegisterBuffers(registered_buffers); session->AppendBuffers(registered_buffers); } diff --git a/src/audio_core/renderer/command/data_source/decode.cpp b/src/audio_core/renderer/command/data_source/decode.cpp index ff5d31bd6..f45933203 100644 --- a/src/audio_core/renderer/command/data_source/decode.cpp +++ b/src/audio_core/renderer/command/data_source/decode.cpp @@ -8,6 +8,7 @@ #include "audio_core/renderer/command/resample/resample.h" #include "common/fixed_point.h" #include "common/logging/log.h" +#include "common/scratch_buffer.h" #include "core/memory.h" namespace AudioCore::AudioRenderer { @@ -27,6 +28,7 @@ constexpr std::array PitchBySrcQuality = {4, 8, 4}; template static u32 DecodePcm(Core::Memory::Memory& memory, std::span out_buffer, const DecodeArg& req) { + std::array tmp_samples{}; constexpr s32 min{std::numeric_limits::min()}; constexpr s32 max{std::numeric_limits::max()}; @@ -49,18 +51,17 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span out_buffer, const u64 size{channel_count * samples_to_decode}; const u64 size_bytes{size * sizeof(T)}; - std::vector samples(size); - memory.ReadBlockUnsafe(source, samples.data(), size_bytes); + memory.ReadBlockUnsafe(source, tmp_samples.data(), size_bytes); if constexpr (std::is_floating_point_v) { for (u32 i = 0; i < samples_to_decode; i++) { - auto sample{static_cast(samples[i * channel_count + req.target_channel] * + auto sample{static_cast(tmp_samples[i * channel_count + req.target_channel] * std::numeric_limits::max())}; out_buffer[i] = static_cast(std::clamp(sample, min, max)); } } else { for (u32 i = 0; i < samples_to_decode; i++) { - out_buffer[i] = samples[i * channel_count + req.target_channel]; + out_buffer[i] = tmp_samples[i * channel_count + req.target_channel]; } } } break; @@ -73,17 +74,16 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span out_buffer, } const VAddr source{req.buffer + ((req.start_offset + req.offset) * sizeof(T))}; - std::vector samples(samples_to_decode); - memory.ReadBlockUnsafe(source, samples.data(), samples_to_decode * sizeof(T)); + memory.ReadBlockUnsafe(source, tmp_samples.data(), samples_to_decode * sizeof(T)); if constexpr (std::is_floating_point_v) { for (u32 i = 0; i < samples_to_decode; i++) { - auto sample{static_cast(samples[i * channel_count + req.target_channel] * + auto sample{static_cast(tmp_samples[i * channel_count + req.target_channel] * std::numeric_limits::max())}; out_buffer[i] = static_cast(std::clamp(sample, min, max)); } } else { - std::memcpy(out_buffer.data(), samples.data(), samples_to_decode * sizeof(s16)); + std::memcpy(out_buffer.data(), tmp_samples.data(), samples_to_decode * sizeof(s16)); } break; } @@ -101,6 +101,7 @@ static u32 DecodePcm(Core::Memory::Memory& memory, std::span out_buffer, */ static u32 DecodeAdpcm(Core::Memory::Memory& memory, std::span out_buffer, const DecodeArg& req) { + std::array wavebuffer{}; constexpr u32 SamplesPerFrame{14}; constexpr u32 NibblesPerFrame{16}; @@ -138,9 +139,7 @@ static u32 DecodeAdpcm(Core::Memory::Memory& memory, std::span out_buffer, } const auto size{std::max((samples_to_process / 8U) * SamplesPerFrame, 8U)}; - std::vector wavebuffer(size); - memory.ReadBlockUnsafe(req.buffer + position_in_frame / 2, wavebuffer.data(), - wavebuffer.size()); + memory.ReadBlockUnsafe(req.buffer + position_in_frame / 2, wavebuffer.data(), size); auto context{req.adpcm_context}; auto header{context->header}; @@ -258,7 +257,7 @@ void DecodeFromWaveBuffers(Core::Memory::Memory& memory, const DecodeFromWaveBuf u32 offset{voice_state.offset}; auto output_buffer{args.output}; - std::vector temp_buffer(TempBufferSize, 0); + std::array temp_buffer{}; while (remaining_sample_count > 0) { const auto samples_to_write{std::min(remaining_sample_count, max_remaining_sample_count)}; diff --git a/src/audio_core/renderer/command/effect/compressor.cpp b/src/audio_core/renderer/command/effect/compressor.cpp index 7229618e8..ee9b68d5b 100644 --- a/src/audio_core/renderer/command/effect/compressor.cpp +++ b/src/audio_core/renderer/command/effect/compressor.cpp @@ -44,8 +44,8 @@ static void InitializeCompressorEffect(const CompressorInfo::ParameterVersion2& static void ApplyCompressorEffect(const CompressorInfo::ParameterVersion2& params, CompressorInfo::State& state, bool enabled, - std::vector> input_buffers, - std::vector> output_buffers, u32 sample_count) { + std::span> input_buffers, + std::span> output_buffers, u32 sample_count) { if (enabled) { auto state_00{state.unk_00}; auto state_04{state.unk_04}; @@ -124,8 +124,8 @@ void CompressorCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor& } void CompressorCommand::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (s16 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, diff --git a/src/audio_core/renderer/command/effect/delay.cpp b/src/audio_core/renderer/command/effect/delay.cpp index a4e408d40..e536cbb1e 100644 --- a/src/audio_core/renderer/command/effect/delay.cpp +++ b/src/audio_core/renderer/command/effect/delay.cpp @@ -51,7 +51,7 @@ static void InitializeDelayEffect(const DelayInfo::ParameterVersion1& params, state.delay_lines[channel].sample_count_max = sample_count_max.to_int_floor(); state.delay_lines[channel].sample_count = sample_count.to_int_floor(); state.delay_lines[channel].buffer.resize(state.delay_lines[channel].sample_count, 0); - if (state.delay_lines[channel].buffer.size() == 0) { + if (state.delay_lines[channel].sample_count == 0) { state.delay_lines[channel].buffer.push_back(0); } state.delay_lines[channel].buffer_pos = 0; @@ -74,8 +74,8 @@ static void InitializeDelayEffect(const DelayInfo::ParameterVersion1& params, */ template static void ApplyDelay(const DelayInfo::ParameterVersion1& params, DelayInfo::State& state, - std::vector>& inputs, - std::vector>& outputs, const u32 sample_count) { + std::span> inputs, std::span> outputs, + const u32 sample_count) { for (u32 sample_index = 0; sample_index < sample_count; sample_index++) { std::array, NumChannels> input_samples{}; for (u32 channel = 0; channel < NumChannels; channel++) { @@ -153,8 +153,8 @@ static void ApplyDelay(const DelayInfo::ParameterVersion1& params, DelayInfo::St * @param sample_count - Number of samples to process. */ static void ApplyDelayEffect(const DelayInfo::ParameterVersion1& params, DelayInfo::State& state, - const bool enabled, std::vector>& inputs, - std::vector>& outputs, const u32 sample_count) { + const bool enabled, std::span> inputs, + std::span> outputs, const u32 sample_count) { if (!IsChannelCountValid(params.channel_count)) { LOG_ERROR(Service_Audio, "Invalid delay channels {}", params.channel_count); @@ -208,8 +208,8 @@ void DelayCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor& proce } void DelayCommand::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (s16 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, diff --git a/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp b/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp index 27d8b9844..d2bfb67cc 100644 --- a/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp +++ b/src/audio_core/renderer/command/effect/i3dl2_reverb.cpp @@ -408,8 +408,8 @@ void I3dl2ReverbCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor& } void I3dl2ReverbCommand::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (u32 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, diff --git a/src/audio_core/renderer/command/effect/light_limiter.cpp b/src/audio_core/renderer/command/effect/light_limiter.cpp index e8fb0e2fc..4161a9821 100644 --- a/src/audio_core/renderer/command/effect/light_limiter.cpp +++ b/src/audio_core/renderer/command/effect/light_limiter.cpp @@ -47,8 +47,8 @@ static void InitializeLightLimiterEffect(const LightLimiterInfo::ParameterVersio */ static void ApplyLightLimiterEffect(const LightLimiterInfo::ParameterVersion2& params, LightLimiterInfo::State& state, const bool enabled, - std::vector>& inputs, - std::vector>& outputs, const u32 sample_count, + std::span> inputs, + std::span> outputs, const u32 sample_count, LightLimiterInfo::StatisticsInternal* statistics) { constexpr s64 min{std::numeric_limits::min()}; constexpr s64 max{std::numeric_limits::max()}; @@ -147,8 +147,8 @@ void LightLimiterVersion1Command::Dump([[maybe_unused]] const ADSP::CommandListP } void LightLimiterVersion1Command::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (u32 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, @@ -190,8 +190,8 @@ void LightLimiterVersion2Command::Dump([[maybe_unused]] const ADSP::CommandListP } void LightLimiterVersion2Command::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (u32 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, diff --git a/src/audio_core/renderer/command/effect/reverb.cpp b/src/audio_core/renderer/command/effect/reverb.cpp index 8b9b65214..fc2f15a5e 100644 --- a/src/audio_core/renderer/command/effect/reverb.cpp +++ b/src/audio_core/renderer/command/effect/reverb.cpp @@ -250,8 +250,8 @@ static Common::FixedPoint<50, 14> Axfx2AllPassTick(ReverbInfo::ReverbDelayLine& */ template static void ApplyReverbEffect(const ReverbInfo::ParameterVersion2& params, ReverbInfo::State& state, - std::vector>& inputs, - std::vector>& outputs, const u32 sample_count) { + std::span> inputs, + std::span> outputs, const u32 sample_count) { static constexpr std::array OutTapIndexes1Ch{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; @@ -369,8 +369,8 @@ static void ApplyReverbEffect(const ReverbInfo::ParameterVersion2& params, Rever * @param sample_count - Number of samples to process. */ static void ApplyReverbEffect(const ReverbInfo::ParameterVersion2& params, ReverbInfo::State& state, - const bool enabled, std::vector>& inputs, - std::vector>& outputs, const u32 sample_count) { + const bool enabled, std::span> inputs, + std::span> outputs, const u32 sample_count) { if (enabled) { switch (params.channel_count) { case 0: @@ -412,8 +412,8 @@ void ReverbCommand::Dump([[maybe_unused]] const ADSP::CommandListProcessor& proc } void ReverbCommand::Process(const ADSP::CommandListProcessor& processor) { - std::vector> input_buffers(parameter.channel_count); - std::vector> output_buffers(parameter.channel_count); + std::array, MaxChannels> input_buffers{}; + std::array, MaxChannels> output_buffers{}; for (u32 i = 0; i < parameter.channel_count; i++) { input_buffers[i] = processor.mix_buffers.subspan(inputs[i] * processor.sample_count, diff --git a/src/audio_core/renderer/command/sink/circular_buffer.cpp b/src/audio_core/renderer/command/sink/circular_buffer.cpp index ded5afc94..e2ce59792 100644 --- a/src/audio_core/renderer/command/sink/circular_buffer.cpp +++ b/src/audio_core/renderer/command/sink/circular_buffer.cpp @@ -24,7 +24,7 @@ void CircularBufferSinkCommand::Process(const ADSP::CommandListProcessor& proces constexpr s32 min{std::numeric_limits::min()}; constexpr s32 max{std::numeric_limits::max()}; - std::vector output(processor.sample_count); + std::array output{}; for (u32 channel = 0; channel < input_count; channel++) { auto input{processor.mix_buffers.subspan(inputs[channel] * processor.sample_count, processor.sample_count)}; @@ -33,7 +33,7 @@ void CircularBufferSinkCommand::Process(const ADSP::CommandListProcessor& proces } processor.memory->WriteBlockUnsafe(address + pos, output.data(), - output.size() * sizeof(s16)); + processor.sample_count * sizeof(s16)); pos += static_cast(processor.sample_count * sizeof(s16)); if (pos >= size) { pos = 0; diff --git a/src/audio_core/renderer/command/sink/device.cpp b/src/audio_core/renderer/command/sink/device.cpp index e88372a75..5f74dd7ad 100644 --- a/src/audio_core/renderer/command/sink/device.cpp +++ b/src/audio_core/renderer/command/sink/device.cpp @@ -33,8 +33,7 @@ void DeviceSinkCommand::Process(const ADSP::CommandListProcessor& processor) { .consumed{false}, }; - std::vector samples(out_buffer.frames * input_count); - + std::array samples{}; for (u32 channel = 0; channel < input_count; channel++) { const auto offset{inputs[channel] * out_buffer.frames}; @@ -45,7 +44,7 @@ void DeviceSinkCommand::Process(const ADSP::CommandListProcessor& processor) { } out_buffer.tag = reinterpret_cast(samples.data()); - stream->AppendBuffer(out_buffer, samples); + stream->AppendBuffer(out_buffer, {samples.data(), out_buffer.frames * input_count}); if (stream->IsPaused()) { stream->Start(); diff --git a/src/audio_core/renderer/mix/mix_context.cpp b/src/audio_core/renderer/mix/mix_context.cpp index 35b748ede..3a18ae7c2 100644 --- a/src/audio_core/renderer/mix/mix_context.cpp +++ b/src/audio_core/renderer/mix/mix_context.cpp @@ -125,10 +125,10 @@ bool MixContext::TSortInfo(const SplitterContext& splitter_context) { return false; } - std::vector sorted_results{node_states.GetSortedResuls()}; - const auto result_size{std::min(count, static_cast(sorted_results.size()))}; + auto sorted_results{node_states.GetSortedResuls()}; + const auto result_size{std::min(count, static_cast(sorted_results.second))}; for (s32 i = 0; i < result_size; i++) { - sorted_mix_infos[i] = &mix_infos[sorted_results[i]]; + sorted_mix_infos[i] = &mix_infos[sorted_results.first[i]]; } CalcMixBufferOffset(); diff --git a/src/audio_core/renderer/nodes/node_states.cpp b/src/audio_core/renderer/nodes/node_states.cpp index 1821a51e6..b7a44a54c 100644 --- a/src/audio_core/renderer/nodes/node_states.cpp +++ b/src/audio_core/renderer/nodes/node_states.cpp @@ -134,8 +134,8 @@ u32 NodeStates::GetNodeCount() const { return node_count; } -std::vector NodeStates::GetSortedResuls() const { - return {results.rbegin(), results.rbegin() + result_pos}; +std::pair::reverse_iterator, size_t> NodeStates::GetSortedResuls() const { + return {results.rbegin(), result_pos}; } } // namespace AudioCore::AudioRenderer diff --git a/src/audio_core/renderer/nodes/node_states.h b/src/audio_core/renderer/nodes/node_states.h index 94b1d1254..e768cd4b5 100644 --- a/src/audio_core/renderer/nodes/node_states.h +++ b/src/audio_core/renderer/nodes/node_states.h @@ -175,7 +175,7 @@ public: * * @return Vector of nodes in reverse order. */ - std::vector GetSortedResuls() const; + std::pair::reverse_iterator, size_t> GetSortedResuls() const; private: /// Number of nodes in the graph diff --git a/src/audio_core/renderer/system.cpp b/src/audio_core/renderer/system.cpp index 53b258c4f..a23627472 100644 --- a/src/audio_core/renderer/system.cpp +++ b/src/audio_core/renderer/system.cpp @@ -444,6 +444,7 @@ Result System::Update(std::span input, std::span performance, std: std::scoped_lock l{lock}; const auto start_time{core.CoreTiming().GetClockTicks()}; + std::memset(output.data(), 0, output.size()); InfoUpdater info_updater(input, output, process_handle, behavior); diff --git a/src/audio_core/sink/null_sink.h b/src/audio_core/sink/null_sink.h index 1215d3cd2..b6b43c93e 100644 --- a/src/audio_core/sink/null_sink.h +++ b/src/audio_core/sink/null_sink.h @@ -20,7 +20,7 @@ public: explicit NullSinkStreamImpl(Core::System& system_, StreamType type_) : SinkStream{system_, type_} {} ~NullSinkStreamImpl() override {} - void AppendBuffer(SinkBuffer&, std::vector&) override {} + void AppendBuffer(SinkBuffer&, std::span) override {} std::vector ReleaseBuffer(u64) override { return {}; } diff --git a/src/audio_core/sink/sink_stream.cpp b/src/audio_core/sink/sink_stream.cpp index 9a718a9cc..404dcd0e9 100644 --- a/src/audio_core/sink/sink_stream.cpp +++ b/src/audio_core/sink/sink_stream.cpp @@ -18,7 +18,7 @@ namespace AudioCore::Sink { -void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector& samples) { +void SinkStream::AppendBuffer(SinkBuffer& buffer, std::span samples) { if (type == StreamType::In) { queue.enqueue(buffer); queued_buffers++; @@ -66,15 +66,16 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector& samples) { static_cast(std::clamp(right_sample, min, max)); } - samples.resize(samples.size() / system_channels * device_channels); + samples = samples.subspan(0, samples.size() / system_channels * device_channels); } else if (system_channels == 2 && device_channels == 6) { // We need moar samples! Not all games will provide 6 channel audio. // TODO: Implement some upmixing here. Currently just passthrough, with other // channels left as silence. - std::vector new_samples(samples.size() / system_channels * device_channels, 0); + auto new_size = samples.size() / system_channels * device_channels; + tmp_samples.resize_destructive(new_size); - for (u32 read_index = 0, write_index = 0; read_index < samples.size(); + for (u32 read_index = 0, write_index = 0; read_index < new_size; read_index += system_channels, write_index += device_channels) { const auto left_sample{static_cast(std::clamp( static_cast( @@ -82,7 +83,7 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector& samples) { volume), min, max))}; - new_samples[write_index + static_cast(Channels::FrontLeft)] = left_sample; + tmp_samples[write_index + static_cast(Channels::FrontLeft)] = left_sample; const auto right_sample{static_cast(std::clamp( static_cast( @@ -90,9 +91,9 @@ void SinkStream::AppendBuffer(SinkBuffer& buffer, std::vector& samples) { volume), min, max))}; - new_samples[write_index + static_cast(Channels::FrontRight)] = right_sample; + tmp_samples[write_index + static_cast(Channels::FrontRight)] = right_sample; } - samples = std::move(new_samples); + samples = std::span(tmp_samples); } else if (volume != 1.0f) { for (u32 i = 0; i < samples.size(); i++) { diff --git a/src/audio_core/sink/sink_stream.h b/src/audio_core/sink/sink_stream.h index 41cbadc9c..98d72ace1 100644 --- a/src/audio_core/sink/sink_stream.h +++ b/src/audio_core/sink/sink_stream.h @@ -16,6 +16,7 @@ #include "common/polyfill_thread.h" #include "common/reader_writer_queue.h" #include "common/ring_buffer.h" +#include "common/scratch_buffer.h" #include "common/thread.h" namespace Core { @@ -170,7 +171,7 @@ public: * @param buffer - Audio buffer information to be queued. * @param samples - The s16 samples to be queue for playback. */ - virtual void AppendBuffer(SinkBuffer& buffer, std::vector& samples); + virtual void AppendBuffer(SinkBuffer& buffer, std::span samples); /** * Release a buffer. Audio In only, will fill a buffer with recorded samples. @@ -255,6 +256,8 @@ private: /// Signalled when ring buffer entries are consumed std::condition_variable_any release_cv; std::mutex release_mutex; + /// Temporary buffer for appending samples when upmixing + Common::ScratchBuffer tmp_samples{}; }; using SinkStreamPtr = std::unique_ptr; diff --git a/src/common/ring_buffer.h b/src/common/ring_buffer.h index 4c328ab44..416680d44 100644 --- a/src/common/ring_buffer.h +++ b/src/common/ring_buffer.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -53,7 +54,7 @@ public: return push_count; } - std::size_t Push(const std::vector& input) { + std::size_t Push(const std::span input) { return Push(input.data(), input.size()); } diff --git a/src/common/scratch_buffer.h b/src/common/scratch_buffer.h index a69a5a7af..6fe907953 100644 --- a/src/common/scratch_buffer.h +++ b/src/common/scratch_buffer.h @@ -3,6 +3,9 @@ #pragma once +#include + +#include "common/concepts.h" #include "common/make_unique_for_overwrite.h" namespace Common { @@ -16,6 +19,12 @@ namespace Common { template class ScratchBuffer { public: + using iterator = T*; + using const_iterator = const T*; + using value_type = T; + using element_type = T; + using iterator_category = std::contiguous_iterator_tag; + ScratchBuffer() = default; explicit ScratchBuffer(size_t initial_capacity) diff --git a/src/core/hle/kernel/k_synchronization_object.cpp b/src/core/hle/kernel/k_synchronization_object.cpp index b7da3eee7..3e5b735b1 100644 --- a/src/core/hle/kernel/k_synchronization_object.cpp +++ b/src/core/hle/kernel/k_synchronization_object.cpp @@ -3,6 +3,7 @@ #include "common/assert.h" #include "common/common_types.h" +#include "common/scratch_buffer.h" #include "core/hle/kernel/k_scheduler.h" #include "core/hle/kernel/k_scoped_scheduler_lock_and_sleep.h" #include "core/hle/kernel/k_synchronization_object.h" @@ -75,7 +76,7 @@ Result KSynchronizationObject::Wait(KernelCore& kernel, s32* out_index, KSynchronizationObject** objects, const s32 num_objects, s64 timeout) { // Allocate space on stack for thread nodes. - std::vector thread_nodes(num_objects); + std::array thread_nodes; // Prepare for wait. KThread* thread = GetCurrentThreadPointer(kernel); diff --git a/src/core/hle/kernel/k_thread.cpp b/src/core/hle/kernel/k_thread.cpp index 908811e2c..adb6ec581 100644 --- a/src/core/hle/kernel/k_thread.cpp +++ b/src/core/hle/kernel/k_thread.cpp @@ -909,7 +909,7 @@ Result KThread::SetActivity(Svc::ThreadActivity activity) { R_SUCCEED(); } -Result KThread::GetThreadContext3(std::vector& out) { +Result KThread::GetThreadContext3(Common::ScratchBuffer& out) { // Lock ourselves. KScopedLightLock lk{m_activity_pause_lock}; @@ -927,15 +927,13 @@ Result KThread::GetThreadContext3(std::vector& out) { // Mask away mode bits, interrupt bits, IL bit, and other reserved bits. auto context = GetContext64(); context.pstate &= 0xFF0FFE20; - - out.resize(sizeof(context)); + out.resize_destructive(sizeof(context)); std::memcpy(out.data(), std::addressof(context), sizeof(context)); } else { // Mask away mode bits, interrupt bits, IL bit, and other reserved bits. auto context = GetContext32(); context.cpsr &= 0xFF0FFE20; - - out.resize(sizeof(context)); + out.resize_destructive(sizeof(context)); std::memcpy(out.data(), std::addressof(context), sizeof(context)); } } diff --git a/src/core/hle/kernel/k_thread.h b/src/core/hle/kernel/k_thread.h index 37fe5db77..dd662b3f8 100644 --- a/src/core/hle/kernel/k_thread.h +++ b/src/core/hle/kernel/k_thread.h @@ -15,6 +15,7 @@ #include "common/intrusive_list.h" #include "common/intrusive_red_black_tree.h" +#include "common/scratch_buffer.h" #include "common/spin_lock.h" #include "core/arm/arm_interface.h" #include "core/hle/kernel/k_affinity_mask.h" @@ -567,7 +568,7 @@ public: void RemoveWaiter(KThread* thread); - Result GetThreadContext3(std::vector& out); + Result GetThreadContext3(Common::ScratchBuffer& out); KThread* RemoveUserWaiterByKey(bool* out_has_waiters, KProcessAddress key) { return this->RemoveWaiterByKey(out_has_waiters, key, false); diff --git a/src/core/hle/kernel/svc/svc_ipc.cpp b/src/core/hle/kernel/svc/svc_ipc.cpp index ea03068aa..60247df2e 100644 --- a/src/core/hle/kernel/svc/svc_ipc.cpp +++ b/src/core/hle/kernel/svc/svc_ipc.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/scope_exit.h" +#include "common/scratch_buffer.h" #include "core/core.h" #include "core/hle/kernel/k_client_session.h" #include "core/hle/kernel/k_process.h" @@ -45,11 +46,11 @@ Result ReplyAndReceive(Core::System& system, s32* out_index, uint64_t handles_ad handles_addr, static_cast(sizeof(Handle) * num_handles)), ResultInvalidPointer); - std::vector handles(num_handles); + std::array handles; GetCurrentMemory(kernel).ReadBlock(handles_addr, handles.data(), sizeof(Handle) * num_handles); // Convert handle list to object table. - std::vector objs(num_handles); + std::array objs; R_UNLESS(handle_table.GetMultipleObjects(objs.data(), handles.data(), num_handles), ResultInvalidHandle); @@ -80,7 +81,7 @@ Result ReplyAndReceive(Core::System& system, s32* out_index, uint64_t handles_ad // Wait for an object. s32 index; Result result = KSynchronizationObject::Wait(kernel, std::addressof(index), objs.data(), - static_cast(objs.size()), timeout_ns); + num_handles, timeout_ns); if (result == ResultTimedOut) { R_RETURN(result); } diff --git a/src/core/hle/kernel/svc/svc_synchronization.cpp b/src/core/hle/kernel/svc/svc_synchronization.cpp index 04d65f0bd..53df5bcd8 100644 --- a/src/core/hle/kernel/svc/svc_synchronization.cpp +++ b/src/core/hle/kernel/svc/svc_synchronization.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/scope_exit.h" +#include "common/scratch_buffer.h" #include "core/core.h" #include "core/hle/kernel/k_process.h" #include "core/hle/kernel/k_readable_event.h" @@ -54,7 +55,7 @@ static Result WaitSynchronization(Core::System& system, int32_t* out_index, cons // Get the synchronization context. auto& kernel = system.Kernel(); auto& handle_table = GetCurrentProcess(kernel).GetHandleTable(); - std::vector objs(num_handles); + std::array objs; // Copy user handles. if (num_handles > 0) { @@ -72,8 +73,8 @@ static Result WaitSynchronization(Core::System& system, int32_t* out_index, cons }); // Wait on the objects. - Result res = KSynchronizationObject::Wait(kernel, out_index, objs.data(), - static_cast(objs.size()), timeout_ns); + Result res = + KSynchronizationObject::Wait(kernel, out_index, objs.data(), num_handles, timeout_ns); R_SUCCEED_IF(res == ResultSessionClosed); R_RETURN(res); @@ -87,8 +88,7 @@ Result WaitSynchronization(Core::System& system, int32_t* out_index, u64 user_ha // Ensure number of handles is valid. R_UNLESS(0 <= num_handles && num_handles <= Svc::ArgumentHandleCountMax, ResultOutOfRange); - - std::vector handles(num_handles); + std::array handles; if (num_handles > 0) { GetCurrentMemory(system.Kernel()) .ReadBlock(user_handles, handles.data(), num_handles * sizeof(Handle)); diff --git a/src/core/hle/kernel/svc/svc_thread.cpp b/src/core/hle/kernel/svc/svc_thread.cpp index 37b54079c..36b94e6bf 100644 --- a/src/core/hle/kernel/svc/svc_thread.cpp +++ b/src/core/hle/kernel/svc/svc_thread.cpp @@ -174,7 +174,7 @@ Result GetThreadContext3(Core::System& system, u64 out_context, Handle thread_ha } // Get the thread context. - std::vector context; + static thread_local Common::ScratchBuffer context; R_TRY(thread->GetThreadContext3(context)); // Copy the thread context to user space. diff --git a/src/core/hle/service/audio/audin_u.cpp b/src/core/hle/service/audio/audin_u.cpp index f0640c64f..c8d574993 100644 --- a/src/core/hle/service/audio/audin_u.cpp +++ b/src/core/hle/service/audio/audin_u.cpp @@ -5,6 +5,7 @@ #include "audio_core/renderer/audio_device.h" #include "common/common_funcs.h" #include "common/logging/log.h" +#include "common/settings.h" #include "common/string_util.h" #include "core/core.h" #include "core/hle/kernel/k_event.h" @@ -123,19 +124,13 @@ private: void GetReleasedAudioInBuffer(HLERequestContext& ctx) { const auto write_buffer_size = ctx.GetWriteBufferNumElements(); - std::vector released_buffers(write_buffer_size); + tmp_buffer.resize_destructive(write_buffer_size); + tmp_buffer[0] = 0; - const auto count = impl->GetReleasedBuffers(released_buffers); + const auto count = impl->GetReleasedBuffers(tmp_buffer); - [[maybe_unused]] std::string tags{}; - for (u32 i = 0; i < count; i++) { - tags += fmt::format("{:08X}, ", released_buffers[i]); - } - [[maybe_unused]] auto sessionid{impl->GetSystem().GetSessionId()}; - LOG_TRACE(Service_Audio, "called. Session {} released {} buffers: {}", sessionid, count, - tags); + ctx.WriteBuffer(tmp_buffer); - ctx.WriteBuffer(released_buffers); IPC::ResponseBuilder rb{ctx, 3}; rb.Push(ResultSuccess); rb.Push(count); @@ -200,6 +195,7 @@ private: KernelHelpers::ServiceContext service_context; Kernel::KEvent* event; std::shared_ptr impl; + Common::ScratchBuffer tmp_buffer; }; AudInU::AudInU(Core::System& system_) diff --git a/src/core/hle/service/audio/audout_u.cpp b/src/core/hle/service/audio/audout_u.cpp index 3e62fa4fc..032c8c11f 100644 --- a/src/core/hle/service/audio/audout_u.cpp +++ b/src/core/hle/service/audio/audout_u.cpp @@ -123,19 +123,13 @@ private: void GetReleasedAudioOutBuffers(HLERequestContext& ctx) { const auto write_buffer_size = ctx.GetWriteBufferNumElements(); - std::vector released_buffers(write_buffer_size); + tmp_buffer.resize_destructive(write_buffer_size); + tmp_buffer[0] = 0; - const auto count = impl->GetReleasedBuffers(released_buffers); + const auto count = impl->GetReleasedBuffers(tmp_buffer); - [[maybe_unused]] std::string tags{}; - for (u32 i = 0; i < count; i++) { - tags += fmt::format("{:08X}, ", released_buffers[i]); - } - [[maybe_unused]] const auto sessionid{impl->GetSystem().GetSessionId()}; - LOG_TRACE(Service_Audio, "called. Session {} released {} buffers: {}", sessionid, count, - tags); + ctx.WriteBuffer(tmp_buffer); - ctx.WriteBuffer(released_buffers); IPC::ResponseBuilder rb{ctx, 3}; rb.Push(ResultSuccess); rb.Push(count); @@ -211,6 +205,7 @@ private: KernelHelpers::ServiceContext service_context; Kernel::KEvent* event; std::shared_ptr impl; + Common::ScratchBuffer tmp_buffer; }; AudOutU::AudOutU(Core::System& system_) diff --git a/src/core/hle/service/audio/audren_u.cpp b/src/core/hle/service/audio/audren_u.cpp index 7086d4750..12845c23a 100644 --- a/src/core/hle/service/audio/audren_u.cpp +++ b/src/core/hle/service/audio/audren_u.cpp @@ -116,28 +116,26 @@ private: // These buffers are written manually to avoid an issue with WriteBuffer throwing errors for // checking size 0. Performance size is 0 for most games. - std::vector output{}; - std::vector performance{}; auto is_buffer_b{ctx.BufferDescriptorB()[0].Size() != 0}; if (is_buffer_b) { const auto buffersB{ctx.BufferDescriptorB()}; - output.resize(buffersB[0].Size(), 0); - performance.resize(buffersB[1].Size(), 0); + tmp_output.resize_destructive(buffersB[0].Size()); + tmp_performance.resize_destructive(buffersB[1].Size()); } else { const auto buffersC{ctx.BufferDescriptorC()}; - output.resize(buffersC[0].Size(), 0); - performance.resize(buffersC[1].Size(), 0); + tmp_output.resize_destructive(buffersC[0].Size()); + tmp_performance.resize_destructive(buffersC[1].Size()); } - auto result = impl->RequestUpdate(input, performance, output); + auto result = impl->RequestUpdate(input, tmp_performance, tmp_output); if (result.IsSuccess()) { if (is_buffer_b) { - ctx.WriteBufferB(output.data(), output.size(), 0); - ctx.WriteBufferB(performance.data(), performance.size(), 1); + ctx.WriteBufferB(tmp_output.data(), tmp_output.size(), 0); + ctx.WriteBufferB(tmp_performance.data(), tmp_performance.size(), 1); } else { - ctx.WriteBufferC(output.data(), output.size(), 0); - ctx.WriteBufferC(performance.data(), performance.size(), 1); + ctx.WriteBufferC(tmp_output.data(), tmp_output.size(), 0); + ctx.WriteBufferC(tmp_performance.data(), tmp_performance.size(), 1); } } else { LOG_ERROR(Service_Audio, "RequestUpdate failed error 0x{:02X}!", result.description); @@ -235,6 +233,8 @@ private: Kernel::KEvent* rendered_event; Manager& manager; std::unique_ptr impl; + Common::ScratchBuffer tmp_output; + Common::ScratchBuffer tmp_performance; }; class IAudioDevice final : public ServiceFramework { diff --git a/src/core/hle/service/audio/audren_u.h b/src/core/hle/service/audio/audren_u.h index 24ce37e87..d8e9c8719 100644 --- a/src/core/hle/service/audio/audren_u.h +++ b/src/core/hle/service/audio/audren_u.h @@ -4,6 +4,7 @@ #pragma once #include "audio_core/audio_render_manager.h" +#include "common/scratch_buffer.h" #include "core/hle/service/kernel_helpers.h" #include "core/hle/service/service.h" diff --git a/src/core/hle/service/audio/hwopus.cpp b/src/core/hle/service/audio/hwopus.cpp index 451ac224a..c835f6cb7 100644 --- a/src/core/hle/service/audio/hwopus.cpp +++ b/src/core/hle/service/audio/hwopus.cpp @@ -68,13 +68,13 @@ private: ExtraBehavior extra_behavior) { u32 consumed = 0; u32 sample_count = 0; - std::vector samples(ctx.GetWriteBufferNumElements()); + tmp_samples.resize_destructive(ctx.GetWriteBufferNumElements()); if (extra_behavior == ExtraBehavior::ResetContext) { ResetDecoderContext(); } - if (!DecodeOpusData(consumed, sample_count, ctx.ReadBuffer(), samples, performance)) { + if (!DecodeOpusData(consumed, sample_count, ctx.ReadBuffer(), tmp_samples, performance)) { LOG_ERROR(Audio, "Failed to decode opus data"); IPC::ResponseBuilder rb{ctx, 2}; // TODO(ogniK): Use correct error code @@ -90,11 +90,11 @@ private: if (performance) { rb.Push(*performance); } - ctx.WriteBuffer(samples); + ctx.WriteBuffer(tmp_samples); } bool DecodeOpusData(u32& consumed, u32& sample_count, std::span input, - std::vector& output, u64* out_performance_time) const { + std::span output, u64* out_performance_time) const { const auto start_time = std::chrono::steady_clock::now(); const std::size_t raw_output_sz = output.size() * sizeof(opus_int16); if (sizeof(OpusPacketHeader) > input.size()) { @@ -154,6 +154,7 @@ private: OpusDecoderPtr decoder; u32 sample_rate; u32 channel_count; + Common::ScratchBuffer tmp_samples; }; class IHardwareOpusDecoderManager final : public ServiceFramework { diff --git a/src/core/hle/service/nvdrv/devices/nvdevice.h b/src/core/hle/service/nvdrv/devices/nvdevice.h index ab1f30f9e..a04538d5d 100644 --- a/src/core/hle/service/nvdrv/devices/nvdevice.h +++ b/src/core/hle/service/nvdrv/devices/nvdevice.h @@ -34,7 +34,7 @@ public: * @returns The result code of the ioctl. */ virtual NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) = 0; + std::span output) = 0; /** * Handles an ioctl2 request. @@ -45,7 +45,7 @@ public: * @returns The result code of the ioctl. */ virtual NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) = 0; + std::span inline_input, std::span output) = 0; /** * Handles an ioctl3 request. @@ -56,7 +56,7 @@ public: * @returns The result code of the ioctl. */ virtual NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) = 0; + std::span output, std::span inline_output) = 0; /** * Called once a device is opened diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp index 0fe242e9d..05a43d8dc 100644 --- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp +++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp @@ -18,19 +18,19 @@ nvdisp_disp0::nvdisp_disp0(Core::System& system_, NvCore::Container& core) nvdisp_disp0::~nvdisp_disp0() = default; NvResult nvdisp_disp0::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvdisp_disp0::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvdisp_disp0::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h index bcd0e3ed5..daee05fe8 100644 --- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h +++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h @@ -26,11 +26,11 @@ public: ~nvdisp_disp0() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp index 681bd0867..07e570a9f 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp @@ -28,7 +28,7 @@ nvhost_as_gpu::nvhost_as_gpu(Core::System& system_, Module& module_, NvCore::Con nvhost_as_gpu::~nvhost_as_gpu() = default; NvResult nvhost_as_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 'A': switch (command.cmd) { @@ -61,13 +61,13 @@ NvResult nvhost_as_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span i } NvResult nvhost_as_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_as_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { switch (command.group) { case 'A': switch (command.cmd) { @@ -87,7 +87,7 @@ NvResult nvhost_as_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span i void nvhost_as_gpu::OnOpen(DeviceFD fd) {} void nvhost_as_gpu::OnClose(DeviceFD fd) {} -NvResult nvhost_as_gpu::AllocAsEx(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::AllocAsEx(std::span input, std::span output) { IoctlAllocAsEx params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -141,7 +141,7 @@ NvResult nvhost_as_gpu::AllocAsEx(std::span input, std::vector& ou return NvResult::Success; } -NvResult nvhost_as_gpu::AllocateSpace(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::AllocateSpace(std::span input, std::span output) { IoctlAllocSpace params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -220,7 +220,7 @@ void nvhost_as_gpu::FreeMappingLocked(u64 offset) { mapping_map.erase(offset); } -NvResult nvhost_as_gpu::FreeSpace(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::FreeSpace(std::span input, std::span output) { IoctlFreeSpace params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -266,15 +266,14 @@ NvResult nvhost_as_gpu::FreeSpace(std::span input, std::vector& ou return NvResult::Success; } -NvResult nvhost_as_gpu::Remap(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::Remap(std::span input, std::span output) { const auto num_entries = input.size() / sizeof(IoctlRemapEntry); LOG_DEBUG(Service_NVDRV, "called, num_entries=0x{:X}", num_entries); - std::vector entries(num_entries); - std::memcpy(entries.data(), input.data(), input.size()); - std::scoped_lock lock(mutex); + entries.resize_destructive(num_entries); + std::memcpy(entries.data(), input.data(), input.size()); if (!vm.initialised) { return NvResult::BadValue; @@ -320,7 +319,7 @@ NvResult nvhost_as_gpu::Remap(std::span input, std::vector& output return NvResult::Success; } -NvResult nvhost_as_gpu::MapBufferEx(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::MapBufferEx(std::span input, std::span output) { IoctlMapBufferEx params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -424,7 +423,7 @@ NvResult nvhost_as_gpu::MapBufferEx(std::span input, std::vector& return NvResult::Success; } -NvResult nvhost_as_gpu::UnmapBuffer(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::UnmapBuffer(std::span input, std::span output) { IoctlUnmapBuffer params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -463,7 +462,7 @@ NvResult nvhost_as_gpu::UnmapBuffer(std::span input, std::vector& return NvResult::Success; } -NvResult nvhost_as_gpu::BindChannel(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::BindChannel(std::span input, std::span output) { IoctlBindChannel params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, fd={:X}", params.fd); @@ -492,7 +491,7 @@ void nvhost_as_gpu::GetVARegionsImpl(IoctlGetVaRegions& params) { }; } -NvResult nvhost_as_gpu::GetVARegions(std::span input, std::vector& output) { +NvResult nvhost_as_gpu::GetVARegions(std::span input, std::span output) { IoctlGetVaRegions params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -511,8 +510,8 @@ NvResult nvhost_as_gpu::GetVARegions(std::span input, std::vector& return NvResult::Success; } -NvResult nvhost_as_gpu::GetVARegions(std::span input, std::vector& output, - std::vector& inline_output) { +NvResult nvhost_as_gpu::GetVARegions(std::span input, std::span output, + std::span inline_output) { IoctlGetVaRegions params{}; std::memcpy(¶ms, input.data(), input.size()); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h index 1aba8d579..2af3e1260 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h @@ -15,6 +15,7 @@ #include "common/address_space.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "common/scratch_buffer.h" #include "common/swap.h" #include "core/hle/service/nvdrv/core/nvmap.h" #include "core/hle/service/nvdrv/devices/nvdevice.h" @@ -48,11 +49,11 @@ public: ~nvhost_as_gpu() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -138,18 +139,18 @@ private: static_assert(sizeof(IoctlGetVaRegions) == 16 + sizeof(VaRegion) * 2, "IoctlGetVaRegions is incorrect size"); - NvResult AllocAsEx(std::span input, std::vector& output); - NvResult AllocateSpace(std::span input, std::vector& output); - NvResult Remap(std::span input, std::vector& output); - NvResult MapBufferEx(std::span input, std::vector& output); - NvResult UnmapBuffer(std::span input, std::vector& output); - NvResult FreeSpace(std::span input, std::vector& output); - NvResult BindChannel(std::span input, std::vector& output); + NvResult AllocAsEx(std::span input, std::span output); + NvResult AllocateSpace(std::span input, std::span output); + NvResult Remap(std::span input, std::span output); + NvResult MapBufferEx(std::span input, std::span output); + NvResult UnmapBuffer(std::span input, std::span output); + NvResult FreeSpace(std::span input, std::span output); + NvResult BindChannel(std::span input, std::span output); void GetVARegionsImpl(IoctlGetVaRegions& params); - NvResult GetVARegions(std::span input, std::vector& output); - NvResult GetVARegions(std::span input, std::vector& output, - std::vector& inline_output); + NvResult GetVARegions(std::span input, std::span output); + NvResult GetVARegions(std::span input, std::span output, + std::span inline_output); void FreeMappingLocked(u64 offset); @@ -212,6 +213,7 @@ private: bool initialised{}; } vm; std::shared_ptr gmmu; + Common::ScratchBuffer entries; // s32 channel{}; // u32 big_page_size{VM::DEFAULT_BIG_PAGE_SIZE}; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp index e12025560..4d55554b4 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp @@ -35,7 +35,7 @@ nvhost_ctrl::~nvhost_ctrl() { } NvResult nvhost_ctrl::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 0x0: switch (command.cmd) { @@ -64,13 +64,13 @@ NvResult nvhost_ctrl::Ioctl1(DeviceFD fd, Ioctl command, std::span inp } NvResult nvhost_ctrl::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_ctrl::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_outpu) { + std::span output, std::span inline_outpu) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } @@ -79,7 +79,7 @@ void nvhost_ctrl::OnOpen(DeviceFD fd) {} void nvhost_ctrl::OnClose(DeviceFD fd) {} -NvResult nvhost_ctrl::NvOsGetConfigU32(std::span input, std::vector& output) { +NvResult nvhost_ctrl::NvOsGetConfigU32(std::span input, std::span output) { IocGetConfigParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); LOG_TRACE(Service_NVDRV, "called, setting={}!{}", params.domain_str.data(), @@ -87,7 +87,7 @@ NvResult nvhost_ctrl::NvOsGetConfigU32(std::span input, std::vector input, std::vector& output, +NvResult nvhost_ctrl::IocCtrlEventWait(std::span input, std::span output, bool is_allocation) { IocCtrlEventWaitParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); @@ -231,7 +231,7 @@ NvResult nvhost_ctrl::FreeEvent(u32 slot) { return NvResult::Success; } -NvResult nvhost_ctrl::IocCtrlEventRegister(std::span input, std::vector& output) { +NvResult nvhost_ctrl::IocCtrlEventRegister(std::span input, std::span output) { IocCtrlEventRegisterParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); const u32 event_id = params.user_event_id; @@ -252,7 +252,7 @@ NvResult nvhost_ctrl::IocCtrlEventRegister(std::span input, std::vecto return NvResult::Success; } -NvResult nvhost_ctrl::IocCtrlEventUnregister(std::span input, std::vector& output) { +NvResult nvhost_ctrl::IocCtrlEventUnregister(std::span input, std::span output) { IocCtrlEventUnregisterParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); const u32 event_id = params.user_event_id & 0x00FF; @@ -262,8 +262,7 @@ NvResult nvhost_ctrl::IocCtrlEventUnregister(std::span input, std::vec return FreeEvent(event_id); } -NvResult nvhost_ctrl::IocCtrlEventUnregisterBatch(std::span input, - std::vector& output) { +NvResult nvhost_ctrl::IocCtrlEventUnregisterBatch(std::span input, std::span output) { IocCtrlEventUnregisterBatchParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); u64 event_mask = params.user_events; @@ -281,7 +280,7 @@ NvResult nvhost_ctrl::IocCtrlEventUnregisterBatch(std::span input, return NvResult::Success; } -NvResult nvhost_ctrl::IocCtrlClearEventWait(std::span input, std::vector& output) { +NvResult nvhost_ctrl::IocCtrlClearEventWait(std::span input, std::span output) { IocCtrlEventClearParams params{}; std::memcpy(¶ms, input.data(), sizeof(params)); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h index dd2e7888a..2efed4862 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h @@ -26,11 +26,11 @@ public: ~nvhost_ctrl() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -186,13 +186,12 @@ private: static_assert(sizeof(IocCtrlEventUnregisterBatchParams) == 8, "IocCtrlEventKill is incorrect size"); - NvResult NvOsGetConfigU32(std::span input, std::vector& output); - NvResult IocCtrlEventWait(std::span input, std::vector& output, - bool is_allocation); - NvResult IocCtrlEventRegister(std::span input, std::vector& output); - NvResult IocCtrlEventUnregister(std::span input, std::vector& output); - NvResult IocCtrlEventUnregisterBatch(std::span input, std::vector& output); - NvResult IocCtrlClearEventWait(std::span input, std::vector& output); + NvResult NvOsGetConfigU32(std::span input, std::span output); + NvResult IocCtrlEventWait(std::span input, std::span output, bool is_allocation); + NvResult IocCtrlEventRegister(std::span input, std::span output); + NvResult IocCtrlEventUnregister(std::span input, std::span output); + NvResult IocCtrlEventUnregisterBatch(std::span input, std::span output); + NvResult IocCtrlClearEventWait(std::span input, std::span output); NvResult FreeEvent(u32 slot); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp index be3c083db..6081d92e9 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp @@ -22,7 +22,7 @@ nvhost_ctrl_gpu::~nvhost_ctrl_gpu() { } NvResult nvhost_ctrl_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 'G': switch (command.cmd) { @@ -54,13 +54,13 @@ NvResult nvhost_ctrl_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span } NvResult nvhost_ctrl_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_ctrl_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { switch (command.group) { case 'G': switch (command.cmd) { @@ -82,7 +82,7 @@ NvResult nvhost_ctrl_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span void nvhost_ctrl_gpu::OnOpen(DeviceFD fd) {} void nvhost_ctrl_gpu::OnClose(DeviceFD fd) {} -NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlCharacteristics params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -127,8 +127,8 @@ NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::vec return NvResult::Success; } -NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::vector& output, - std::vector& inline_output) { +NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::span output, + std::span inline_output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlCharacteristics params{}; std::memcpy(¶ms, input.data(), input.size()); @@ -175,7 +175,7 @@ NvResult nvhost_ctrl_gpu::GetCharacteristics(std::span input, std::vec return NvResult::Success; } -NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::span output) { IoctlGpuGetTpcMasksArgs params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size); @@ -186,8 +186,8 @@ NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::vector& output, - std::vector& inline_output) { +NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::span output, + std::span inline_output) { IoctlGpuGetTpcMasksArgs params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size); @@ -199,7 +199,7 @@ NvResult nvhost_ctrl_gpu::GetTPCMasks(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_ctrl_gpu::GetActiveSlotMask(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::GetActiveSlotMask(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlActiveSlotMask params{}; @@ -212,7 +212,7 @@ NvResult nvhost_ctrl_gpu::GetActiveSlotMask(std::span input, std::vect return NvResult::Success; } -NvResult nvhost_ctrl_gpu::ZCullGetCtxSize(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::ZCullGetCtxSize(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlZcullGetCtxSize params{}; @@ -224,7 +224,7 @@ NvResult nvhost_ctrl_gpu::ZCullGetCtxSize(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_ctrl_gpu::ZCullGetInfo(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::ZCullGetInfo(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlNvgpuGpuZcullGetInfoArgs params{}; @@ -247,7 +247,7 @@ NvResult nvhost_ctrl_gpu::ZCullGetInfo(std::span input, std::vector input, std::vector& output) { +NvResult nvhost_ctrl_gpu::ZBCSetTable(std::span input, std::span output) { LOG_WARNING(Service_NVDRV, "(STUBBED) called"); IoctlZbcSetTable params{}; @@ -263,7 +263,7 @@ NvResult nvhost_ctrl_gpu::ZBCSetTable(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_ctrl_gpu::ZBCQueryTable(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::ZBCQueryTable(std::span input, std::span output) { LOG_WARNING(Service_NVDRV, "(STUBBED) called"); IoctlZbcQueryTable params{}; @@ -273,7 +273,7 @@ NvResult nvhost_ctrl_gpu::ZBCQueryTable(std::span input, std::vector input, std::vector& output) { +NvResult nvhost_ctrl_gpu::FlushL2(std::span input, std::span output) { LOG_WARNING(Service_NVDRV, "(STUBBED) called"); IoctlFlushL2 params{}; @@ -283,7 +283,7 @@ NvResult nvhost_ctrl_gpu::FlushL2(std::span input, std::vector& ou return NvResult::Success; } -NvResult nvhost_ctrl_gpu::GetGpuTime(std::span input, std::vector& output) { +NvResult nvhost_ctrl_gpu::GetGpuTime(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlGetGpuTime params{}; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h index b9333d9d3..97995551c 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h @@ -22,11 +22,11 @@ public: ~nvhost_ctrl_gpu() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -151,21 +151,21 @@ private: }; static_assert(sizeof(IoctlGetGpuTime) == 0x10, "IoctlGetGpuTime is incorrect size"); - NvResult GetCharacteristics(std::span input, std::vector& output); - NvResult GetCharacteristics(std::span input, std::vector& output, - std::vector& inline_output); - - NvResult GetTPCMasks(std::span input, std::vector& output); - NvResult GetTPCMasks(std::span input, std::vector& output, - std::vector& inline_output); - - NvResult GetActiveSlotMask(std::span input, std::vector& output); - NvResult ZCullGetCtxSize(std::span input, std::vector& output); - NvResult ZCullGetInfo(std::span input, std::vector& output); - NvResult ZBCSetTable(std::span input, std::vector& output); - NvResult ZBCQueryTable(std::span input, std::vector& output); - NvResult FlushL2(std::span input, std::vector& output); - NvResult GetGpuTime(std::span input, std::vector& output); + NvResult GetCharacteristics(std::span input, std::span output); + NvResult GetCharacteristics(std::span input, std::span output, + std::span inline_output); + + NvResult GetTPCMasks(std::span input, std::span output); + NvResult GetTPCMasks(std::span input, std::span output, + std::span inline_output); + + NvResult GetActiveSlotMask(std::span input, std::span output); + NvResult ZCullGetCtxSize(std::span input, std::span output); + NvResult ZCullGetInfo(std::span input, std::span output); + NvResult ZBCSetTable(std::span input, std::span output); + NvResult ZBCQueryTable(std::span input, std::span output); + NvResult FlushL2(std::span input, std::span output); + NvResult GetGpuTime(std::span input, std::span output); EventInterface& events_interface; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp index 453a965dc..46a25fcab 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp @@ -47,7 +47,7 @@ nvhost_gpu::~nvhost_gpu() { } NvResult nvhost_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 0x0: switch (command.cmd) { @@ -99,7 +99,7 @@ NvResult nvhost_gpu::Ioctl1(DeviceFD fd, Ioctl command, std::span inpu }; NvResult nvhost_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { switch (command.group) { case 'H': switch (command.cmd) { @@ -113,7 +113,7 @@ NvResult nvhost_gpu::Ioctl2(DeviceFD fd, Ioctl command, std::span inpu } NvResult nvhost_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } @@ -121,7 +121,7 @@ NvResult nvhost_gpu::Ioctl3(DeviceFD fd, Ioctl command, std::span inpu void nvhost_gpu::OnOpen(DeviceFD fd) {} void nvhost_gpu::OnClose(DeviceFD fd) {} -NvResult nvhost_gpu::SetNVMAPfd(std::span input, std::vector& output) { +NvResult nvhost_gpu::SetNVMAPfd(std::span input, std::span output) { IoctlSetNvmapFD params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd); @@ -130,7 +130,7 @@ NvResult nvhost_gpu::SetNVMAPfd(std::span input, std::vector& outp return NvResult::Success; } -NvResult nvhost_gpu::SetClientData(std::span input, std::vector& output) { +NvResult nvhost_gpu::SetClientData(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlClientData params{}; @@ -139,7 +139,7 @@ NvResult nvhost_gpu::SetClientData(std::span input, std::vector& o return NvResult::Success; } -NvResult nvhost_gpu::GetClientData(std::span input, std::vector& output) { +NvResult nvhost_gpu::GetClientData(std::span input, std::span output) { LOG_DEBUG(Service_NVDRV, "called"); IoctlClientData params{}; @@ -149,7 +149,7 @@ NvResult nvhost_gpu::GetClientData(std::span input, std::vector& o return NvResult::Success; } -NvResult nvhost_gpu::ZCullBind(std::span input, std::vector& output) { +NvResult nvhost_gpu::ZCullBind(std::span input, std::span output) { std::memcpy(&zcull_params, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, gpu_va={:X}, mode={:X}", zcull_params.gpu_va, zcull_params.mode); @@ -158,7 +158,7 @@ NvResult nvhost_gpu::ZCullBind(std::span input, std::vector& outpu return NvResult::Success; } -NvResult nvhost_gpu::SetErrorNotifier(std::span input, std::vector& output) { +NvResult nvhost_gpu::SetErrorNotifier(std::span input, std::span output) { IoctlSetErrorNotifier params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_WARNING(Service_NVDRV, "(STUBBED) called, offset={:X}, size={:X}, mem={:X}", params.offset, @@ -168,14 +168,14 @@ NvResult nvhost_gpu::SetErrorNotifier(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_gpu::SetChannelPriority(std::span input, std::vector& output) { +NvResult nvhost_gpu::SetChannelPriority(std::span input, std::span output) { std::memcpy(&channel_priority, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "(STUBBED) called, priority={:X}", channel_priority); return NvResult::Success; } -NvResult nvhost_gpu::AllocGPFIFOEx2(std::span input, std::vector& output) { +NvResult nvhost_gpu::AllocGPFIFOEx2(std::span input, std::span output) { IoctlAllocGpfifoEx2 params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_WARNING(Service_NVDRV, @@ -197,7 +197,7 @@ NvResult nvhost_gpu::AllocGPFIFOEx2(std::span input, std::vector& return NvResult::Success; } -NvResult nvhost_gpu::AllocateObjectContext(std::span input, std::vector& output) { +NvResult nvhost_gpu::AllocateObjectContext(std::span input, std::span output) { IoctlAllocObjCtx params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_WARNING(Service_NVDRV, "(STUBBED) called, class_num={:X}, flags={:X}", params.class_num, @@ -208,7 +208,8 @@ NvResult nvhost_gpu::AllocateObjectContext(std::span input, std::vecto return NvResult::Success; } -static std::vector BuildWaitCommandList(NvFence fence) { +static boost::container::small_vector BuildWaitCommandList( + NvFence fence) { return { Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointPayload, 1, Tegra::SubmissionMode::Increasing), @@ -219,35 +220,35 @@ static std::vector BuildWaitCommandList(NvFence fence) { }; } -static std::vector BuildIncrementCommandList(NvFence fence) { - std::vector result{ +static boost::container::small_vector BuildIncrementCommandList( + NvFence fence) { + boost::container::small_vector result{ Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointPayload, 1, Tegra::SubmissionMode::Increasing), {}}; for (u32 count = 0; count < 2; ++count) { - result.emplace_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointOperation, 1, - Tegra::SubmissionMode::Increasing)); - result.emplace_back( + result.push_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::SyncpointOperation, 1, + Tegra::SubmissionMode::Increasing)); + result.push_back( BuildFenceAction(Tegra::Engines::Puller::FenceOperation::Increment, fence.id)); } return result; } -static std::vector BuildIncrementWithWfiCommandList(NvFence fence) { - std::vector result{ +static boost::container::small_vector BuildIncrementWithWfiCommandList( + NvFence fence) { + boost::container::small_vector result{ Tegra::BuildCommandHeader(Tegra::BufferMethods::WaitForIdle, 1, Tegra::SubmissionMode::Increasing), {}}; - const std::vector increment{BuildIncrementCommandList(fence)}; - - result.insert(result.end(), increment.begin(), increment.end()); - + auto increment_list{BuildIncrementCommandList(fence)}; + result.insert(result.end(), increment_list.begin(), increment_list.end()); return result; } -NvResult nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector& output, +NvResult nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::span output, Tegra::CommandList&& entries) { LOG_TRACE(Service_NVDRV, "called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address, params.num_entries, params.flags.raw); @@ -293,7 +294,7 @@ NvResult nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector return NvResult::Success; } -NvResult nvhost_gpu::SubmitGPFIFOBase(std::span input, std::vector& output, +NvResult nvhost_gpu::SubmitGPFIFOBase(std::span input, std::span output, bool kickoff) { if (input.size() < sizeof(IoctlSubmitGpfifo)) { UNIMPLEMENTED(); @@ -315,7 +316,7 @@ NvResult nvhost_gpu::SubmitGPFIFOBase(std::span input, std::vector } NvResult nvhost_gpu::SubmitGPFIFOBase(std::span input, std::span input_inline, - std::vector& output) { + std::span output) { if (input.size() < sizeof(IoctlSubmitGpfifo)) { UNIMPLEMENTED(); return NvResult::InvalidSize; @@ -327,7 +328,7 @@ NvResult nvhost_gpu::SubmitGPFIFOBase(std::span input, std::span input, std::vector& output) { +NvResult nvhost_gpu::GetWaitbase(std::span input, std::span output) { IoctlGetWaitbase params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlGetWaitbase)); LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown); @@ -337,7 +338,7 @@ NvResult nvhost_gpu::GetWaitbase(std::span input, std::vector& out return NvResult::Success; } -NvResult nvhost_gpu::ChannelSetTimeout(std::span input, std::vector& output) { +NvResult nvhost_gpu::ChannelSetTimeout(std::span input, std::span output) { IoctlChannelSetTimeout params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlChannelSetTimeout)); LOG_INFO(Service_NVDRV, "called, timeout=0x{:X}", params.timeout); @@ -345,7 +346,7 @@ NvResult nvhost_gpu::ChannelSetTimeout(std::span input, std::vector input, std::vector& output) { +NvResult nvhost_gpu::ChannelSetTimeslice(std::span input, std::span output) { IoctlSetTimeslice params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlSetTimeslice)); LOG_INFO(Service_NVDRV, "called, timeslice=0x{:X}", params.timeslice); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h index 3ca58202d..529c20526 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h @@ -41,11 +41,11 @@ public: ~nvhost_gpu() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -186,23 +186,23 @@ private: u32_le channel_priority{}; u32_le channel_timeslice{}; - NvResult SetNVMAPfd(std::span input, std::vector& output); - NvResult SetClientData(std::span input, std::vector& output); - NvResult GetClientData(std::span input, std::vector& output); - NvResult ZCullBind(std::span input, std::vector& output); - NvResult SetErrorNotifier(std::span input, std::vector& output); - NvResult SetChannelPriority(std::span input, std::vector& output); - NvResult AllocGPFIFOEx2(std::span input, std::vector& output); - NvResult AllocateObjectContext(std::span input, std::vector& output); - NvResult SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector& output, + NvResult SetNVMAPfd(std::span input, std::span output); + NvResult SetClientData(std::span input, std::span output); + NvResult GetClientData(std::span input, std::span output); + NvResult ZCullBind(std::span input, std::span output); + NvResult SetErrorNotifier(std::span input, std::span output); + NvResult SetChannelPriority(std::span input, std::span output); + NvResult AllocGPFIFOEx2(std::span input, std::span output); + NvResult AllocateObjectContext(std::span input, std::span output); + NvResult SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::span output, Tegra::CommandList&& entries); - NvResult SubmitGPFIFOBase(std::span input, std::vector& output, + NvResult SubmitGPFIFOBase(std::span input, std::span output, bool kickoff = false); NvResult SubmitGPFIFOBase(std::span input, std::span input_inline, - std::vector& output); - NvResult GetWaitbase(std::span input, std::vector& output); - NvResult ChannelSetTimeout(std::span input, std::vector& output); - NvResult ChannelSetTimeslice(std::span input, std::vector& output); + std::span output); + NvResult GetWaitbase(std::span input, std::span output); + NvResult ChannelSetTimeout(std::span input, std::span output); + NvResult ChannelSetTimeslice(std::span input, std::span output); EventInterface& events_interface; NvCore::Container& core; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp index dc45169ad..a174442a6 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp @@ -16,7 +16,7 @@ nvhost_nvdec::nvhost_nvdec(Core::System& system_, NvCore::Container& core_) nvhost_nvdec::~nvhost_nvdec() = default; NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 0x0: switch (command.cmd) { @@ -56,13 +56,13 @@ NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span in } NvResult nvhost_nvdec::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_nvdec::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h index 0d615bbcb..ad2233c49 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h @@ -14,11 +14,11 @@ public: ~nvhost_nvdec() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp index 1ab51f10b..61649aa4a 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp @@ -36,7 +36,7 @@ std::size_t SliceVectors(std::span input, std::vector& dst, std::si // Writes the data in src to an offset into the dst vector. The offset is specified in bytes // Returns the number of bytes written into dst. template -std::size_t WriteVectors(std::vector& dst, const std::vector& src, std::size_t offset) { +std::size_t WriteVectors(std::span dst, const std::vector& src, std::size_t offset) { if (src.empty()) { return 0; } @@ -72,8 +72,7 @@ NvResult nvhost_nvdec_common::SetNVMAPfd(std::span input) { return NvResult::Success; } -NvResult nvhost_nvdec_common::Submit(DeviceFD fd, std::span input, - std::vector& output) { +NvResult nvhost_nvdec_common::Submit(DeviceFD fd, std::span input, std::span output) { IoctlSubmit params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlSubmit)); LOG_DEBUG(Service_NVDRV, "called NVDEC Submit, cmd_buffer_count={}", params.cmd_buffer_count); @@ -121,7 +120,7 @@ NvResult nvhost_nvdec_common::Submit(DeviceFD fd, std::span input, return NvResult::Success; } -NvResult nvhost_nvdec_common::GetSyncpoint(std::span input, std::vector& output) { +NvResult nvhost_nvdec_common::GetSyncpoint(std::span input, std::span output) { IoctlGetSyncpoint params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlGetSyncpoint)); LOG_DEBUG(Service_NVDRV, "called GetSyncpoint, id={}", params.param); @@ -133,7 +132,7 @@ NvResult nvhost_nvdec_common::GetSyncpoint(std::span input, std::vecto return NvResult::Success; } -NvResult nvhost_nvdec_common::GetWaitbase(std::span input, std::vector& output) { +NvResult nvhost_nvdec_common::GetWaitbase(std::span input, std::span output) { IoctlGetWaitbase params{}; LOG_CRITICAL(Service_NVDRV, "called WAITBASE"); std::memcpy(¶ms, input.data(), sizeof(IoctlGetWaitbase)); @@ -142,7 +141,7 @@ NvResult nvhost_nvdec_common::GetWaitbase(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_nvdec_common::MapBuffer(std::span input, std::vector& output) { +NvResult nvhost_nvdec_common::MapBuffer(std::span input, std::span output) { IoctlMapBuffer params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlMapBuffer)); std::vector cmd_buffer_handles(params.num_entries); @@ -159,7 +158,7 @@ NvResult nvhost_nvdec_common::MapBuffer(std::span input, std::vector input, std::vector& output) { +NvResult nvhost_nvdec_common::UnmapBuffer(std::span input, std::span output) { IoctlMapBuffer params{}; std::memcpy(¶ms, input.data(), sizeof(IoctlMapBuffer)); std::vector cmd_buffer_handles(params.num_entries); @@ -173,7 +172,7 @@ NvResult nvhost_nvdec_common::UnmapBuffer(std::span input, std::vector return NvResult::Success; } -NvResult nvhost_nvdec_common::SetSubmitTimeout(std::span input, std::vector& output) { +NvResult nvhost_nvdec_common::SetSubmitTimeout(std::span input, std::span output) { std::memcpy(&submit_timeout, input.data(), input.size()); LOG_WARNING(Service_NVDRV, "(STUBBED) called"); return NvResult::Success; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h index 5af26a26f..9bb573bfe 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h @@ -108,12 +108,12 @@ protected: /// Ioctl command implementations NvResult SetNVMAPfd(std::span input); - NvResult Submit(DeviceFD fd, std::span input, std::vector& output); - NvResult GetSyncpoint(std::span input, std::vector& output); - NvResult GetWaitbase(std::span input, std::vector& output); - NvResult MapBuffer(std::span input, std::vector& output); - NvResult UnmapBuffer(std::span input, std::vector& output); - NvResult SetSubmitTimeout(std::span input, std::vector& output); + NvResult Submit(DeviceFD fd, std::span input, std::span output); + NvResult GetSyncpoint(std::span input, std::span output); + NvResult GetWaitbase(std::span input, std::span output); + NvResult MapBuffer(std::span input, std::span output); + NvResult UnmapBuffer(std::span input, std::span output); + NvResult SetSubmitTimeout(std::span input, std::span output); Kernel::KEvent* QueryEvent(u32 event_id) override; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp index 39f30e7c8..a05c8cdae 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp @@ -13,7 +13,7 @@ nvhost_nvjpg::nvhost_nvjpg(Core::System& system_) : nvdevice{system_} {} nvhost_nvjpg::~nvhost_nvjpg() = default; NvResult nvhost_nvjpg::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 'H': switch (command.cmd) { @@ -32,13 +32,13 @@ NvResult nvhost_nvjpg::Ioctl1(DeviceFD fd, Ioctl command, std::span in } NvResult nvhost_nvjpg::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_nvjpg::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } @@ -46,7 +46,7 @@ NvResult nvhost_nvjpg::Ioctl3(DeviceFD fd, Ioctl command, std::span in void nvhost_nvjpg::OnOpen(DeviceFD fd) {} void nvhost_nvjpg::OnClose(DeviceFD fd) {} -NvResult nvhost_nvjpg::SetNVMAPfd(std::span input, std::vector& output) { +NvResult nvhost_nvjpg::SetNVMAPfd(std::span input, std::span output) { IoctlSetNvmapFD params{}; std::memcpy(¶ms, input.data(), input.size()); LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd); diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h index 41b57e872..5623e0d47 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h @@ -16,11 +16,11 @@ public: ~nvhost_nvjpg() override; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -33,7 +33,7 @@ private: s32_le nvmap_fd{}; - NvResult SetNVMAPfd(std::span input, std::vector& output); + NvResult SetNVMAPfd(std::span input, std::span output); }; } // namespace Service::Nvidia::Devices diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp index b0ea402a7..c0b8684c3 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp @@ -16,7 +16,7 @@ nvhost_vic::nvhost_vic(Core::System& system_, NvCore::Container& core_) nvhost_vic::~nvhost_vic() = default; NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 0x0: switch (command.cmd) { @@ -56,13 +56,13 @@ NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span inpu } NvResult nvhost_vic::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } NvResult nvhost_vic::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { + std::span output, std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.h b/src/core/hle/service/nvdrv/devices/nvhost_vic.h index b5e350a83..cadbcb0a5 100644 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h @@ -13,11 +13,11 @@ public: ~nvhost_vic(); NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; diff --git a/src/core/hle/service/nvdrv/devices/nvmap.cpp b/src/core/hle/service/nvdrv/devices/nvmap.cpp index 07417f045..e7f7e273b 100644 --- a/src/core/hle/service/nvdrv/devices/nvmap.cpp +++ b/src/core/hle/service/nvdrv/devices/nvmap.cpp @@ -26,7 +26,7 @@ nvmap::nvmap(Core::System& system_, NvCore::Container& container_) nvmap::~nvmap() = default; NvResult nvmap::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { switch (command.group) { case 0x1: switch (command.cmd) { @@ -55,13 +55,13 @@ NvResult nvmap::Ioctl1(DeviceFD fd, Ioctl command, std::span input, } NvResult nvmap::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } -NvResult nvmap::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { +NvResult nvmap::Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) { UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw); return NvResult::NotImplemented; } @@ -69,7 +69,7 @@ NvResult nvmap::Ioctl3(DeviceFD fd, Ioctl command, std::span input, void nvmap::OnOpen(DeviceFD fd) {} void nvmap::OnClose(DeviceFD fd) {} -NvResult nvmap::IocCreate(std::span input, std::vector& output) { +NvResult nvmap::IocCreate(std::span input, std::span output) { IocCreateParams params; std::memcpy(¶ms, input.data(), sizeof(params)); LOG_DEBUG(Service_NVDRV, "called, size=0x{:08X}", params.size); @@ -89,7 +89,7 @@ NvResult nvmap::IocCreate(std::span input, std::vector& output) { return NvResult::Success; } -NvResult nvmap::IocAlloc(std::span input, std::vector& output) { +NvResult nvmap::IocAlloc(std::span input, std::span output) { IocAllocParams params; std::memcpy(¶ms, input.data(), sizeof(params)); LOG_DEBUG(Service_NVDRV, "called, addr={:X}", params.address); @@ -137,7 +137,7 @@ NvResult nvmap::IocAlloc(std::span input, std::vector& output) { return result; } -NvResult nvmap::IocGetId(std::span input, std::vector& output) { +NvResult nvmap::IocGetId(std::span input, std::span output) { IocGetIdParams params; std::memcpy(¶ms, input.data(), sizeof(params)); @@ -161,7 +161,7 @@ NvResult nvmap::IocGetId(std::span input, std::vector& output) { return NvResult::Success; } -NvResult nvmap::IocFromId(std::span input, std::vector& output) { +NvResult nvmap::IocFromId(std::span input, std::span output) { IocFromIdParams params; std::memcpy(¶ms, input.data(), sizeof(params)); @@ -192,7 +192,7 @@ NvResult nvmap::IocFromId(std::span input, std::vector& output) { return NvResult::Success; } -NvResult nvmap::IocParam(std::span input, std::vector& output) { +NvResult nvmap::IocParam(std::span input, std::span output) { enum class ParamTypes { Size = 1, Alignment = 2, Base = 3, Heap = 4, Kind = 5, Compr = 6 }; IocParamParams params; @@ -241,7 +241,7 @@ NvResult nvmap::IocParam(std::span input, std::vector& output) { return NvResult::Success; } -NvResult nvmap::IocFree(std::span input, std::vector& output) { +NvResult nvmap::IocFree(std::span input, std::span output) { IocFreeParams params; std::memcpy(¶ms, input.data(), sizeof(params)); diff --git a/src/core/hle/service/nvdrv/devices/nvmap.h b/src/core/hle/service/nvdrv/devices/nvmap.h index 82bd3b118..40c65b430 100644 --- a/src/core/hle/service/nvdrv/devices/nvmap.h +++ b/src/core/hle/service/nvdrv/devices/nvmap.h @@ -27,11 +27,11 @@ public: nvmap& operator=(const nvmap&) = delete; NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) override; + std::span output) override; NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) override; - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output) override; + std::span inline_input, std::span output) override; + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) override; void OnOpen(DeviceFD fd) override; void OnClose(DeviceFD fd) override; @@ -106,12 +106,12 @@ private: }; static_assert(sizeof(IocGetIdParams) == 8, "IocGetIdParams has wrong size"); - NvResult IocCreate(std::span input, std::vector& output); - NvResult IocAlloc(std::span input, std::vector& output); - NvResult IocGetId(std::span input, std::vector& output); - NvResult IocFromId(std::span input, std::vector& output); - NvResult IocParam(std::span input, std::vector& output); - NvResult IocFree(std::span input, std::vector& output); + NvResult IocCreate(std::span input, std::span output); + NvResult IocAlloc(std::span input, std::span output); + NvResult IocGetId(std::span input, std::span output); + NvResult IocFromId(std::span input, std::span output); + NvResult IocParam(std::span input, std::span output); + NvResult IocFree(std::span input, std::span output); NvCore::Container& container; NvCore::NvMap& file; diff --git a/src/core/hle/service/nvdrv/nvdrv.cpp b/src/core/hle/service/nvdrv/nvdrv.cpp index 3d774eec4..9e46ee8dd 100644 --- a/src/core/hle/service/nvdrv/nvdrv.cpp +++ b/src/core/hle/service/nvdrv/nvdrv.cpp @@ -130,7 +130,7 @@ DeviceFD Module::Open(const std::string& device_name) { } NvResult Module::Ioctl1(DeviceFD fd, Ioctl command, std::span input, - std::vector& output) { + std::span output) { if (fd < 0) { LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd); return NvResult::InvalidState; @@ -147,7 +147,7 @@ NvResult Module::Ioctl1(DeviceFD fd, Ioctl command, std::span input, } NvResult Module::Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output) { + std::span inline_input, std::span output) { if (fd < 0) { LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd); return NvResult::InvalidState; @@ -163,8 +163,8 @@ NvResult Module::Ioctl2(DeviceFD fd, Ioctl command, std::span input, return itr->second->Ioctl2(fd, command, input, inline_input, output); } -NvResult Module::Ioctl3(DeviceFD fd, Ioctl command, std::span input, - std::vector& output, std::vector& inline_output) { +NvResult Module::Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output) { if (fd < 0) { LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd); return NvResult::InvalidState; diff --git a/src/core/hle/service/nvdrv/nvdrv.h b/src/core/hle/service/nvdrv/nvdrv.h index 668be742b..d8622b3ca 100644 --- a/src/core/hle/service/nvdrv/nvdrv.h +++ b/src/core/hle/service/nvdrv/nvdrv.h @@ -80,13 +80,13 @@ public: DeviceFD Open(const std::string& device_name); /// Sends an ioctl command to the specified file descriptor. - NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, std::vector& output); + NvResult Ioctl1(DeviceFD fd, Ioctl command, std::span input, std::span output); NvResult Ioctl2(DeviceFD fd, Ioctl command, std::span input, - std::span inline_input, std::vector& output); + std::span inline_input, std::span output); - NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::vector& output, - std::vector& inline_output); + NvResult Ioctl3(DeviceFD fd, Ioctl command, std::span input, std::span output, + std::span inline_output); /// Closes a device file descriptor and returns operation success. NvResult Close(DeviceFD fd); diff --git a/src/core/hle/service/nvdrv/nvdrv_interface.cpp b/src/core/hle/service/nvdrv/nvdrv_interface.cpp index d010a1e03..348207e25 100644 --- a/src/core/hle/service/nvdrv/nvdrv_interface.cpp +++ b/src/core/hle/service/nvdrv/nvdrv_interface.cpp @@ -63,12 +63,12 @@ void NVDRV::Ioctl1(HLERequestContext& ctx) { } // Check device - std::vector output_buffer(ctx.GetWriteBufferSize(0)); + tmp_output.resize_destructive(ctx.GetWriteBufferSize(0)); const auto input_buffer = ctx.ReadBuffer(0); - const auto nv_result = nvdrv->Ioctl1(fd, command, input_buffer, output_buffer); + const auto nv_result = nvdrv->Ioctl1(fd, command, input_buffer, tmp_output); if (command.is_out != 0) { - ctx.WriteBuffer(output_buffer); + ctx.WriteBuffer(tmp_output); } IPC::ResponseBuilder rb{ctx, 3}; @@ -90,12 +90,12 @@ void NVDRV::Ioctl2(HLERequestContext& ctx) { const auto input_buffer = ctx.ReadBuffer(0); const auto input_inlined_buffer = ctx.ReadBuffer(1); - std::vector output_buffer(ctx.GetWriteBufferSize(0)); + tmp_output.resize_destructive(ctx.GetWriteBufferSize(0)); const auto nv_result = - nvdrv->Ioctl2(fd, command, input_buffer, input_inlined_buffer, output_buffer); + nvdrv->Ioctl2(fd, command, input_buffer, input_inlined_buffer, tmp_output); if (command.is_out != 0) { - ctx.WriteBuffer(output_buffer); + ctx.WriteBuffer(tmp_output); } IPC::ResponseBuilder rb{ctx, 3}; @@ -116,14 +116,12 @@ void NVDRV::Ioctl3(HLERequestContext& ctx) { } const auto input_buffer = ctx.ReadBuffer(0); - std::vector output_buffer(ctx.GetWriteBufferSize(0)); - std::vector output_buffer_inline(ctx.GetWriteBufferSize(1)); - - const auto nv_result = - nvdrv->Ioctl3(fd, command, input_buffer, output_buffer, output_buffer_inline); + tmp_output.resize_destructive(ctx.GetWriteBufferSize(0)); + tmp_output_inline.resize_destructive(ctx.GetWriteBufferSize(1)); + const auto nv_result = nvdrv->Ioctl3(fd, command, input_buffer, tmp_output, tmp_output_inline); if (command.is_out != 0) { - ctx.WriteBuffer(output_buffer, 0); - ctx.WriteBuffer(output_buffer_inline, 1); + ctx.WriteBuffer(tmp_output, 0); + ctx.WriteBuffer(tmp_output_inline, 1); } IPC::ResponseBuilder rb{ctx, 3}; diff --git a/src/core/hle/service/nvdrv/nvdrv_interface.h b/src/core/hle/service/nvdrv/nvdrv_interface.h index 881ea1a6b..4b593ff90 100644 --- a/src/core/hle/service/nvdrv/nvdrv_interface.h +++ b/src/core/hle/service/nvdrv/nvdrv_interface.h @@ -4,6 +4,7 @@ #pragma once #include +#include "common/scratch_buffer.h" #include "core/hle/service/nvdrv/nvdrv.h" #include "core/hle/service/service.h" @@ -33,6 +34,8 @@ private: u64 pid{}; bool is_initialized{}; + Common::ScratchBuffer tmp_output; + Common::ScratchBuffer tmp_output_inline; }; } // namespace Service::Nvidia diff --git a/src/core/hle/service/nvnflinger/parcel.h b/src/core/hle/service/nvnflinger/parcel.h index fb56d75d7..23ba315a0 100644 --- a/src/core/hle/service/nvnflinger/parcel.h +++ b/src/core/hle/service/nvnflinger/parcel.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "common/alignment.h" #include "common/assert.h" @@ -167,7 +168,7 @@ public: private: template requires(std::is_trivially_copyable_v) - void WriteImpl(const T& val, std::vector& buffer) { + void WriteImpl(const T& val, boost::container::small_vector& buffer) { const size_t aligned_size = Common::AlignUp(sizeof(T), 4); const size_t old_size = buffer.size(); buffer.resize(old_size + aligned_size); @@ -176,8 +177,8 @@ private: } private: - std::vector m_data_buffer; - std::vector m_object_buffer; + boost::container::small_vector m_data_buffer; + boost::container::small_vector m_object_buffer; }; } // namespace Service::android diff --git a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp index c3c2281bb..9ff4028c2 100644 --- a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp +++ b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp @@ -479,7 +479,7 @@ void EmitContext::DefineGenericOutput(size_t index, u32 invocations) { const u32 remainder{4 - element}; const TransformFeedbackVarying* xfb_varying{}; const size_t xfb_varying_index{base_index + element}; - if (xfb_varying_index < runtime_info.xfb_varyings.size()) { + if (xfb_varying_index < runtime_info.xfb_count) { xfb_varying = &runtime_info.xfb_varyings[xfb_varying_index]; xfb_varying = xfb_varying->components > 0 ? xfb_varying : nullptr; } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index 0f86a8004..34592a01f 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -387,7 +387,7 @@ void SetupSignedNanCapabilities(const Profile& profile, const IR::Program& progr } void SetupTransformFeedbackCapabilities(EmitContext& ctx, Id main_func) { - if (ctx.runtime_info.xfb_varyings.empty()) { + if (ctx.runtime_info.xfb_count == 0) { return; } ctx.AddCapability(spv::Capability::TransformFeedback); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index fd15f47ea..bec5db173 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -160,7 +160,7 @@ void DefineGenericOutput(EmitContext& ctx, size_t index, std::optional invo const u32 remainder{4 - element}; const TransformFeedbackVarying* xfb_varying{}; const size_t xfb_varying_index{base_attr_index + element}; - if (xfb_varying_index < ctx.runtime_info.xfb_varyings.size()) { + if (xfb_varying_index < ctx.runtime_info.xfb_count) { xfb_varying = &ctx.runtime_info.xfb_varyings[xfb_varying_index]; xfb_varying = xfb_varying->components > 0 ? xfb_varying : nullptr; } diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 3b63c249f..619c0b138 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -84,7 +84,8 @@ struct RuntimeInfo { bool glasm_use_storage_buffers{}; /// Transform feedback state for each varying - std::vector xfb_varyings; + std::array xfb_varyings{}; + u32 xfb_count{0}; }; } // namespace Shader diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 45977d578..58a45ab67 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -207,7 +207,7 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am if (has_new_downloads) { memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount); } - tmp_buffer.resize(amount); + tmp_buffer.resize_destructive(amount); cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); cpu_memory.WriteBlockUnsafe(*cpu_dest_address, tmp_buffer.data(), amount); return true; @@ -1279,7 +1279,7 @@ template typename BufferCache

::OverlapResult BufferCache

::ResolveOverlaps(VAddr cpu_addr, u32 wanted_size) { static constexpr int STREAM_LEAP_THRESHOLD = 16; - std::vector overlap_ids; + boost::container::small_vector overlap_ids; VAddr begin = cpu_addr; VAddr end = cpu_addr + wanted_size; int stream_score = 0; diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 63a120f7a..fe6068cfe 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -229,7 +229,7 @@ class BufferCache : public VideoCommon::ChannelSetupCaches; struct OverlapResult { - std::vector ids; + boost::container::small_vector ids; VAddr begin; VAddr end; bool has_stream_leap = false; @@ -582,7 +582,7 @@ private: BufferId inline_buffer_id; std::array> CACHING_PAGEBITS)> page_table; - std::vector tmp_buffer; + Common::ScratchBuffer tmp_buffer; }; } // namespace VideoCommon diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index 83112dfce..7d660af47 100644 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -63,7 +63,6 @@ struct ChCommand { }; using ChCommandHeaderList = std::vector; -using ChCommandList = std::vector; struct ThiRegisters { u32_le increment_syncpt{}; diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index 1cdb690ed..8a2784cdc 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "common/bit_field.h" @@ -102,11 +103,12 @@ inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, Sub struct CommandList final { CommandList() = default; explicit CommandList(std::size_t size) : command_lists(size) {} - explicit CommandList(std::vector&& prefetch_command_list_) + explicit CommandList( + boost::container::small_vector&& prefetch_command_list_) : prefetch_command_list{std::move(prefetch_command_list_)} {} - std::vector command_lists; - std::vector prefetch_command_list; + boost::container::small_vector command_lists; + boost::container::small_vector prefetch_command_list; }; /** diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index ebe5536de..bc1eb41e7 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -108,9 +108,11 @@ void MaxwellDMA::Launch() { if (regs.launch_dma.remap_enable != 0 && is_const_a_dst) { ASSERT(regs.remap_const.component_size_minus_one == 3); accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value); - std::vector tmp_buffer(regs.line_length_in, regs.remap_consta_value); + read_buffer.resize_destructive(regs.line_length_in * sizeof(u32)); + std::span span(reinterpret_cast(read_buffer.data()), regs.line_length_in); + std::ranges::fill(span, regs.remap_consta_value); memory_manager.WriteBlockUnsafe(regs.offset_out, - reinterpret_cast(tmp_buffer.data()), + reinterpret_cast(read_buffer.data()), regs.line_length_in * sizeof(u32)); } else { memory_manager.FlushCaching(); @@ -126,32 +128,32 @@ void MaxwellDMA::Launch() { UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_out % 16 != 0); - std::vector tmp_buffer(16); + read_buffer.resize_destructive(16); for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { memory_manager.ReadBlockUnsafe( convert_linear_2_blocklinear_addr(regs.offset_in + offset), - tmp_buffer.data(), tmp_buffer.size()); - memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(), - tmp_buffer.size()); + read_buffer.data(), read_buffer.size()); + memory_manager.WriteBlockCached(regs.offset_out + offset, read_buffer.data(), + read_buffer.size()); } } else if (is_src_pitch && !is_dst_pitch) { UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_in % 16 != 0); UNIMPLEMENTED_IF(regs.offset_out % 16 != 0); - std::vector tmp_buffer(16); + read_buffer.resize_destructive(16); for (u32 offset = 0; offset < regs.line_length_in; offset += 16) { - memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(), - tmp_buffer.size()); + memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), + read_buffer.size()); memory_manager.WriteBlockCached( convert_linear_2_blocklinear_addr(regs.offset_out + offset), - tmp_buffer.data(), tmp_buffer.size()); + read_buffer.data(), read_buffer.size()); } } else { if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) { - std::vector tmp_buffer(regs.line_length_in); - memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), + read_buffer.resize_destructive(regs.line_length_in); + memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), regs.line_length_in); - memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(), + memory_manager.WriteBlockCached(regs.offset_out, read_buffer.data(), regs.line_length_in); } } @@ -171,7 +173,8 @@ void MaxwellDMA::CopyBlockLinearToPitch() { src_operand.address = regs.offset_in; DMA::BufferOperand dst_operand; - dst_operand.pitch = regs.pitch_out; + u32 abs_pitch_out = std::abs(static_cast(regs.pitch_out)); + dst_operand.pitch = abs_pitch_out; dst_operand.width = regs.line_length_in; dst_operand.height = regs.line_count; dst_operand.address = regs.offset_out; @@ -218,7 +221,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { const size_t src_size = CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); - const size_t dst_size = static_cast(regs.pitch_out) * regs.line_count; + const size_t dst_size = static_cast(abs_pitch_out) * regs.line_count; read_buffer.resize_destructive(src_size); write_buffer.resize_destructive(dst_size); @@ -227,7 +230,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() { UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, - regs.pitch_out); + abs_pitch_out); memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size); } diff --git a/src/video_core/host1x/codecs/h264.cpp b/src/video_core/host1x/codecs/h264.cpp index 6ce179167..ce827eb6c 100644 --- a/src/video_core/host1x/codecs/h264.cpp +++ b/src/video_core/host1x/codecs/h264.cpp @@ -4,6 +4,7 @@ #include #include +#include "common/scratch_buffer.h" #include "common/settings.h" #include "video_core/host1x/codecs/h264.h" #include "video_core/host1x/host1x.h" @@ -188,7 +189,8 @@ void H264BitWriter::WriteBit(bool state) { } void H264BitWriter::WriteScalingList(std::span list, s32 start, s32 count) { - std::vector scan(count); + static Common::ScratchBuffer scan{}; + scan.resize_destructive(count); if (count == 16) { std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); } else { diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index b2f7e160a..45141e488 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -587,7 +587,7 @@ void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size, void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size, VideoCommon::CacheType which) { - std::vector tmp_buffer(size); + tmp_buffer.resize_destructive(size); ReadBlock(gpu_src_addr, tmp_buffer.data(), size, which); // The output block must be flushed in case it has data modified from the GPU. @@ -670,9 +670,9 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons return result; } -std::vector> MemoryManager::GetSubmappedRange( - GPUVAddr gpu_addr, std::size_t size) const { - std::vector> result{}; +boost::container::small_vector, 32> +MemoryManager::GetSubmappedRange(GPUVAddr gpu_addr, std::size_t size) const { + boost::container::small_vector, 32> result{}; GetSubmappedRangeImpl(gpu_addr, size, result); return result; } @@ -680,8 +680,9 @@ std::vector> MemoryManager::GetSubmappedRange( template void MemoryManager::GetSubmappedRangeImpl( GPUVAddr gpu_addr, std::size_t size, - std::vector, std::size_t>>& - result) const { + boost::container::small_vector< + std::pair, std::size_t>, 32>& result) + const { std::optional, std::size_t>> last_segment{}; std::optional old_page_addr{}; diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 794535122..4202c26ff 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -8,10 +8,12 @@ #include #include #include +#include #include "common/common_types.h" #include "common/multi_level_page_table.h" #include "common/range_map.h" +#include "common/scratch_buffer.h" #include "common/virtual_buffer.h" #include "video_core/cache_types.h" #include "video_core/pte_kind.h" @@ -107,8 +109,8 @@ public: * if the region is continuous, a single pair will be returned. If it's unmapped, an empty * vector will be returned; */ - std::vector> GetSubmappedRange(GPUVAddr gpu_addr, - std::size_t size) const; + boost::container::small_vector, 32> GetSubmappedRange( + GPUVAddr gpu_addr, std::size_t size) const; GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size, PTEKind kind = PTEKind::INVALID, bool is_big_pages = true); @@ -165,7 +167,8 @@ private: template void GetSubmappedRangeImpl( GPUVAddr gpu_addr, std::size_t size, - std::vector, std::size_t>>& + boost::container::small_vector< + std::pair, std::size_t>, 32>& result) const; Core::System& system; @@ -215,8 +218,8 @@ private: Common::VirtualBuffer big_page_table_cpu; std::vector big_page_continuous; - std::vector> page_stash{}; - std::vector> page_stash2{}; + boost::container::small_vector, 32> page_stash{}; + boost::container::small_vector, 32> page_stash2{}; mutable std::mutex guard; @@ -226,6 +229,8 @@ private: std::unique_ptr accumulator; static std::atomic unique_identifier_generator; + + Common::ScratchBuffer tmp_buffer; }; } // namespace Tegra diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 3f077311e..0329ed820 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -85,7 +85,9 @@ Shader::RuntimeInfo MakeRuntimeInfo(const GraphicsPipelineKey& key, case Shader::Stage::VertexB: case Shader::Stage::Geometry: if (!use_assembly_shaders && key.xfb_enabled != 0) { - info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.xfb_state); + auto [varyings, count] = VideoCommon::MakeTransformFeedbackVaryings(key.xfb_state); + info.xfb_varyings = varyings; + info.xfb_count = count; } break; case Shader::Stage::TessellationEval: diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index e30fcb1ed..f47301ad5 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -361,7 +361,7 @@ void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, }; // Measuring a popular game, this number never exceeds the specified size once data is warmed up - boost::container::small_vector vk_copies(copies.size()); + boost::container::small_vector vk_copies(copies.size()); std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy); scheduler.RequestOutsideRenderPassOperationContext(); scheduler.Record([src_buffer, dst_buffer, vk_copies, barrier](vk::CommandBuffer cmdbuf) { diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index a2cfb2105..9f316113c 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -167,7 +167,10 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span program info.fixed_state_point_size = point_size; } if (key.state.xfb_enabled) { - info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + auto [varyings, count] = + VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + info.xfb_varyings = varyings; + info.xfb_count = count; } info.convert_depth_mode = gl_ndc; } @@ -214,7 +217,10 @@ Shader::RuntimeInfo MakeRuntimeInfo(std::span program info.fixed_state_point_size = point_size; } if (key.state.xfb_enabled != 0) { - info.xfb_varyings = VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + auto [varyings, count] = + VideoCommon::MakeTransformFeedbackVaryings(key.state.xfb_state); + info.xfb_varyings = varyings; + info.xfb_count = count; } info.convert_depth_mode = gl_ndc; break; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index f025f618b..f3cef09dd 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -330,9 +330,9 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { }; } -[[maybe_unused]] [[nodiscard]] std::vector TransformBufferCopies( - std::span copies, size_t buffer_offset) { - std::vector result(copies.size()); +[[maybe_unused]] [[nodiscard]] boost::container::small_vector +TransformBufferCopies(std::span copies, size_t buffer_offset) { + boost::container::small_vector result(copies.size()); std::ranges::transform( copies, result.begin(), [buffer_offset](const VideoCommon::BufferCopy& copy) { return VkBufferCopy{ @@ -344,7 +344,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { return result; } -[[nodiscard]] std::vector TransformBufferImageCopies( +[[nodiscard]] boost::container::small_vector TransformBufferImageCopies( std::span copies, size_t buffer_offset, VkImageAspectFlags aspect_mask) { struct Maker { VkBufferImageCopy operator()(const BufferImageCopy& copy) const { @@ -377,14 +377,14 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { VkImageAspectFlags aspect_mask; }; if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - std::vector result(copies.size() * 2); + boost::container::small_vector result(copies.size() * 2); std::ranges::transform(copies, result.begin(), Maker{buffer_offset, VK_IMAGE_ASPECT_DEPTH_BIT}); std::ranges::transform(copies, result.begin() + copies.size(), Maker{buffer_offset, VK_IMAGE_ASPECT_STENCIL_BIT}); return result; } else { - std::vector result(copies.size()); + boost::container::small_vector result(copies.size()); std::ranges::transform(copies, result.begin(), Maker{buffer_offset, aspect_mask}); return result; } @@ -867,8 +867,8 @@ void TextureCacheRuntime::BarrierFeedbackLoop() { void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src, std::span copies) { - std::vector vk_in_copies(copies.size()); - std::vector vk_out_copies(copies.size()); + boost::container::small_vector vk_in_copies(copies.size()); + boost::container::small_vector vk_out_copies(copies.size()); const VkImageAspectFlags src_aspect_mask = src.AspectMask(); const VkImageAspectFlags dst_aspect_mask = dst.AspectMask(); @@ -1157,7 +1157,7 @@ void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, Im void TextureCacheRuntime::CopyImage(Image& dst, Image& src, std::span copies) { - std::vector vk_copies(copies.size()); + boost::container::small_vector vk_copies(copies.size()); const VkImageAspectFlags aspect_mask = dst.AspectMask(); ASSERT(aspect_mask == src.AspectMask()); @@ -1332,7 +1332,7 @@ void Image::UploadMemory(VkBuffer buffer, VkDeviceSize offset, ScaleDown(true); } scheduler->RequestOutsideRenderPassOperationContext(); - std::vector vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); + auto vk_copies = TransformBufferImageCopies(copies, offset, aspect_mask); const VkBuffer src_buffer = buffer; const VkImage vk_image = *original_image; const VkImageAspectFlags vk_aspect_mask = aspect_mask; @@ -1367,8 +1367,9 @@ void Image::DownloadMemory(std::span buffers_span, std::span buffers_vector{}; - boost::container::small_vector, 1> vk_copies; + boost::container::small_vector buffers_vector{}; + boost::container::small_vector, 8> + vk_copies; for (size_t index = 0; index < buffers_span.size(); index++) { buffers_vector.emplace_back(buffers_span[index]); vk_copies.emplace_back( @@ -1858,7 +1859,7 @@ Framebuffer::~Framebuffer() = default; void Framebuffer::CreateFramebuffer(TextureCacheRuntime& runtime, std::span color_buffers, ImageView* depth_buffer, bool is_rescaled) { - std::vector attachments; + boost::container::small_vector attachments; RenderPassKey renderpass_key{}; s32 num_layers = 1; diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp index c5213875b..4db948b6d 100644 --- a/src/video_core/shader_cache.cpp +++ b/src/video_core/shader_cache.cpp @@ -151,11 +151,9 @@ void ShaderCache::RemovePendingShaders() { marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()), marked_for_removal.end()); - std::vector removed_shaders; - removed_shaders.reserve(marked_for_removal.size()); + boost::container::small_vector removed_shaders; std::scoped_lock lock{lookup_mutex}; - for (Entry* const entry : marked_for_removal) { removed_shaders.push_back(entry->data); diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index 1b8a17ee8..55d49d017 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "common/common_funcs.h" #include "common/common_types.h" @@ -108,8 +109,8 @@ struct ImageBase { std::vector image_view_infos; std::vector image_view_ids; - std::vector slice_offsets; - std::vector slice_subresources; + boost::container::small_vector slice_offsets; + boost::container::small_vector slice_subresources; std::vector aliased_images; std::vector overlapping_images; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d58bb69ff..d3f03a995 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -526,7 +526,7 @@ void TextureCache

::WriteMemory(VAddr cpu_addr, size_t size) { template void TextureCache

::DownloadMemory(VAddr cpu_addr, size_t size) { - std::vector images; + boost::container::small_vector images; ForEachImageInRegion(cpu_addr, size, [&images](ImageId image_id, ImageBase& image) { if (!image.IsSafeDownload()) { return; @@ -579,7 +579,7 @@ std::optional TextureCache

::GetFlushArea(V template void TextureCache

::UnmapMemory(VAddr cpu_addr, size_t size) { - std::vector deleted_images; + boost::container::small_vector deleted_images; ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { Image& image = slot_images[id]; @@ -593,7 +593,7 @@ void TextureCache

::UnmapMemory(VAddr cpu_addr, size_t size) { template void TextureCache

::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size) { - std::vector deleted_images; + boost::container::small_vector deleted_images; ForEachImageInRegionGPU(as_id, gpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { @@ -1101,7 +1101,7 @@ ImageId TextureCache

::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, const bool native_bgr = runtime.HasNativeBgr(); const bool flexible_formats = True(options & RelaxedOptions::Format); ImageId image_id{}; - boost::container::small_vector image_ids; + boost::container::small_vector image_ids; const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { if (True(existing_image.flags & ImageFlagBits::Remapped)) { return false; @@ -1622,7 +1622,7 @@ ImageId TextureCache

::FindDMAImage(const ImageInfo& info, GPUVAddr gpu_addr) } } ImageId image_id{}; - boost::container::small_vector image_ids; + boost::container::small_vector image_ids; const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { if (True(existing_image.flags & ImageFlagBits::Remapped)) { return false; @@ -1942,7 +1942,7 @@ void TextureCache

::RegisterImage(ImageId image_id) { image.map_view_id = map_id; return; } - std::vector sparse_maps{}; + boost::container::small_vector sparse_maps; ForEachSparseSegment( image, [this, image_id, &sparse_maps](GPUVAddr gpu_addr, VAddr cpu_addr, size_t size) { auto map_id = slot_map_views.insert(gpu_addr, cpu_addr, size, image_id); @@ -2217,7 +2217,7 @@ void TextureCache

::MarkModification(ImageBase& image) noexcept { template void TextureCache

::SynchronizeAliases(ImageId image_id) { - boost::container::small_vector aliased_images; + boost::container::small_vector aliased_images; Image& image = slot_images[image_id]; bool any_rescaled = True(image.flags & ImageFlagBits::Rescaled); bool any_modified = True(image.flags & ImageFlagBits::GpuModified); diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index 44232b961..e9ec91265 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -56,7 +56,7 @@ struct ImageViewInOut { struct AsyncDecodeContext { ImageId image_id; Common::ScratchBuffer decoded_data; - std::vector copies; + boost::container::small_vector copies; std::mutex mutex; std::atomic_bool complete; }; @@ -429,7 +429,7 @@ private: std::unordered_map, Common::IdentityHash> page_table; std::unordered_map, Common::IdentityHash> sparse_page_table; - std::unordered_map> sparse_views; + std::unordered_map> sparse_views; VAddr virtual_invalid_space{}; diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index 95a5b47d8..f781cb7a0 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -329,13 +329,13 @@ template [[nodiscard]] std::optional ResolveOverlapRightAddress3D( const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size) { - const std::vector slice_offsets = CalculateSliceOffsets(new_info); + const auto slice_offsets = CalculateSliceOffsets(new_info); const u32 diff = static_cast(overlap.gpu_addr - gpu_addr); const auto it = std::ranges::find(slice_offsets, diff); if (it == slice_offsets.end()) { return std::nullopt; } - const std::vector subresources = CalculateSliceSubresources(new_info); + const auto subresources = CalculateSliceSubresources(new_info); const SubresourceBase base = subresources[std::distance(slice_offsets.begin(), it)]; const ImageInfo& info = overlap.info; if (!IsBlockLinearSizeCompatible(new_info, info, base.level, 0, strict_size)) { @@ -655,9 +655,9 @@ LevelArray CalculateMipLevelSizes(const ImageInfo& info) noexcept { return sizes; } -std::vector CalculateSliceOffsets(const ImageInfo& info) { +boost::container::small_vector CalculateSliceOffsets(const ImageInfo& info) { ASSERT(info.type == ImageType::e3D); - std::vector offsets; + boost::container::small_vector offsets; offsets.reserve(NumSlices(info)); const LevelInfo level_info = MakeLevelInfo(info); @@ -679,9 +679,10 @@ std::vector CalculateSliceOffsets(const ImageInfo& info) { return offsets; } -std::vector CalculateSliceSubresources(const ImageInfo& info) { +boost::container::small_vector CalculateSliceSubresources( + const ImageInfo& info) { ASSERT(info.type == ImageType::e3D); - std::vector subresources; + boost::container::small_vector subresources; subresources.reserve(NumSlices(info)); for (s32 level = 0; level < info.resources.levels; ++level) { const s32 depth = AdjustMipSize(info.size.depth, level); @@ -723,8 +724,10 @@ ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept { } } -std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src, - SubresourceBase base, u32 up_scale, u32 down_shift) { +boost::container::small_vector MakeShrinkImageCopies(const ImageInfo& dst, + const ImageInfo& src, + SubresourceBase base, + u32 up_scale, u32 down_shift) { ASSERT(dst.resources.levels >= src.resources.levels); const bool is_dst_3d = dst.type == ImageType::e3D; @@ -733,7 +736,7 @@ std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn ASSERT(src.resources.levels == 1); } const bool both_2d{src.type == ImageType::e2D && dst.type == ImageType::e2D}; - std::vector copies; + boost::container::small_vector copies; copies.reserve(src.resources.levels); for (s32 level = 0; level < src.resources.levels; ++level) { ImageCopy& copy = copies.emplace_back(); @@ -770,9 +773,10 @@ std::vector MakeShrinkImageCopies(const ImageInfo& dst, const ImageIn return copies; } -std::vector MakeReinterpretImageCopies(const ImageInfo& src, u32 up_scale, - u32 down_shift) { - std::vector copies; +boost::container::small_vector MakeReinterpretImageCopies(const ImageInfo& src, + u32 up_scale, + u32 down_shift) { + boost::container::small_vector copies; copies.reserve(src.resources.levels); const bool is_3d = src.type == ImageType::e3D; for (s32 level = 0; level < src.resources.levels; ++level) { @@ -824,9 +828,11 @@ bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config return gpu_memory.GpuToCpuAddress(address, guest_size_bytes).has_value(); } -std::vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, - const ImageInfo& info, std::span input, - std::span output) { +boost::container::small_vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, + GPUVAddr gpu_addr, + const ImageInfo& info, + std::span input, + std::span output) { const size_t guest_size_bytes = input.size_bytes(); const u32 bpp_log2 = BytesPerBlockLog2(info.format); const Extent3D size = info.size; @@ -861,7 +867,7 @@ std::vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, GP info.tile_width_spacing); size_t guest_offset = 0; u32 host_offset = 0; - std::vector copies(num_levels); + boost::container::small_vector copies(num_levels); for (s32 level = 0; level < num_levels; ++level) { const Extent3D level_size = AdjustMipSize(size, level); @@ -978,7 +984,7 @@ void ConvertImage(std::span input, const ImageInfo& info, std::span FullDownloadCopies(const ImageInfo& info) { +boost::container::small_vector FullDownloadCopies(const ImageInfo& info) { const Extent3D size = info.size; const u32 bytes_per_block = BytesPerBlock(info.format); if (info.type == ImageType::Linear) { @@ -1006,7 +1012,7 @@ std::vector FullDownloadCopies(const ImageInfo& info) { u32 host_offset = 0; - std::vector copies(num_levels); + boost::container::small_vector copies(num_levels); for (s32 level = 0; level < num_levels; ++level) { const Extent3D level_size = AdjustMipSize(size, level); const u32 num_blocks_per_layer = NumBlocks(level_size, tile_size); @@ -1042,10 +1048,10 @@ Extent3D MipBlockSize(const ImageInfo& info, u32 level) { return AdjustMipBlockSize(num_tiles, level_info.block, level); } -std::vector FullUploadSwizzles(const ImageInfo& info) { +boost::container::small_vector FullUploadSwizzles(const ImageInfo& info) { const Extent2D tile_size = DefaultBlockSize(info.format); if (info.type == ImageType::Linear) { - return std::vector{SwizzleParameters{ + return {SwizzleParameters{ .num_tiles = AdjustTileSize(info.size, tile_size), .block = {}, .buffer_offset = 0, @@ -1057,7 +1063,7 @@ std::vector FullUploadSwizzles(const ImageInfo& info) { const s32 num_levels = info.resources.levels; u32 guest_offset = 0; - std::vector params(num_levels); + boost::container::small_vector params(num_levels); for (s32 level = 0; level < num_levels; ++level) { const Extent3D level_size = AdjustMipSize(size, level); const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h index 84aa6880d..ab45a43c4 100644 --- a/src/video_core/texture_cache/util.h +++ b/src/video_core/texture_cache/util.h @@ -5,6 +5,7 @@ #include #include +#include #include "common/common_types.h" #include "common/scratch_buffer.h" @@ -40,9 +41,10 @@ struct OverlapResult { [[nodiscard]] LevelArray CalculateMipLevelSizes(const ImageInfo& info) noexcept; -[[nodiscard]] std::vector CalculateSliceOffsets(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector CalculateSliceOffsets(const ImageInfo& info); -[[nodiscard]] std::vector CalculateSliceSubresources(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector CalculateSliceSubresources( + const ImageInfo& info); [[nodiscard]] u32 CalculateLevelStrideAlignment(const ImageInfo& info, u32 level); @@ -51,21 +53,18 @@ struct OverlapResult { [[nodiscard]] ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept; -[[nodiscard]] std::vector MakeShrinkImageCopies(const ImageInfo& dst, - const ImageInfo& src, - SubresourceBase base, u32 up_scale = 1, - u32 down_shift = 0); +[[nodiscard]] boost::container::small_vector MakeShrinkImageCopies( + const ImageInfo& dst, const ImageInfo& src, SubresourceBase base, u32 up_scale = 1, + u32 down_shift = 0); -[[nodiscard]] std::vector MakeReinterpretImageCopies(const ImageInfo& src, - u32 up_scale = 1, - u32 down_shift = 0); +[[nodiscard]] boost::container::small_vector MakeReinterpretImageCopies( + const ImageInfo& src, u32 up_scale = 1, u32 down_shift = 0); [[nodiscard]] bool IsValidEntry(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); -[[nodiscard]] std::vector UnswizzleImage(Tegra::MemoryManager& gpu_memory, - GPUVAddr gpu_addr, const ImageInfo& info, - std::span input, - std::span output); +[[nodiscard]] boost::container::small_vector UnswizzleImage( + Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, + std::span input, std::span output); [[nodiscard]] BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageBase& image, std::span output); @@ -73,13 +72,15 @@ struct OverlapResult { void ConvertImage(std::span input, const ImageInfo& info, std::span output, std::span copies); -[[nodiscard]] std::vector FullDownloadCopies(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector FullDownloadCopies( + const ImageInfo& info); [[nodiscard]] Extent3D MipSize(Extent3D size, u32 level); [[nodiscard]] Extent3D MipBlockSize(const ImageInfo& info, u32 level); -[[nodiscard]] std::vector FullUploadSwizzles(const ImageInfo& info); +[[nodiscard]] boost::container::small_vector FullUploadSwizzles( + const ImageInfo& info); void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, std::span copies, std::span memory, diff --git a/src/video_core/transform_feedback.cpp b/src/video_core/transform_feedback.cpp index 155599316..1f353d2df 100644 --- a/src/video_core/transform_feedback.cpp +++ b/src/video_core/transform_feedback.cpp @@ -13,7 +13,7 @@ namespace VideoCommon { -std::vector MakeTransformFeedbackVaryings( +std::pair, u32> MakeTransformFeedbackVaryings( const TransformFeedbackState& state) { static constexpr std::array VECTORS{ 28U, // gl_Position @@ -62,7 +62,8 @@ std::vector MakeTransformFeedbackVaryings( 216U, // gl_TexCoord[6] 220U, // gl_TexCoord[7] }; - std::vector xfb(256); + std::array xfb{}; + u32 count{0}; for (size_t buffer = 0; buffer < state.layouts.size(); ++buffer) { const auto& locations = state.varyings[buffer]; const auto& layout = state.layouts[buffer]; @@ -103,11 +104,12 @@ std::vector MakeTransformFeedbackVaryings( } } xfb[attribute] = varying; + count = std::max(count, attribute); highest = std::max(highest, (base_offset + varying.components) * 4); } UNIMPLEMENTED_IF(highest != layout.stride); } - return xfb; + return {xfb, count + 1}; } } // namespace VideoCommon diff --git a/src/video_core/transform_feedback.h b/src/video_core/transform_feedback.h index d13eb16c3..401b1352a 100644 --- a/src/video_core/transform_feedback.h +++ b/src/video_core/transform_feedback.h @@ -24,7 +24,7 @@ struct TransformFeedbackState { varyings; }; -std::vector MakeTransformFeedbackVaryings( +std::pair, u32> MakeTransformFeedbackVaryings( const TransformFeedbackState& state); } // namespace VideoCommon diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index fa9cde75b..b11abe311 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -316,6 +316,7 @@ NvidiaArchitecture GetNvidiaArchitecture(vk::PhysicalDevice physical, std::vector ExtensionListForVulkan( const std::set>& extensions) { std::vector output; + output.reserve(extensions.size()); for (const auto& extension : extensions) { output.push_back(extension.c_str()); } -- cgit v1.2.3 From eac46ad7ceca5e35b396a8b80bfc38dc6ef1a4fe Mon Sep 17 00:00:00 2001 From: GPUCode Date: Tue, 6 Jun 2023 23:10:06 +0300 Subject: video_core: Add BCn decoding support --- externals/CMakeLists.txt | 3 + externals/bc_decoder/bc_decoder.cpp | 1522 ++++++++++++++++++++ externals/bc_decoder/bc_decoder.h | 43 + src/video_core/CMakeLists.txt | 6 +- src/video_core/renderer_vulkan/maxwell_to_vk.cpp | 20 + src/video_core/renderer_vulkan/vk_rasterizer.cpp | 7 + .../renderer_vulkan/vk_texture_cache.cpp | 4 + src/video_core/surface.cpp | 22 + src/video_core/surface.h | 2 + src/video_core/texture_cache/decode_bc.cpp | 129 ++ src/video_core/texture_cache/decode_bc.h | 19 + src/video_core/texture_cache/decode_bc4.cpp | 96 -- src/video_core/texture_cache/decode_bc4.h | 15 - src/video_core/texture_cache/util.cpp | 24 +- src/video_core/textures/bcn.cpp | 1 - src/video_core/textures/bcn.h | 9 +- src/video_core/vulkan_common/vulkan_device.h | 15 +- 17 files changed, 1803 insertions(+), 134 deletions(-) create mode 100644 externals/bc_decoder/bc_decoder.cpp create mode 100644 externals/bc_decoder/bc_decoder.h create mode 100644 src/video_core/texture_cache/decode_bc.cpp create mode 100644 src/video_core/texture_cache/decode_bc.h delete mode 100644 src/video_core/texture_cache/decode_bc4.cpp delete mode 100644 src/video_core/texture_cache/decode_bc4.h (limited to 'src/video_core/texture_cache') diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 0184289eb..4ff588851 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -157,6 +157,9 @@ endif() add_library(stb stb/stb_dxt.cpp) target_include_directories(stb PUBLIC ./stb) +add_library(bc_decoder bc_decoder/bc_decoder.cpp) +target_include_directories(bc_decoder PUBLIC ./bc_decoder) + if (ANDROID) if (ARCHITECTURE_arm64) add_subdirectory(libadrenotools) diff --git a/externals/bc_decoder/bc_decoder.cpp b/externals/bc_decoder/bc_decoder.cpp new file mode 100644 index 000000000..536c44f34 --- /dev/null +++ b/externals/bc_decoder/bc_decoder.cpp @@ -0,0 +1,1522 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) +// Copyright 2019 The SwiftShader Authors. All Rights Reserved. + +// This BCn Decoder is directly derivative of Swiftshader's BCn Decoder found at: https://github.com/google/swiftshader/blob/d070309f7d154d6764cbd514b1a5c8bfcef61d06/src/Device/BC_Decoder.cpp +// This file does not follow the Skyline code conventions but has certain Skyline specific code +// There are a lot of implicit and narrowing conversions in this file due to this (Warnings are disabled as a result) + +#include +#include +#include +#include + +namespace { + constexpr int BlockWidth = 4; + constexpr int BlockHeight = 4; + + struct BC_color { + void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, bool hasAlphaChannel, bool hasSeparateAlpha) const { + Color c[4]; + c[0].extract565(c0); + c[1].extract565(c1); + if (hasSeparateAlpha || (c0 > c1)) { + c[2] = ((c[0] * 2) + c[1]) / 3; + c[3] = ((c[1] * 2) + c[0]) / 3; + } else { + c[2] = (c[0] + c[1]) >> 1; + if (hasAlphaChannel) { + c[3].clearAlpha(); + } + } + + for (int j = 0; j < BlockHeight && (y + j) < dstH; j++) { + size_t dstOffset = j * dstPitch; + size_t idxOffset = j * BlockHeight; + for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++, idxOffset++, dstOffset += dstBpp) { + *reinterpret_cast(dst + dstOffset) = c[getIdx(idxOffset)].pack8888(); + } + } + } + + private: + struct Color { + Color() { + c[0] = c[1] = c[2] = 0; + c[3] = 0xFF000000; + } + + void extract565(const unsigned int c565) { + c[0] = ((c565 & 0x0000001F) << 3) | ((c565 & 0x0000001C) >> 2); + c[1] = ((c565 & 0x000007E0) >> 3) | ((c565 & 0x00000600) >> 9); + c[2] = ((c565 & 0x0000F800) >> 8) | ((c565 & 0x0000E000) >> 13); + } + + unsigned int pack8888() const { + return ((c[0] & 0xFF) << 16) | ((c[1] & 0xFF) << 8) | (c[2] & 0xFF) | c[3]; + } + + void clearAlpha() { + c[3] = 0; + } + + Color operator*(int factor) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] * factor; + } + return res; + } + + Color operator/(int factor) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] / factor; + } + return res; + } + + Color operator>>(int shift) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] >> shift; + } + return res; + } + + Color operator+(Color const &obj) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] + obj.c[i]; + } + return res; + } + + private: + int c[4]; + }; + + size_t getIdx(int i) const { + size_t offset = i << 1; // 2 bytes per index + return (idx & (0x3 << offset)) >> offset; + } + + unsigned short c0; + unsigned short c1; + unsigned int idx; + }; + static_assert(sizeof(BC_color) == 8, "BC_color must be 8 bytes"); + + struct BC_channel { + void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, size_t channel, bool isSigned) const { + int c[8] = {0}; + + if (isSigned) { + c[0] = static_cast(data & 0xFF); + c[1] = static_cast((data & 0xFF00) >> 8); + } else { + c[0] = static_cast(data & 0xFF); + c[1] = static_cast((data & 0xFF00) >> 8); + } + + if (c[0] > c[1]) { + for (int i = 2; i < 8; ++i) { + c[i] = ((8 - i) * c[0] + (i - 1) * c[1]) / 7; + } + } else { + for (int i = 2; i < 6; ++i) { + c[i] = ((6 - i) * c[0] + (i - 1) * c[1]) / 5; + } + c[6] = isSigned ? -128 : 0; + c[7] = isSigned ? 127 : 255; + } + + for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++) { + for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++) { + dst[channel + (i * dstBpp) + (j * dstPitch)] = static_cast(c[getIdx((j * BlockHeight) + i)]); + } + } + } + + private: + uint8_t getIdx(int i) const { + int offset = i * 3 + 16; + return static_cast((data & (0x7ull << offset)) >> offset); + } + + uint64_t data; + }; + static_assert(sizeof(BC_channel) == 8, "BC_channel must be 8 bytes"); + + struct BC_alpha { + void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp) const { + dst += 3; // Write only to alpha (channel 3) + for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++, dst += dstPitch) { + uint8_t *dstRow = dst; + for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++, dstRow += dstBpp) { + *dstRow = getAlpha(j * BlockHeight + i); + } + } + } + + private: + uint8_t getAlpha(int i) const { + int offset = i << 2; + int alpha = (data & (0xFull << offset)) >> offset; + return static_cast(alpha | (alpha << 4)); + } + + uint64_t data; + }; + static_assert(sizeof(BC_alpha) == 8, "BC_alpha must be 8 bytes"); + + namespace BC6H { + static constexpr int MaxPartitions = 64; + + // @fmt:off + + static constexpr uint8_t PartitionTable2[MaxPartitions][16] = { + { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, + { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 }, + { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1 }, + { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0 }, + { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, + { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, + { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1 }, + { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, + { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 }, + { 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0 }, + { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0 }, + { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 }, + { 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0 }, + { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0 }, + { 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0 }, + { 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1 }, + { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1 }, + { 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, + { 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 }, + { 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0 }, + { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0 }, + { 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 }, + { 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1 }, + { 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 }, + { 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0 }, + { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 }, + { 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, + { 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, + { 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0 }, + { 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, + { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1 }, + { 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0 }, + { 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1 }, + }; + + static constexpr uint8_t AnchorTable2[MaxPartitions] = { + 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, + 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, + 0xf, 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0xf, + 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0x2, 0x2, + 0xf, 0xf, 0x6, 0x8, 0x2, 0x8, 0xf, 0xf, + 0x2, 0x8, 0x2, 0x2, 0x2, 0xf, 0xf, 0x6, + 0x6, 0x2, 0x6, 0x8, 0xf, 0xf, 0x2, 0x2, + 0xf, 0xf, 0xf, 0xf, 0xf, 0x2, 0x2, 0xf, + }; + + // @fmt:on + + // 1.0f in half-precision floating point format + static constexpr uint16_t halfFloat1 = 0x3C00; + union Color { + struct RGBA { + uint16_t r = 0; + uint16_t g = 0; + uint16_t b = 0; + uint16_t a = halfFloat1; + + RGBA(uint16_t r, uint16_t g, uint16_t b) + : r(r), g(g), b(b) { + } + + RGBA &operator=(const RGBA &other) { + this->r = other.r; + this->g = other.g; + this->b = other.b; + this->a = halfFloat1; + + return *this; + } + }; + + Color(uint16_t r, uint16_t g, uint16_t b) + : rgba(r, g, b) { + } + + Color(int r, int g, int b) + : rgba((uint16_t) r, (uint16_t) g, (uint16_t) b) { + } + + Color() {} + + Color(const Color &other) { + this->rgba = other.rgba; + } + + Color &operator=(const Color &other) { + this->rgba = other.rgba; + + return *this; + } + + RGBA rgba; + uint16_t channel[4]; + }; + static_assert(sizeof(Color) == 8, "BC6h::Color must be 8 bytes long"); + + inline int32_t extendSign(int32_t val, size_t size) { + // Suppose we have a 2-bit integer being stored in 4 bit variable: + // x = 0b00AB + // + // In order to sign extend x, we need to turn the 0s into A's: + // x_extend = 0bAAAB + // + // We can do that by flipping A in x then subtracting 0b0010 from x. + // Suppose A is 1: + // x = 0b001B + // x_flip = 0b000B + // x_minus = 0b111B + // Since A is flipped to 0, subtracting the mask sets it and all the bits above it to 1. + // And if A is 0: + // x = 0b000B + // x_flip = 0b001B + // x_minus = 0b000B + // We unset the bit we flipped, and touch no other bit + uint16_t mask = 1u << (size - 1); + return (val ^ mask) - mask; + } + + static int constexpr RGBfChannels = 3; + struct RGBf { + uint16_t channel[RGBfChannels]; + size_t size[RGBfChannels]; + bool isSigned; + + RGBf() { + static_assert(RGBfChannels == 3, "RGBf must have exactly 3 channels"); + static_assert(sizeof(channel) / sizeof(channel[0]) == RGBfChannels, "RGBf must have exactly 3 channels"); + static_assert(sizeof(channel) / sizeof(channel[0]) == sizeof(size) / sizeof(size[0]), "RGBf requires equally sized arrays for channels and channel sizes"); + + for (int i = 0; i < RGBfChannels; i++) { + channel[i] = 0; + size[i] = 0; + } + + isSigned = false; + } + + void extendSign() { + for (int i = 0; i < RGBfChannels; i++) { + channel[i] = BC6H::extendSign(channel[i], size[i]); + } + } + + // Assuming this is the delta, take the base-endpoint and transform this into + // a proper endpoint. + // + // The final computed endpoint is truncated to the base-endpoint's size; + void resolveDelta(RGBf base) { + for (int i = 0; i < RGBfChannels; i++) { + size[i] = base.size[i]; + channel[i] = (base.channel[i] + channel[i]) & ((1 << base.size[i]) - 1); + } + + // Per the spec: + // "For signed formats, the results of the delta calculation must be sign + // extended as well." + if (isSigned) { + extendSign(); + } + } + + void unquantize() { + if (isSigned) { + unquantizeSigned(); + } else { + unquantizeUnsigned(); + } + } + + void unquantizeUnsigned() { + for (int i = 0; i < RGBfChannels; i++) { + if (size[i] >= 15 || channel[i] == 0) { + continue; + } else if (channel[i] == ((1u << size[i]) - 1)) { + channel[i] = 0xFFFFu; + } else { + // Need 32 bits to avoid overflow + uint32_t tmp = channel[i]; + channel[i] = (uint16_t) (((tmp << 16) + 0x8000) >> size[i]); + } + size[i] = 16; + } + } + + void unquantizeSigned() { + for (int i = 0; i < RGBfChannels; i++) { + if (size[i] >= 16 || channel[i] == 0) { + continue; + } + + int16_t value = (int16_t)channel[i]; + int32_t result = value; + bool signBit = value < 0; + if (signBit) { + value = -value; + } + + if (value >= ((1 << (size[i] - 1)) - 1)) { + result = 0x7FFF; + } else { + // Need 32 bits to avoid overflow + int32_t tmp = value; + result = (((tmp << 15) + 0x4000) >> (size[i] - 1)); + } + + if (signBit) { + result = -result; + } + + channel[i] = (uint16_t) result; + size[i] = 16; + } + } + }; + + struct Data { + uint64_t low64; + uint64_t high64; + + Data() = default; + + Data(uint64_t low64, uint64_t high64) + : low64(low64), high64(high64) { + } + + // Consumes the lowest N bits from from low64 and high64 where N is: + // abs(MSB - LSB) + // MSB and LSB come from the block description of the BC6h spec and specify + // the location of the bits in the returned bitstring. + // + // If MSB < LSB, then the bits are reversed. Otherwise, the bitstring is read and + // shifted without further modification. + // + uint32_t consumeBits(uint32_t MSB, uint32_t LSB) { + bool reversed = MSB < LSB; + if (reversed) { + std::swap(MSB, LSB); + } + assert(MSB - LSB + 1 < sizeof(uint32_t) * 8); + + uint32_t numBits = MSB - LSB + 1; + uint32_t mask = (1 << numBits) - 1; + // Read the low N bits + uint32_t bits = (low64 & mask); + + low64 >>= numBits; + // Put the low N bits of high64 into the high 64-N bits of low64 + low64 |= (high64 & mask) << (sizeof(high64) * 8 - numBits); + high64 >>= numBits; + + if (reversed) { + uint32_t tmp = 0; + for (uint32_t numSwaps = 0; numSwaps < numBits; numSwaps++) { + tmp <<= 1; + tmp |= (bits & 1); + bits >>= 1; + } + + bits = tmp; + } + + return bits << LSB; + } + }; + + struct IndexInfo { + uint64_t value; + int numBits; + }; + +// Interpolates between two endpoints, then does a final unquantization step + Color interpolate(RGBf e0, RGBf e1, const IndexInfo &index, bool isSigned) { + static constexpr uint32_t weights3[] = {0, 9, 18, 27, 37, 46, 55, 64}; + static constexpr uint32_t weights4[] = {0, 4, 9, 13, 17, 21, 26, 30, + 34, 38, 43, 47, 51, 55, 60, 64}; + static constexpr uint32_t const *weightsN[] = { + nullptr, nullptr, nullptr, weights3, weights4 + }; + auto weights = weightsN[index.numBits]; + assert(weights != nullptr); + Color color; + uint32_t e0Weight = 64 - weights[index.value]; + uint32_t e1Weight = weights[index.value]; + + for (int i = 0; i < RGBfChannels; i++) { + int32_t e0Channel = e0.channel[i]; + int32_t e1Channel = e1.channel[i]; + + if (isSigned) { + e0Channel = extendSign(e0Channel, 16); + e1Channel = extendSign(e1Channel, 16); + } + + int32_t e0Value = e0Channel * e0Weight; + int32_t e1Value = e1Channel * e1Weight; + + uint32_t tmp = ((e0Value + e1Value + 32) >> 6); + + // Need to unquantize value to limit it to the legal range of half-precision + // floats. We do this by scaling by 31/32 or 31/64 depending on if the value + // is signed or unsigned. + if (isSigned) { + tmp = ((tmp & 0x80000000) != 0) ? (((~tmp + 1) * 31) >> 5) | 0x8000 : (tmp * 31) >> 5; + // Don't return -0.0f, just normalize it to 0.0f. + if (tmp == 0x8000) + tmp = 0; + } else { + tmp = (tmp * 31) >> 6; + } + + color.channel[i] = (uint16_t) tmp; + } + + return color; + } + + enum DataType { + // Endpoints + EP0 = 0, + EP1 = 1, + EP2 = 2, + EP3 = 3, + Mode, + Partition, + End, + }; + + enum Channel { + R = 0, + G = 1, + B = 2, + None, + }; + + struct DeltaBits { + size_t channel[3]; + + constexpr DeltaBits() + : channel{0, 0, 0} { + } + + constexpr DeltaBits(size_t r, size_t g, size_t b) + : channel{r, g, b} { + } + }; + + struct ModeDesc { + int number; + bool hasDelta; + int partitionCount; + int endpointBits; + DeltaBits deltaBits; + + constexpr ModeDesc() + : number(-1), hasDelta(false), partitionCount(0), endpointBits(0) { + } + + constexpr ModeDesc(int number, bool hasDelta, int partitionCount, int endpointBits, DeltaBits deltaBits) + : number(number), hasDelta(hasDelta), partitionCount(partitionCount), endpointBits(endpointBits), deltaBits(deltaBits) { + } + }; + + struct BlockDesc { + DataType type; + Channel channel; + int MSB; + int LSB; + ModeDesc modeDesc; + + constexpr BlockDesc() + : type(End), channel(None), MSB(0), LSB(0), modeDesc() { + } + + constexpr BlockDesc(const DataType type, Channel channel, int MSB, int LSB, ModeDesc modeDesc) + : type(type), channel(channel), MSB(MSB), LSB(LSB), modeDesc(modeDesc) { + } + + constexpr BlockDesc(DataType type, Channel channel, int MSB, int LSB) + : type(type), channel(channel), MSB(MSB), LSB(LSB), modeDesc() { + } + }; + +// Turns a legal mode into an index into the BlockDesc table. +// Illegal or reserved modes return -1. + static int modeToIndex(uint8_t mode) { + if (mode <= 3) { + return mode; + } else if ((mode & 0x2) != 0) { + if (mode <= 18) { +// Turns 6 into 4, 7 into 5, 10 into 6, etc. + return (mode / 2) + 1 + (mode & 0x1); + } else if (mode == 22 || mode == 26 || mode == 30) { +// Turns 22 into 11, 26 into 12, etc. + return mode / 4 + 6; + } + } + + return -1; + } + +// Returns a description of the bitfields for each mode from the LSB +// to the MSB before the index data starts. +// +// The numbers come from the BC6h block description. Each BlockDesc in the +// {Type, Channel, MSB, LSB} +// * Type describes which endpoint this is, or if this is a mode, a partition +// number, or the end of the block description. +// * Channel describes one of the 3 color channels within an endpoint +// * MSB and LSB specificy: +// * The size of the bitfield being read +// * The position of the bitfield within the variable it is being read to +// * If the bitfield is stored in reverse bit order +// If MSB < LSB then the bitfield is stored in reverse order. The size of +// the bitfield is abs(MSB-LSB+1). And the position of the bitfield within +// the variable is min(LSB, MSB). +// +// Invalid or reserved modes return an empty list. + static constexpr int NumBlocks = 14; +// The largest number of descriptions within a block. + static constexpr int MaxBlockDescIndex = 26; + static constexpr BlockDesc blockDescs[NumBlocks][MaxBlockDescIndex] = { +// @fmt:off +// Mode 0, Index 0 +{ +{ Mode, None, 1, 0, { 0, true, 2, 10, { 5, 5, 5 } } }, +{ EP2, G, 4, 4 }, { EP2, B, 4, 4 }, { EP3, B, 4, 4 }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 }, +{ EP1, B, 4, 0 }, { EP3, B, 1, 1 }, { EP2, B, 3, 0 }, +{ EP2, R, 4, 0 }, { EP3, B, 2, 2 }, { EP3, R, 4, 0 }, +{ EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 1, Index 1 +{ +{ Mode, None, 1, 0, { 1, true, 2, 7, { 6, 6, 6 } } }, +{ EP2, G, 5, 5 }, { EP3, G, 5, 4 }, { EP0, R, 6, 0 }, +{ EP3, B, 1, 0 }, { EP2, B, 4, 4 }, { EP0, G, 6, 0 }, +{ EP2, B, 5, 5 }, { EP3, B, 2, 2 }, { EP2, G, 4, 4 }, +{ EP0, B, 6, 0 }, { EP3, B, 3, 3 }, { EP3, B, 5, 5 }, +{ EP3, B, 4, 4 }, { EP1, R, 5, 0 }, { EP2, G, 3, 0 }, +{ EP1, G, 5, 0 }, { EP3, G, 3, 0 }, { EP1, B, 5, 0 }, +{ EP2, B, 3, 0 }, { EP2, R, 5, 0 }, { EP3, R, 5, 0 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 2, Index 2 +{ +{ Mode, None, 4, 0, { 2, true, 2, 11, { 5, 4, 4 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 4, 0 }, { EP0, R, 10, 10 }, { EP2, G, 3, 0 }, +{ EP1, G, 3, 0 }, { EP0, G, 10, 10 }, { EP3, B, 0, 0 }, +{ EP3, G, 3, 0 }, { EP1, B, 3, 0 }, { EP0, B, 10, 10 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 }, +{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 3, Index 3 +{ +{ Mode, None, 4, 0, { 3, false, 1, 10, { 0, 0, 0 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 9, 0 }, { EP1, G, 9, 0 }, { EP1, B, 9, 0 }, +{ End, None, 0, 0}, +}, +// Mode 6, Index 4 +{ +{ Mode, None, 4, 0, { 6, true, 2, 11, { 4, 5, 4 } } }, // 1 1 +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 3, 0 }, { EP0, R, 10, 10 }, { EP3, G, 4, 4 }, +{ EP2, G, 3, 0 }, { EP1, G, 4, 0 }, { EP0, G, 10, 10 }, +{ EP3, G, 3, 0 }, { EP1, B, 3, 0 }, { EP0, B, 10, 10 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 3, 0 }, +{ EP3, B, 0, 0 }, { EP3, B, 2, 2 }, { EP3, R, 3, 0 }, // 18 19 +{ EP2, G, 4, 4 }, { EP3, B, 3, 3 }, // 2 21 +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 7, Index 5 +{ +{ Mode, None, 4, 0, { 7, true, 1, 11, { 9, 9, 9 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 8, 0 }, { EP0, R, 10, 10 }, { EP1, G, 8, 0 }, +{ EP0, G, 10, 10 }, { EP1, B, 8, 0 }, { EP0, B, 10, 10 }, +{ End, None, 0, 0}, +}, +// Mode 10, Index 6 +{ +{ Mode, None, 4, 0, { 10, true, 2, 11, { 4, 4, 5 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 3, 0 }, { EP0, R, 10, 10 }, { EP2, B, 4, 4 }, +{ EP2, G, 3, 0 }, { EP1, G, 3, 0 }, { EP0, G, 10, 10 }, +{ EP3, B, 0, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 }, +{ EP0, B, 10, 10 }, { EP2, B, 3, 0 }, { EP2, R, 3, 0 }, +{ EP3, B, 1, 1 }, { EP3, B, 2, 2 }, { EP3, R, 3, 0 }, +{ EP3, B, 4, 4 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 11, Index 7 +{ +{ Mode, None, 4, 0, { 11, true, 1, 12, { 8, 8, 8 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 7, 0 }, { EP0, R, 10, 11 }, { EP1, G, 7, 0 }, +{ EP0, G, 10, 11 }, { EP1, B, 7, 0 }, { EP0, B, 10, 11 }, +{ End, None, 0, 0}, +}, +// Mode 14, Index 8 +{ +{ Mode, None, 4, 0, { 14, true, 2, 9, { 5, 5, 5 } } }, +{ EP0, R, 8, 0 }, { EP2, B, 4, 4 }, { EP0, G, 8, 0 }, +{ EP2, G, 4, 4 }, { EP0, B, 8, 0 }, { EP3, B, 4, 4 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 }, +{ EP1, B, 4, 0 }, { EP3, B, 1, 1 }, { EP2, B, 3, 0 }, +{ EP2, R, 4, 0 }, { EP3, B, 2, 2 }, { EP3, R, 4, 0 }, +{ EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 15, Index 9 +{ +{ Mode, None, 4, 0, { 15, true, 1, 16, { 4, 4, 4 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 3, 0 }, { EP0, R, 10, 15 }, { EP1, G, 3, 0 }, +{ EP0, G, 10, 15 }, { EP1, B, 3, 0 }, { EP0, B, 10, 15 }, +{ End, None, 0, 0}, +}, +// Mode 18, Index 10 +{ +{ Mode, None, 4, 0, { 18, true, 2, 8, { 6, 5, 5 } } }, +{ EP0, R, 7, 0 }, { EP3, G, 4, 4 }, { EP2, B, 4, 4 }, +{ EP0, G, 7, 0 }, { EP3, B, 2, 2 }, { EP2, G, 4, 4 }, +{ EP0, B, 7, 0 }, { EP3, B, 3, 3 }, { EP3, B, 4, 4 }, +{ EP1, R, 5, 0 }, { EP2, G, 3, 0 }, { EP1, G, 4, 0 }, +{ EP3, B, 0, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 5, 0 }, +{ EP3, R, 5, 0 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 22, Index 11 +{ +{ Mode, None, 4, 0, { 22, true, 2, 8, { 5, 6, 5 } } }, +{ EP0, R, 7, 0 }, { EP3, B, 0, 0 }, { EP2, B, 4, 4 }, +{ EP0, G, 7, 0 }, { EP2, G, 5, 5 }, { EP2, G, 4, 4 }, +{ EP0, B, 7, 0 }, { EP3, G, 5, 5 }, { EP3, B, 4, 4 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 5, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 }, +{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 26, Index 12 +{ +{ Mode, None, 4, 0, { 26, true, 2, 8, { 5, 5, 6 } } }, +{ EP0, R, 7, 0 }, { EP3, B, 1, 1 }, { EP2, B, 4, 4 }, +{ EP0, G, 7, 0 }, { EP2, B, 5, 5 }, { EP2, G, 4, 4 }, +{ EP0, B, 7, 0 }, { EP3, B, 5, 5 }, { EP3, B, 4, 4 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 }, +{ EP1, B, 5, 0 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 }, +{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 30, Index 13 +{ +{ Mode, None, 4, 0, { 30, false, 2, 6, { 0, 0, 0 } } }, +{ EP0, R, 5, 0 }, { EP3, G, 4, 4 }, { EP3, B, 0, 0 }, +{ EP3, B, 1, 1 }, { EP2, B, 4, 4 }, { EP0, G, 5, 0 }, +{ EP2, G, 5, 5 }, { EP2, B, 5, 5 }, { EP3, B, 2, 2 }, +{ EP2, G, 4, 4 }, { EP0, B, 5, 0 }, { EP3, G, 5, 5 }, +{ EP3, B, 3, 3 }, { EP3, B, 5, 5 }, { EP3, B, 4, 4 }, +{ EP1, R, 5, 0 }, { EP2, G, 3, 0 }, { EP1, G, 5, 0 }, +{ EP3, G, 3, 0 }, { EP1, B, 5, 0 }, { EP2, B, 3, 0 }, +{ EP2, R, 5, 0 }, { EP3, R, 5, 0 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +} +// @fmt:on + }; + + struct Block { + uint64_t low64; + uint64_t high64; + + void decode(uint8_t *dst, size_t dstX, size_t dstY, size_t dstWidth, size_t dstHeight, size_t dstPitch, size_t dstBpp, bool isSigned) const { + uint8_t mode = 0; + Data data(low64, high64); + assert(dstBpp == sizeof(Color)); + + if ((data.low64 & 0x2) == 0) { + mode = data.consumeBits(1, 0); + } else { + mode = data.consumeBits(4, 0); + } + + int blockIndex = modeToIndex(mode); + // Handle illegal or reserved mode + if (blockIndex == -1) { + for (int y = 0; y < 4 && y + dstY < dstHeight; y++) { + for (int x = 0; x < 4 && x + dstX < dstWidth; x++) { + auto out = reinterpret_cast(dst + sizeof(Color) * x + dstPitch * y); + out->rgba = {0, 0, 0}; + } + } + return; + } + const BlockDesc *blockDesc = blockDescs[blockIndex]; + + RGBf e[4]; + e[0].isSigned = e[1].isSigned = e[2].isSigned = e[3].isSigned = isSigned; + + int partition = 0; + ModeDesc modeDesc; + for (int index = 0; blockDesc[index].type != End; index++) { + const BlockDesc desc = blockDesc[index]; + + switch (desc.type) { + case Mode: + modeDesc = desc.modeDesc; + assert(modeDesc.number == mode); + + e[0].size[0] = e[0].size[1] = e[0].size[2] = modeDesc.endpointBits; + for (int i = 0; i < RGBfChannels; i++) { + if (modeDesc.hasDelta) { + e[1].size[i] = e[2].size[i] = e[3].size[i] = modeDesc.deltaBits.channel[i]; + } else { + e[1].size[i] = e[2].size[i] = e[3].size[i] = modeDesc.endpointBits; + } + } + break; + case Partition: + partition |= data.consumeBits(desc.MSB, desc.LSB); + break; + case EP0: + case EP1: + case EP2: + case EP3: + e[desc.type].channel[desc.channel] |= data.consumeBits(desc.MSB, desc.LSB); + break; + default: + assert(false); + return; + } + } + + // Sign extension + if (isSigned) { + for (int ep = 0; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].extendSign(); + } + } else if (modeDesc.hasDelta) { + // Don't sign-extend the base endpoint in an unsigned format. + for (int ep = 1; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].extendSign(); + } + } + + // Turn the deltas into endpoints + if (modeDesc.hasDelta) { + for (int ep = 1; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].resolveDelta(e[0]); + } + } + + for (int ep = 0; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].unquantize(); + } + + // Get the indices, calculate final colors, and output + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + int pixelNum = x + y * 4; + IndexInfo idx; + bool isAnchor = false; + int firstEndpoint = 0; + // Bc6H can have either 1 or 2 petitions depending on the mode. + // The number of petitions affects the number of indices with implicit + // leading 0 bits and the number of bits per index. + if (modeDesc.partitionCount == 1) { + idx.numBits = 4; + // There's an implicit leading 0 bit for the first idx + isAnchor = (pixelNum == 0); + } else { + idx.numBits = 3; + // There are 2 indices with implicit leading 0-bits. + isAnchor = ((pixelNum == 0) || (pixelNum == AnchorTable2[partition])); + firstEndpoint = PartitionTable2[partition][pixelNum] * 2; + } + + idx.value = data.consumeBits(idx.numBits - isAnchor - 1, 0); + + // Don't exit the loop early, we need to consume these index bits regardless if + // we actually output them or not. + if ((y + dstY >= dstHeight) || (x + dstX >= dstWidth)) { + continue; + } + + Color color = interpolate(e[firstEndpoint], e[firstEndpoint + 1], idx, isSigned); + auto out = reinterpret_cast(dst + dstBpp * x + dstPitch * y); + *out = color; + } + } + } + }; + + } // namespace BC6H + + namespace BC7 { +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_bptc.txt +// https://docs.microsoft.com/en-us/windows/win32/direct3d11/bc7-format + + struct Bitfield { + int offset; + int count; + + constexpr Bitfield Then(const int bits) { return {offset + count, bits}; } + + constexpr bool operator==(const Bitfield &rhs) { + return offset == rhs.offset && count == rhs.count; + } + }; + + struct Mode { + const int IDX; // Mode index + const int NS; // Number of subsets in each partition + const int PB; // Partition bits + const int RB; // Rotation bits + const int ISB; // Index selection bits + const int CB; // Color bits + const int AB; // Alpha bits + const int EPB; // Endpoint P-bits + const int SPB; // Shared P-bits + const int IB; // Primary index bits per element + const int IBC; // Primary index bits total + const int IB2; // Secondary index bits per element + + constexpr int NumColors() const { return NS * 2; } + + constexpr Bitfield Partition() const { return {IDX + 1, PB}; } + + constexpr Bitfield Rotation() const { return Partition().Then(RB); } + + constexpr Bitfield IndexSelection() const { return Rotation().Then(ISB); } + + constexpr Bitfield Red(int idx) const { + return IndexSelection().Then(CB * idx).Then(CB); + } + + constexpr Bitfield Green(int idx) const { + return Red(NumColors() - 1).Then(CB * idx).Then(CB); + } + + constexpr Bitfield Blue(int idx) const { + return Green(NumColors() - 1).Then(CB * idx).Then(CB); + } + + constexpr Bitfield Alpha(int idx) const { + return Blue(NumColors() - 1).Then(AB * idx).Then(AB); + } + + constexpr Bitfield EndpointPBit(int idx) const { + return Alpha(NumColors() - 1).Then(EPB * idx).Then(EPB); + } + + constexpr Bitfield SharedPBit0() const { + return EndpointPBit(NumColors() - 1).Then(SPB); + } + + constexpr Bitfield SharedPBit1() const { + return SharedPBit0().Then(SPB); + } + + constexpr Bitfield PrimaryIndex(int offset, int count) const { + return SharedPBit1().Then(offset).Then(count); + } + + constexpr Bitfield SecondaryIndex(int offset, int count) const { + return SharedPBit1().Then(IBC + offset).Then(count); + } + }; + + static constexpr Mode Modes[] = { + // IDX NS PB RB ISB CB AB EPB SPB IB IBC, IB2 + /**/ {0x0, 0x3, 0x4, 0x0, 0x0, 0x4, 0x0, 0x1, 0x0, 0x3, 0x2d, 0x0}, +/**/ {0x1, 0x2, 0x6, 0x0, 0x0, 0x6, 0x0, 0x0, 0x1, 0x3, 0x2e, 0x0}, +/**/ {0x2, 0x3, 0x6, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x2, 0x1d, 0x0}, +/**/ {0x3, 0x2, 0x6, 0x0, 0x0, 0x7, 0x0, 0x1, 0x0, 0x2, 0x1e, 0x0}, +/**/ {0x4, 0x1, 0x0, 0x2, 0x1, 0x5, 0x6, 0x0, 0x0, 0x2, 0x1f, 0x3}, +/**/ {0x5, 0x1, 0x0, 0x2, 0x0, 0x7, 0x8, 0x0, 0x0, 0x2, 0x1f, 0x2}, +/**/ {0x6, 0x1, 0x0, 0x0, 0x0, 0x7, 0x7, 0x1, 0x0, 0x4, 0x3f, 0x0}, +/**/ {0x7, 0x2, 0x6, 0x0, 0x0, 0x5, 0x5, 0x1, 0x0, 0x2, 0x1e, 0x0}, +/**/ {-1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x00, 0x0}, + }; + + static constexpr int MaxPartitions = 64; + static constexpr int MaxSubsets = 3; + + static constexpr uint8_t PartitionTable2[MaxPartitions][16] = { + {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}, + {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1}, + {0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1}, + {0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1}, + {0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1}, + {0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0}, + {0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0}, + {0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0}, + {0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1}, + {0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0}, + {0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0}, + {0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0}, + {0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0}, + {0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0}, + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}, + {0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0}, + {0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0}, + {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}, + {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}, + {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0}, + {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0}, + {0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0}, + {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0}, + {0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1}, + {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1}, + {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0}, + {0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0}, + {0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0}, + {0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0}, + {0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0}, + {0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1}, + {0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1}, + {0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0}, + {0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}, + {0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0}, + {0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0}, + {0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1}, + {0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0}, + {0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0}, + {0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1}, + {0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1}, + {0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1}, + {0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1}, + {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}, + {0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0}, + {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1}, + }; + + static constexpr uint8_t PartitionTable3[MaxPartitions][16] = { + {0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2}, + {0, 0, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1}, + {0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1}, + {0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2}, + {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2}, + {0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}, + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2}, + {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2}, + {0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2}, + {0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2}, + {0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2}, + {0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0}, + {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2}, + {0, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0}, + {0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2}, + {0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1}, + {0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2}, + {0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1}, + {0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2}, + {0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 1, 0}, + {0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0}, + {0, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2}, + {0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 0}, + {0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1}, + {0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2}, + {0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 2}, + {0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1}, + {0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 2, 2, 1}, + {0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2}, + {0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2}, + {0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0}, + {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0}, + {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, + {0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0}, + {0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1}, + {0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1}, + {0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1}, + {0, 0, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2}, + {0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1}, + {0, 2, 2, 0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1}, + {0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1}, + {0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1}, + {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2}, + {0, 2, 2, 2, 0, 1, 1, 1, 0, 2, 2, 2, 0, 1, 1, 1}, + {0, 0, 0, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2}, + {0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2}, + {0, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2}, + {0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2}, + {0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2}, + {0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2}, + {0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2}, + {0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1}, + {0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2}, + {0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0}, + }; + + static constexpr uint8_t AnchorTable2[MaxPartitions] = { +// @fmt:off +0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, +0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, +0xf, 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0xf, +0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0x2, 0x2, +0xf, 0xf, 0x6, 0x8, 0x2, 0x8, 0xf, 0xf, +0x2, 0x8, 0x2, 0x2, 0x2, 0xf, 0xf, 0x6, +0x6, 0x2, 0x6, 0x8, 0xf, 0xf, 0x2, 0x2, +0xf, 0xf, 0xf, 0xf, 0xf, 0x2, 0x2, 0xf, +// @fmt:on + }; + + static constexpr uint8_t AnchorTable3a[MaxPartitions] = { +// @fmt:off +0x3, 0x3, 0xf, 0xf, 0x8, 0x3, 0xf, 0xf, +0x8, 0x8, 0x6, 0x6, 0x6, 0x5, 0x3, 0x3, +0x3, 0x3, 0x8, 0xf, 0x3, 0x3, 0x6, 0xa, +0x5, 0x8, 0x8, 0x6, 0x8, 0x5, 0xf, 0xf, +0x8, 0xf, 0x3, 0x5, 0x6, 0xa, 0x8, 0xf, +0xf, 0x3, 0xf, 0x5, 0xf, 0xf, 0xf, 0xf, +0x3, 0xf, 0x5, 0x5, 0x5, 0x8, 0x5, 0xa, +0x5, 0xa, 0x8, 0xd, 0xf, 0xc, 0x3, 0x3, +// @fmt:on + }; + + static constexpr uint8_t AnchorTable3b[MaxPartitions] = { +// @fmt:off +0xf, 0x8, 0x8, 0x3, 0xf, 0xf, 0x3, 0x8, +0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x8, +0xf, 0x8, 0xf, 0x3, 0xf, 0x8, 0xf, 0x8, +0x3, 0xf, 0x6, 0xa, 0xf, 0xf, 0xa, 0x8, +0xf, 0x3, 0xf, 0xa, 0xa, 0x8, 0x9, 0xa, +0x6, 0xf, 0x8, 0xf, 0x3, 0x6, 0x6, 0x8, +0xf, 0x3, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, +0xf, 0xf, 0xf, 0xf, 0x3, 0xf, 0xf, 0x8, +// @fmt:on + }; + + struct Color { + struct RGB { + RGB() = default; + + RGB(uint8_t r, uint8_t g, uint8_t b) + : b(b), g(g), r(r) {} + + RGB(int r, int g, int b) + : b(static_cast(b)), g(static_cast(g)), r(static_cast(r)) {} + + RGB operator<<(int shift) const { return {r << shift, g << shift, b << shift}; } + + RGB operator>>(int shift) const { return {r >> shift, g >> shift, b >> shift}; } + + RGB operator|(int bits) const { return {r | bits, g | bits, b | bits}; } + + RGB operator|(const RGB &rhs) const { return {r | rhs.r, g | rhs.g, b | rhs.b}; } + + RGB operator+(const RGB &rhs) const { return {r + rhs.r, g + rhs.g, b + rhs.b}; } + + uint8_t b; + uint8_t g; + uint8_t r; + }; + + RGB rgb; + uint8_t a; + }; + + static_assert(sizeof(Color) == 4, "Color size must be 4 bytes"); + + struct Block { + constexpr uint64_t Get(const Bitfield &bf) const { + uint64_t mask = (1ULL << bf.count) - 1; + if (bf.offset + bf.count <= 64) { + return (low >> bf.offset) & mask; + } + if (bf.offset >= 64) { + return (high >> (bf.offset - 64)) & mask; + } + return ((low >> bf.offset) | (high << (64 - bf.offset))) & mask; + } + + const Mode &mode() const { + if ((low & 0b00000001) != 0) { + return Modes[0]; + } + if ((low & 0b00000010) != 0) { + return Modes[1]; + } + if ((low & 0b00000100) != 0) { + return Modes[2]; + } + if ((low & 0b00001000) != 0) { + return Modes[3]; + } + if ((low & 0b00010000) != 0) { + return Modes[4]; + } + if ((low & 0b00100000) != 0) { + return Modes[5]; + } + if ((low & 0b01000000) != 0) { + return Modes[6]; + } + if ((low & 0b10000000) != 0) { + return Modes[7]; + } + return Modes[8]; // Invalid mode + } + + struct IndexInfo { + uint64_t value; + int numBits; + }; + + uint8_t interpolate(uint8_t e0, uint8_t e1, const IndexInfo &index) const { + static constexpr uint16_t weights2[] = {0, 21, 43, 64}; + static constexpr uint16_t weights3[] = {0, 9, 18, 27, 37, 46, 55, 64}; + static constexpr uint16_t weights4[] = {0, 4, 9, 13, 17, 21, 26, 30, + 34, 38, 43, 47, 51, 55, 60, 64}; + static constexpr uint16_t const *weightsN[] = { + nullptr, nullptr, weights2, weights3, weights4 + }; + auto weights = weightsN[index.numBits]; + assert(weights != nullptr); + return (uint8_t) (((64 - weights[index.value]) * uint16_t(e0) + weights[index.value] * uint16_t(e1) + 32) >> 6); + } + + void decode(uint8_t *dst, size_t dstX, size_t dstY, size_t dstWidth, size_t dstHeight, size_t dstPitch) const { + auto const &mode = this->mode(); + + if (mode.IDX < 0) // Invalid mode: + { + for (size_t y = 0; y < 4 && y + dstY < dstHeight; y++) { + for (size_t x = 0; x < 4 && x + dstX < dstWidth; x++) { + auto out = reinterpret_cast(dst + sizeof(Color) * x + dstPitch * y); + out->rgb = {0, 0, 0}; + out->a = 0; + } + } + return; + } + + using Endpoint = std::array; + std::array subsets; + + for (size_t i = 0; i < mode.NS; i++) { + auto &subset = subsets[i]; + subset[0].rgb.r = Get(mode.Red(i * 2 + 0)); + subset[0].rgb.g = Get(mode.Green(i * 2 + 0)); + subset[0].rgb.b = Get(mode.Blue(i * 2 + 0)); + subset[0].a = (mode.AB > 0) ? Get(mode.Alpha(i * 2 + 0)) : 255; + + subset[1].rgb.r = Get(mode.Red(i * 2 + 1)); + subset[1].rgb.g = Get(mode.Green(i * 2 + 1)); + subset[1].rgb.b = Get(mode.Blue(i * 2 + 1)); + subset[1].a = (mode.AB > 0) ? Get(mode.Alpha(i * 2 + 1)) : 255; + } + + if (mode.SPB > 0) { + auto pbit0 = Get(mode.SharedPBit0()); + auto pbit1 = Get(mode.SharedPBit1()); + subsets[0][0].rgb = (subsets[0][0].rgb << 1) | pbit0; + subsets[0][1].rgb = (subsets[0][1].rgb << 1) | pbit0; + subsets[1][0].rgb = (subsets[1][0].rgb << 1) | pbit1; + subsets[1][1].rgb = (subsets[1][1].rgb << 1) | pbit1; + } + + if (mode.EPB > 0) { + for (size_t i = 0; i < mode.NS; i++) { + auto &subset = subsets[i]; + auto pbit0 = Get(mode.EndpointPBit(i * 2 + 0)); + auto pbit1 = Get(mode.EndpointPBit(i * 2 + 1)); + subset[0].rgb = (subset[0].rgb << 1) | pbit0; + subset[1].rgb = (subset[1].rgb << 1) | pbit1; + if (mode.AB > 0) { + subset[0].a = (subset[0].a << 1) | pbit0; + subset[1].a = (subset[1].a << 1) | pbit1; + } + } + } + + auto const colorBits = mode.CB + mode.SPB + mode.EPB; + auto const alphaBits = mode.AB + mode.SPB + mode.EPB; + + for (size_t i = 0; i < mode.NS; i++) { + auto &subset = subsets[i]; + subset[0].rgb = subset[0].rgb << (8 - colorBits); + subset[1].rgb = subset[1].rgb << (8 - colorBits); + subset[0].rgb = subset[0].rgb | (subset[0].rgb >> colorBits); + subset[1].rgb = subset[1].rgb | (subset[1].rgb >> colorBits); + + if (mode.AB > 0) { + subset[0].a = subset[0].a << (8 - alphaBits); + subset[1].a = subset[1].a << (8 - alphaBits); + subset[0].a = subset[0].a | (subset[0].a >> alphaBits); + subset[1].a = subset[1].a | (subset[1].a >> alphaBits); + } + } + + int colorIndexBitOffset = 0; + int alphaIndexBitOffset = 0; + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + auto texelIdx = y * 4 + x; + auto partitionIdx = Get(mode.Partition()); + assert(partitionIdx < MaxPartitions); + auto subsetIdx = subsetIndex(mode, partitionIdx, texelIdx); + assert(subsetIdx < MaxSubsets); + auto const &subset = subsets[subsetIdx]; + + auto anchorIdx = anchorIndex(mode, partitionIdx, subsetIdx); + auto isAnchor = anchorIdx == texelIdx; + auto colorIdx = colorIndex(mode, isAnchor, colorIndexBitOffset); + auto alphaIdx = alphaIndex(mode, isAnchor, alphaIndexBitOffset); + + if (y + dstY >= dstHeight || x + dstX >= dstWidth) { + // Don't be tempted to skip early at the loops: + // The calls to colorIndex() and alphaIndex() adjust bit + // offsets that need to be carefully tracked. + continue; + } + + Color output; + // Note: We flip r and b channels past this point as the texture storage is BGR while the output is RGB + output.rgb.r = interpolate(subset[0].rgb.b, subset[1].rgb.b, colorIdx); + output.rgb.g = interpolate(subset[0].rgb.g, subset[1].rgb.g, colorIdx); + output.rgb.b = interpolate(subset[0].rgb.r, subset[1].rgb.r, colorIdx); + output.a = interpolate(subset[0].a, subset[1].a, alphaIdx); + + switch (Get(mode.Rotation())) { + default: + break; + case 1: + std::swap(output.a, output.rgb.b); + break; + case 2: + std::swap(output.a, output.rgb.g); + break; + case 3: + std::swap(output.a, output.rgb.r); + break; + } + + auto out = reinterpret_cast(dst + sizeof(Color) * x + dstPitch * y); + *out = output; + } + } + } + + int subsetIndex(const Mode &mode, int partitionIdx, int texelIndex) const { + switch (mode.NS) { + default: + return 0; + case 2: + return PartitionTable2[partitionIdx][texelIndex]; + case 3: + return PartitionTable3[partitionIdx][texelIndex]; + } + } + + int anchorIndex(const Mode &mode, int partitionIdx, int subsetIdx) const { + // ARB_texture_compression_bptc states: + // "In partition zero, the anchor index is always index zero. + // In other partitions, the anchor index is specified by tables + // Table.A2 and Table.A3."" + // Note: This is really confusing - I believe they meant subset instead + // of partition here. + switch (subsetIdx) { + default: + return 0; + case 1: + return mode.NS == 2 ? AnchorTable2[partitionIdx] : AnchorTable3a[partitionIdx]; + case 2: + return AnchorTable3b[partitionIdx]; + } + } + + IndexInfo colorIndex(const Mode &mode, bool isAnchor, + int &indexBitOffset) const { + // ARB_texture_compression_bptc states: + // "The index value for interpolating color comes from the secondary + // index for the texel if the format has an index selection bit and its + // value is one and from the primary index otherwise."" + auto idx = Get(mode.IndexSelection()); + assert(idx <= 1); + bool secondary = idx == 1; + auto numBits = secondary ? mode.IB2 : mode.IB; + auto numReadBits = numBits - (isAnchor ? 1 : 0); + auto index = + Get(secondary ? mode.SecondaryIndex(indexBitOffset, numReadBits) + : mode.PrimaryIndex(indexBitOffset, numReadBits)); + indexBitOffset += numReadBits; + return {index, numBits}; + } + + IndexInfo alphaIndex(const Mode &mode, bool isAnchor, + int &indexBitOffset) const { + // ARB_texture_compression_bptc states: + // "The alpha index comes from the secondary index if the block has a + // secondary index and the block either doesn't have an index selection + // bit or that bit is zero and the primary index otherwise." + auto idx = Get(mode.IndexSelection()); + assert(idx <= 1); + bool secondary = (mode.IB2 != 0) && (idx == 0); + auto numBits = secondary ? mode.IB2 : mode.IB; + auto numReadBits = numBits - (isAnchor ? 1 : 0); + auto index = + Get(secondary ? mode.SecondaryIndex(indexBitOffset, numReadBits) + : mode.PrimaryIndex(indexBitOffset, numReadBits)); + indexBitOffset += numReadBits; + return {index, numBits}; + } + + // Assumes little-endian + uint64_t low; + uint64_t high; + }; + + } // namespace BC7 +} // anonymous namespace + +namespace bcn { + constexpr size_t R8Bpp{1}; //!< The amount of bytes per pixel in R8 + constexpr size_t R8g8Bpp{2}; //!< The amount of bytes per pixel in R8G8 + constexpr size_t R8g8b8a8Bpp{4}; //!< The amount of bytes per pixel in R8G8B8A8 + constexpr size_t R16g16b16a16Bpp{8}; //!< The amount of bytes per pixel in R16G16B16 + + void DecodeBc1(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *color{reinterpret_cast(src)}; + size_t pitch{R8g8b8a8Bpp * width}; + color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, true, false); + } + + void DecodeBc2(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *alpha{reinterpret_cast(src)}; + const auto *color{reinterpret_cast(src + 8)}; + size_t pitch{R8g8b8a8Bpp * width}; + color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, false, true); + alpha->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp); + } + + void DecodeBc3(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *alpha{reinterpret_cast(src)}; + const auto *color{reinterpret_cast(src + 8)}; + size_t pitch{R8g8b8a8Bpp * width}; + color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, false, true); + alpha->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, 3, false); + } + + void DecodeBc4(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) { + const auto *red{reinterpret_cast(src)}; + size_t pitch{R8Bpp * width}; + red->decode(dst, x, y, width, height, pitch, R8Bpp, 0, isSigned); + } + + void DecodeBc5(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) { + const auto *red{reinterpret_cast(src)}; + const auto *green{reinterpret_cast(src + 8)}; + size_t pitch{R8g8Bpp * width}; + red->decode(dst, x, y, width, height, pitch, R8g8Bpp, 0, isSigned); + green->decode(dst, x, y, width, height, pitch, R8g8Bpp, 1, isSigned); + } + + void DecodeBc6(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) { + const auto *block{reinterpret_cast(src)}; + size_t pitch{R16g16b16a16Bpp * width}; + block->decode(dst, x, y, width, height, pitch, R16g16b16a16Bpp, isSigned); + } + + void DecodeBc7(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *block{reinterpret_cast(src)}; + size_t pitch{R8g8b8a8Bpp * width}; + block->decode(dst, x, y, width, height, pitch); + } +} diff --git a/externals/bc_decoder/bc_decoder.h b/externals/bc_decoder/bc_decoder.h new file mode 100644 index 000000000..4f0ead7d3 --- /dev/null +++ b/externals/bc_decoder/bc_decoder.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) + +#pragma once + +#include + +namespace bcn { + /** + * @brief Decodes a BC1 encoded image to R8G8B8A8 + */ + void DecodeBc1(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); + + /** + * @brief Decodes a BC2 encoded image to R8G8B8A8 + */ + void DecodeBc2(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); + + /** + * @brief Decodes a BC3 encoded image to R8G8B8A8 + */ + void DecodeBc3(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); + + /** + * @brief Decodes a BC4 encoded image to R8 + */ + void DecodeBc4(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned); + + /** + * @brief Decodes a BC5 encoded image to R8G8 + */ + void DecodeBc5(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned); + + /** + * @brief Decodes a BC6 encoded image to R16G16B16A16 + */ + void DecodeBc6(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned); + + /** + * @brief Decodes a BC7 encoded image to R8G8B8A8 + */ + void DecodeBc7(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); +} diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index e9e6f278d..3b2fe01da 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -220,8 +220,8 @@ add_library(video_core STATIC surface.h texture_cache/accelerated_swizzle.cpp texture_cache/accelerated_swizzle.h - texture_cache/decode_bc4.cpp - texture_cache/decode_bc4.h + texture_cache/decode_bc.cpp + texture_cache/decode_bc.h texture_cache/descriptor_table.h texture_cache/formatter.cpp texture_cache/formatter.h @@ -279,7 +279,7 @@ add_library(video_core STATIC create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC common core) -target_link_libraries(video_core PUBLIC glad shader_recompiler stb) +target_link_libraries(video_core PUBLIC glad shader_recompiler stb bc_decoder) if (YUZU_USE_BUNDLED_FFMPEG AND NOT (WIN32 OR ANDROID)) add_dependencies(video_core ffmpeg-build) diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 9a0b10568..a8540339d 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -259,6 +259,26 @@ FormatInfo SurfaceFormat(const Device& device, FormatType format_type, bool with break; } } + // Transcode on hardware that doesn't support BCn natively + if (!device.IsOptimalBcnSupported() && VideoCore::Surface::IsPixelFormatBCn(pixel_format)) { + const bool is_srgb = with_srgb && VideoCore::Surface::IsPixelFormatSRGB(pixel_format); + if (pixel_format == PixelFormat::BC4_SNORM) { + tuple.format = VK_FORMAT_R8_SNORM; + } else if (pixel_format == PixelFormat::BC4_UNORM) { + tuple.format = VK_FORMAT_R8_UNORM; + } else if (pixel_format == PixelFormat::BC5_SNORM) { + tuple.format = VK_FORMAT_R8G8_SNORM; + } else if (pixel_format == PixelFormat::BC5_UNORM) { + tuple.format = VK_FORMAT_R8G8_UNORM; + } else if (pixel_format == PixelFormat::BC6H_SFLOAT || + pixel_format == PixelFormat::BC6H_UFLOAT) { + tuple.format = VK_FORMAT_R16G16B16A16_SFLOAT; + } else if (is_srgb) { + tuple.format = VK_FORMAT_A8B8G8R8_SRGB_PACK32; + } else { + tuple.format = VK_FORMAT_A8B8G8R8_UNORM_PACK32; + } + } const bool attachable = (tuple.usage & Attachable) != 0; const bool storage = (tuple.usage & Storage) != 0; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 268b955fb..f7c0d939a 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -315,7 +315,14 @@ void RasterizerVulkan::Clear(u32 layer_count) { FlushWork(); gpu_memory->FlushCaching(); +#if ANDROID + if (Settings::IsGPULevelHigh()) { + // This is problematic on Android, disable on GPU Normal. + query_cache.UpdateCounters(); + } +#else query_cache.UpdateCounters(); +#endif auto& regs = maxwell3d->regs; const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index ce6acc30c..8385b5509 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1279,6 +1279,10 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu flags |= VideoCommon::ImageFlagBits::Converted; flags |= VideoCommon::ImageFlagBits::CostlyLoad; } + if (IsPixelFormatBCn(info.format) && !runtime->device.IsOptimalBcnSupported()) { + flags |= VideoCommon::ImageFlagBits::Converted; + flags |= VideoCommon::ImageFlagBits::CostlyLoad; + } if (runtime->device.HasDebuggingToolAttached()) { original_image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); } diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index cb51529e4..e16cd5e73 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -269,6 +269,28 @@ bool IsPixelFormatASTC(PixelFormat format) { } } +bool IsPixelFormatBCn(PixelFormat format) { + switch (format) { + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC2_UNORM: + case PixelFormat::BC3_UNORM: + case PixelFormat::BC4_UNORM: + case PixelFormat::BC4_SNORM: + case PixelFormat::BC5_UNORM: + case PixelFormat::BC5_SNORM: + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC2_SRGB: + case PixelFormat::BC3_SRGB: + case PixelFormat::BC7_UNORM: + case PixelFormat::BC6H_UFLOAT: + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC7_SRGB: + return true; + default: + return false; + } +} + bool IsPixelFormatSRGB(PixelFormat format) { switch (format) { case PixelFormat::A8B8G8R8_SRGB: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 0225d3287..9b9c4d9bc 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -501,6 +501,8 @@ SurfaceType GetFormatType(PixelFormat pixel_format); bool IsPixelFormatASTC(PixelFormat format); +bool IsPixelFormatBCn(PixelFormat format); + bool IsPixelFormatSRGB(PixelFormat format); bool IsPixelFormatInteger(PixelFormat format); diff --git a/src/video_core/texture_cache/decode_bc.cpp b/src/video_core/texture_cache/decode_bc.cpp new file mode 100644 index 000000000..3e26474a3 --- /dev/null +++ b/src/video_core/texture_cache/decode_bc.cpp @@ -0,0 +1,129 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include + +#include "common/common_types.h" +#include "video_core/texture_cache/decode_bc.h" + +namespace VideoCommon { + +namespace { +constexpr u32 BLOCK_SIZE = 4; + +using VideoCore::Surface::PixelFormat; + +constexpr bool IsSigned(PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + return true; + default: + return false; + } +} + +constexpr u32 BlockSize(PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + return 8; + default: + return 16; + } +} +} // Anonymous namespace + +u32 ConvertedBytesPerBlock(VideoCore::Surface::PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + return 1; + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + return 2; + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + return 8; + default: + return 4; + } +} + +template +void DecompressBlocks(std::span input, std::span output, Extent3D extent, + bool is_signed = false) { + const u32 out_bpp = ConvertedBytesPerBlock(pixel_format); + const u32 block_width = std::min(extent.width, BLOCK_SIZE); + const u32 block_height = std::min(extent.height, BLOCK_SIZE); + const u32 pitch = extent.width * out_bpp; + size_t input_offset = 0; + size_t output_offset = 0; + for (u32 slice = 0; slice < extent.depth; ++slice) { + for (u32 y = 0; y < extent.height; y += block_height) { + size_t row_offset = 0; + for (u32 x = 0; x < extent.width; + x += block_width, row_offset += block_width * out_bpp) { + const u8* src = input.data() + input_offset; + u8* const dst = output.data() + output_offset + row_offset; + if constexpr (IsSigned(pixel_format)) { + decompress(src, dst, x, y, extent.width, extent.height, is_signed); + } else { + decompress(src, dst, x, y, extent.width, extent.height); + } + input_offset += BlockSize(pixel_format); + } + output_offset += block_height * pitch; + } + } +} + +void DecompressBCn(std::span input, std::span output, Extent3D extent, + VideoCore::Surface::PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC1_RGBA_SRGB: + DecompressBlocks(input, output, extent); + break; + case PixelFormat::BC2_UNORM: + case PixelFormat::BC2_SRGB: + DecompressBlocks(input, output, extent); + break; + case PixelFormat::BC3_UNORM: + case PixelFormat::BC3_SRGB: + DecompressBlocks(input, output, extent); + break; + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + DecompressBlocks( + input, output, extent, pixel_format == PixelFormat::BC4_SNORM); + break; + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + DecompressBlocks( + input, output, extent, pixel_format == PixelFormat::BC5_SNORM); + break; + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + DecompressBlocks( + input, output, extent, pixel_format == PixelFormat::BC6H_SFLOAT); + break; + case PixelFormat::BC7_SRGB: + case PixelFormat::BC7_UNORM: + DecompressBlocks(input, output, extent); + break; + default: + LOG_WARNING(HW_GPU, "Unimplemented BCn decompression {}", pixel_format); + } +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc.h b/src/video_core/texture_cache/decode_bc.h new file mode 100644 index 000000000..41d1ec0a3 --- /dev/null +++ b/src/video_core/texture_cache/decode_bc.h @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include + +#include "common/common_types.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +[[nodiscard]] u32 ConvertedBytesPerBlock(VideoCore::Surface::PixelFormat pixel_format); + +void DecompressBCn(std::span input, std::span output, Extent3D extent, + VideoCore::Surface::PixelFormat pixel_format); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc4.cpp b/src/video_core/texture_cache/decode_bc4.cpp deleted file mode 100644 index ef98afdca..000000000 --- a/src/video_core/texture_cache/decode_bc4.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include - -#include "common/assert.h" -#include "common/common_types.h" -#include "video_core/texture_cache/decode_bc4.h" -#include "video_core/texture_cache/types.h" - -namespace VideoCommon { - -// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt -[[nodiscard]] constexpr u32 DecompressBlock(u64 bits, u32 x, u32 y) { - const u32 code_offset = 16 + 3 * (4 * y + x); - const u32 code = (bits >> code_offset) & 7; - const u32 red0 = (bits >> 0) & 0xff; - const u32 red1 = (bits >> 8) & 0xff; - if (red0 > red1) { - switch (code) { - case 0: - return red0; - case 1: - return red1; - case 2: - return (6 * red0 + 1 * red1) / 7; - case 3: - return (5 * red0 + 2 * red1) / 7; - case 4: - return (4 * red0 + 3 * red1) / 7; - case 5: - return (3 * red0 + 4 * red1) / 7; - case 6: - return (2 * red0 + 5 * red1) / 7; - case 7: - return (1 * red0 + 6 * red1) / 7; - } - } else { - switch (code) { - case 0: - return red0; - case 1: - return red1; - case 2: - return (4 * red0 + 1 * red1) / 5; - case 3: - return (3 * red0 + 2 * red1) / 5; - case 4: - return (2 * red0 + 3 * red1) / 5; - case 5: - return (1 * red0 + 4 * red1) / 5; - case 6: - return 0; - case 7: - return 0xff; - } - } - return 0; -} - -void DecompressBC4(std::span input, Extent3D extent, std::span output) { - UNIMPLEMENTED_IF_MSG(extent.width % 4 != 0, "Unaligned width={}", extent.width); - UNIMPLEMENTED_IF_MSG(extent.height % 4 != 0, "Unaligned height={}", extent.height); - static constexpr u32 BLOCK_SIZE = 4; - size_t input_offset = 0; - for (u32 slice = 0; slice < extent.depth; ++slice) { - for (u32 block_y = 0; block_y < extent.height / 4; ++block_y) { - for (u32 block_x = 0; block_x < extent.width / 4; ++block_x) { - u64 bits; - std::memcpy(&bits, &input[input_offset], sizeof(bits)); - input_offset += sizeof(bits); - - for (u32 y = 0; y < BLOCK_SIZE; ++y) { - for (u32 x = 0; x < BLOCK_SIZE; ++x) { - const u32 linear_z = slice; - const u32 linear_y = block_y * BLOCK_SIZE + y; - const u32 linear_x = block_x * BLOCK_SIZE + x; - const u32 offset_z = linear_z * extent.width * extent.height; - const u32 offset_y = linear_y * extent.width; - const u32 offset_x = linear_x; - const u32 output_offset = (offset_z + offset_y + offset_x) * 4ULL; - const u32 color = DecompressBlock(bits, x, y); - output[output_offset + 0] = static_cast(color); - output[output_offset + 1] = 0; - output[output_offset + 2] = 0; - output[output_offset + 3] = 0xff; - } - } - } - } - } -} - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc4.h b/src/video_core/texture_cache/decode_bc4.h deleted file mode 100644 index ab2f735be..000000000 --- a/src/video_core/texture_cache/decode_bc4.h +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include - -#include "common/common_types.h" -#include "video_core/texture_cache/types.h" - -namespace VideoCommon { - -void DecompressBC4(std::span data, Extent3D extent, std::span output); - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index f781cb7a0..9a618a57a 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -24,7 +24,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/surface.h" -#include "video_core/texture_cache/decode_bc4.h" +#include "video_core/texture_cache/decode_bc.h" #include "video_core/texture_cache/format_lookup_table.h" #include "video_core/texture_cache/formatter.h" #include "video_core/texture_cache/samples_helper.h" @@ -61,8 +61,6 @@ using VideoCore::Surface::PixelFormatFromDepthFormat; using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using VideoCore::Surface::SurfaceType; -constexpr u32 CONVERTED_BYTES_PER_BLOCK = BytesPerBlock(PixelFormat::A8B8G8R8_UNORM); - struct LevelInfo { Extent3D size; Extent3D block; @@ -612,7 +610,8 @@ u32 CalculateConvertedSizeBytes(const ImageInfo& info) noexcept { } return output_size; } - return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * CONVERTED_BYTES_PER_BLOCK; + return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * + ConvertedBytesPerBlock(info.format); } u32 CalculateLayerStride(const ImageInfo& info) noexcept { @@ -945,7 +944,8 @@ void ConvertImage(std::span input, const ImageInfo& info, std::span input, const ImageInfo& info, std::span input, const ImageInfo& info, std::span(copy.buffer_size); } else { - DecompressBC4(input_offset, copy.image_extent, output.subspan(output_offset)); - + const Extent3D image_extent{ + .width = copy.image_extent.width, + .height = copy.image_extent.height * copy.image_subresource.num_layers, + .depth = copy.image_extent.depth, + }; + DecompressBCn(input_offset, output.subspan(output_offset), image_extent, info.format); output_offset += copy.image_extent.width * copy.image_extent.height * - copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK; + copy.image_subresource.num_layers * + ConvertedBytesPerBlock(info.format); } } } diff --git a/src/video_core/textures/bcn.cpp b/src/video_core/textures/bcn.cpp index 671212a49..16ddbe320 100644 --- a/src/video_core/textures/bcn.cpp +++ b/src/video_core/textures/bcn.cpp @@ -3,7 +3,6 @@ #include #include - #include "common/alignment.h" #include "video_core/textures/bcn.h" #include "video_core/textures/workers.h" diff --git a/src/video_core/textures/bcn.h b/src/video_core/textures/bcn.h index 6464af885..d5d2a16c9 100644 --- a/src/video_core/textures/bcn.h +++ b/src/video_core/textures/bcn.h @@ -4,14 +4,13 @@ #pragma once #include -#include + +#include "common/common_types.h" namespace Tegra::Texture::BCN { -void CompressBC1(std::span data, uint32_t width, uint32_t height, uint32_t depth, - std::span output); +void CompressBC1(std::span data, u32 width, u32 height, u32 depth, std::span output); -void CompressBC3(std::span data, uint32_t width, uint32_t height, uint32_t depth, - std::span output); +void CompressBC3(std::span data, u32 width, u32 height, u32 depth, std::span output); } // namespace Tegra::Texture::BCN diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index e05d04db3..1f17265d5 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -293,6 +293,11 @@ public: return features.features.textureCompressionASTC_LDR; } + /// Returns true if BCn is natively supported. + bool IsOptimalBcnSupported() const { + return features.features.textureCompressionBC; + } + /// Returns true if descriptor aliasing is natively supported. bool IsDescriptorAliasingSupported() const { return GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY; @@ -423,6 +428,11 @@ public: return extensions.sampler_filter_minmax; } + /// Returns true if the device supports VK_EXT_shader_stencil_export. + bool IsExtShaderStencilExportSupported() const { + return extensions.shader_stencil_export; + } + /// Returns true if the device supports VK_EXT_depth_range_unrestricted. bool IsExtDepthRangeUnrestrictedSupported() const { return extensions.depth_range_unrestricted; @@ -492,11 +502,6 @@ public: return extensions.vertex_input_dynamic_state; } - /// Returns true if the device supports VK_EXT_shader_stencil_export. - bool IsExtShaderStencilExportSupported() const { - return extensions.shader_stencil_export; - } - /// Returns true if the device supports VK_EXT_shader_demote_to_helper_invocation bool IsExtShaderDemoteToHelperInvocationSupported() const { return extensions.shader_demote_to_helper_invocation; -- cgit v1.2.3 From b62121fd605663dc9aaaae72fe8e444312f9c5d5 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 29 Jun 2023 11:58:45 +0200 Subject: Texture cache: Fix YFC regression due to code testing --- src/video_core/texture_cache/texture_cache.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'src/video_core/texture_cache') diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d3f03a995..485f6b6f3 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -598,14 +598,6 @@ void TextureCache

::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz [&](ImageId id, Image&) { deleted_images.push_back(id); }); for (const ImageId id : deleted_images) { Image& image = slot_images[id]; - if (True(image.flags & ImageFlagBits::CpuModified)) { - return; - } - image.flags |= ImageFlagBits::CpuModified; - if (True(image.flags & ImageFlagBits::Tracked)) { - UntrackImage(image, id); - } - /* if (True(image.flags & ImageFlagBits::Remapped)) { continue; } @@ -613,7 +605,6 @@ void TextureCache

::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz if (True(image.flags & ImageFlagBits::Tracked)) { UntrackImage(image, id); } - */ } } -- cgit v1.2.3 From 596a6132b974dd73935854d8f51842424e058be8 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Thu, 29 Jun 2023 17:23:29 +0200 Subject: AccelerateDMA: Don't accelerate 3D texture DMA operations --- src/video_core/texture_cache/texture_cache.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src/video_core/texture_cache') diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d3f03a995..0330415b7 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -879,6 +879,10 @@ ImageId TextureCache

::DmaImageId(const Tegra::DMA::ImageOperand& operand, boo return NULL_IMAGE_ID; } auto& image = slot_images[image_id]; + if (image.info.type == ImageType::e3D) { + // Don't accelerate 3D images. + return NULL_IMAGE_ID; + } if (!is_upload && !image.info.dma_downloaded) { // Force a full sync. image.info.dma_downloaded = true; -- cgit v1.2.3