diff options
Diffstat (limited to 'src/video_core')
26 files changed, 361 insertions, 246 deletions
diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp index d774db107..b60f86260 100644 --- a/src/video_core/cdma_pusher.cpp +++ b/src/video_core/cdma_pusher.cpp @@ -144,7 +144,7 @@ void CDmaPusher::ExecuteCommand(u32 offset, u32 data) { } case ThiMethod::SetMethod1: LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", - static_cast<u32>(vic_thi_state.method_0)); + static_cast<u32>(vic_thi_state.method_0), data); vic_processor->ProcessMethod(static_cast<Tegra::Vic::Method>(vic_thi_state.method_0), {data}); break; diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp index 2df410be8..1adf3cd13 100644 --- a/src/video_core/command_classes/codecs/codec.cpp +++ b/src/video_core/command_classes/codecs/codec.cpp @@ -4,6 +4,7 @@ #include <cstring> #include <fstream> +#include <vector> #include "common/assert.h" #include "video_core/command_classes/codecs/codec.h" #include "video_core/command_classes/codecs/h264.h" diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h index 2e56daf29..5bbe6a332 100644 --- a/src/video_core/command_classes/codecs/codec.h +++ b/src/video_core/command_classes/codecs/codec.h @@ -5,8 +5,6 @@ #pragma once #include <memory> -#include <vector> -#include "common/common_funcs.h" #include "common/common_types.h" #include "video_core/command_classes/nvdec_common.h" @@ -44,11 +42,11 @@ public: void Decode(); /// Returns most recently decoded frame - AVFrame* GetCurrentFrame(); - const AVFrame* GetCurrentFrame() const; + [[nodiscard]] AVFrame* GetCurrentFrame(); + [[nodiscard]] const AVFrame* GetCurrentFrame() const; /// Returns the value of current_codec - NvdecCommon::VideoCodec GetCurrentCodec() const; + [[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const; private: bool initialized{}; diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp index 1a39f7b23..33e063e20 100644 --- a/src/video_core/command_classes/codecs/h264.cpp +++ b/src/video_core/command_classes/codecs/h264.cpp @@ -18,17 +18,33 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // +#include <array> #include "common/bit_util.h" #include "video_core/command_classes/codecs/h264.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" namespace Tegra::Decoder { +namespace { +// ZigZag LUTs from libavcodec. +constexpr std::array<u8, 64> zig_zag_direct{ + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, + 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, +}; + +constexpr std::array<u8, 16> zig_zag_scan{ + 0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, + 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, +}; +} // Anonymous namespace + H264::H264(GPU& gpu_) : gpu(gpu_) {} H264::~H264() = default; -std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, bool is_first_frame) { +const std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, + bool is_first_frame) { H264DecoderContext context{}; gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); @@ -48,7 +64,8 @@ std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, bo writer.WriteU(0, 8); writer.WriteU(31, 8); writer.WriteUe(0); - const s32 chroma_format_idc = (context.h264_parameter_set.flags >> 12) & 0x3; + const auto chroma_format_idc = + static_cast<u32>((context.h264_parameter_set.flags >> 12) & 3); writer.WriteUe(chroma_format_idc); if (chroma_format_idc == 3) { writer.WriteBit(false); @@ -59,8 +76,8 @@ std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, bo writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag writer.WriteBit(false); // Scaling matrix present flag - const s32 order_cnt_type = static_cast<s32>((context.h264_parameter_set.flags >> 14) & 3); - writer.WriteUe(static_cast<s32>((context.h264_parameter_set.flags >> 8) & 0xf)); + const auto order_cnt_type = static_cast<u32>((context.h264_parameter_set.flags >> 14) & 3); + writer.WriteUe(static_cast<u32>((context.h264_parameter_set.flags >> 8) & 0xf)); writer.WriteUe(order_cnt_type); if (order_cnt_type == 0) { writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt); @@ -100,7 +117,7 @@ std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, bo writer.WriteUe(0); writer.WriteUe(0); - writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag); + writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); writer.WriteBit(false); writer.WriteUe(0); writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); @@ -172,8 +189,8 @@ void H264BitWriter::WriteSe(s32 value) { WriteExpGolombCodedInt(value); } -void H264BitWriter::WriteUe(s32 value) { - WriteExpGolombCodedUInt((u32)value); +void H264BitWriter::WriteUe(u32 value) { + WriteExpGolombCodedUInt(value); } void H264BitWriter::End() { diff --git a/src/video_core/command_classes/codecs/h264.h b/src/video_core/command_classes/codecs/h264.h index 21752dd90..273449495 100644 --- a/src/video_core/command_classes/codecs/h264.h +++ b/src/video_core/command_classes/codecs/h264.h @@ -38,7 +38,7 @@ public: /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax void WriteU(s32 value, s32 value_sz); void WriteSe(s32 value); - void WriteUe(s32 value); + void WriteUe(u32 value); /// Finalize the bitstream void End(); @@ -51,26 +51,14 @@ public: void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count); /// Return the bitstream as a vector. - std::vector<u8>& GetByteArray(); - const std::vector<u8>& GetByteArray() const; + [[nodiscard]] std::vector<u8>& GetByteArray(); + [[nodiscard]] const std::vector<u8>& GetByteArray() const; private: - // ZigZag LUTs from libavcodec. - static constexpr std::array<u8, 64> zig_zag_direct{ - 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, - 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, - 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, - }; - - static constexpr std::array<u8, 16> zig_zag_scan{ - 0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, - 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, - }; - void WriteBits(s32 value, s32 bit_count); void WriteExpGolombCodedInt(s32 value); void WriteExpGolombCodedUInt(u32 value); - s32 GetFreeBufferBits(); + [[nodiscard]] s32 GetFreeBufferBits(); void Flush(); s32 buffer_size{8}; @@ -86,8 +74,8 @@ public: ~H264(); /// Compose the H264 header of the frame for FFmpeg decoding - std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, - bool is_first_frame = false); + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, + bool is_first_frame = false); private: struct H264ParameterSet { diff --git a/src/video_core/command_classes/codecs/vp9.cpp b/src/video_core/command_classes/codecs/vp9.cpp index d205a8f5d..ab44fdc9e 100644 --- a/src/video_core/command_classes/codecs/vp9.cpp +++ b/src/video_core/command_classes/codecs/vp9.cpp @@ -9,7 +9,7 @@ #include "video_core/memory_manager.h" namespace Tegra::Decoder { - +namespace { // Default compressed header probabilities once frame context resets constexpr Vp9EntropyProbs default_probs{ .y_mode_prob{ @@ -170,6 +170,89 @@ constexpr Vp9EntropyProbs default_probs{ .high_precision{128, 128}, }; +constexpr std::array<s32, 256> norm_lut{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +constexpr std::array<s32, 254> map_lut{ + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177, + 178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17, + 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 19, +}; + +// 6.2.14 Tile size calculation + +[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 min_log2 = 0; + + while ((64 << min_log2) < sb64_cols) { + min_log2++; + } + + return min_log2; +} + +[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 max_log2 = 1; + + while ((sb64_cols >> max_log2) >= 4) { + max_log2++; + } + + return max_log2 - 1; +} + +// Recenters probability. Based on section 6.3.6 of VP9 Specification +[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) { + if (new_prob > old_prob * 2) { + return new_prob; + } + + if (new_prob >= old_prob) { + return (new_prob - old_prob) * 2; + } + + return (old_prob - new_prob) * 2 - 1; +} + +// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification +[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) { + new_prob--; + old_prob--; + + std::size_t index{}; + + if (old_prob * 2 <= 0xff) { + index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1)); + } else { + index = static_cast<std::size_t>( + std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1)); + } + + return map_lut[index]; +} +} // Anonymous namespace + VP9::VP9(GPU& gpu) : gpu(gpu) {} VP9::~VP9() = default; @@ -207,32 +290,6 @@ void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_pro EncodeTermSubExp(writer, delta); } -s32 VP9::RemapProbability(s32 new_prob, s32 old_prob) { - new_prob--; - old_prob--; - - std::size_t index{}; - - if (old_prob * 2 <= 0xff) { - index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1)); - } else { - index = static_cast<std::size_t>( - std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1)); - } - - return map_lut[index]; -} - -s32 VP9::RecenterNonNeg(s32 new_prob, s32 old_prob) { - if (new_prob > old_prob * 2) { - return new_prob; - } else if (new_prob >= old_prob) { - return (new_prob - old_prob) * 2; - } else { - return (old_prob - new_prob) * 2 - 1; - } -} - void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) { if (WriteLessThan(writer, value, 16)) { writer.Write(value, 4); @@ -332,28 +389,6 @@ void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_ } } -s32 VP9::CalcMinLog2TileCols(s32 frame_width) { - const s32 sb64_cols = (frame_width + 63) / 64; - s32 min_log2 = 0; - - while ((64 << min_log2) < sb64_cols) { - min_log2++; - } - - return min_log2; -} - -s32 VP9::CalcMaxLog2TileCols(s32 frameWidth) { - const s32 sb64_cols = (frameWidth + 63) / 64; - s32 max_log2 = 1; - - while ((sb64_cols >> max_log2) >= 4) { - max_log2++; - } - - return max_log2 - 1; -} - Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) { PictureInfo picture_info{}; gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo)); @@ -379,14 +414,14 @@ Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) Vp9FrameContainer frame{}; { gpu.SyncGuestHost(); - frame.info = std::move(GetVp9PictureInfo(state)); + frame.info = GetVp9PictureInfo(state); frame.bit_stream.resize(frame.info.bitstream_size); gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.bit_stream.data(), frame.info.bitstream_size); } // Buffer two frames, saving the last show frame info - if (next_next_frame.bit_stream.size() != 0) { + if (!next_next_frame.bit_stream.empty()) { Vp9FrameContainer temp{ .info = frame.info, .bit_stream = frame.bit_stream, @@ -396,15 +431,15 @@ Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) frame.bit_stream = next_next_frame.bit_stream; next_next_frame = std::move(temp); - if (next_frame.bit_stream.size() != 0) { - Vp9FrameContainer temp{ + if (!next_frame.bit_stream.empty()) { + Vp9FrameContainer temp2{ .info = frame.info, .bit_stream = frame.bit_stream, }; next_frame.info.show_frame = frame.info.last_frame_shown; frame.info = next_frame.info; frame.bit_stream = next_frame.bit_stream; - next_frame = std::move(temp); + next_frame = std::move(temp2); } else { next_frame.info = frame.info; next_frame.bit_stream = frame.bit_stream; @@ -605,12 +640,6 @@ std::vector<u8> VP9::ComposeCompressedHeader() { writer.End(); return writer.GetBuffer(); - - const auto writer_bytearray = writer.GetBuffer(); - - std::vector<u8> compressed_header(writer_bytearray.size()); - std::memcpy(compressed_header.data(), writer_bytearray.data(), writer_bytearray.size()); - return compressed_header; } VpxBitStreamWriter VP9::ComposeUncompressedHeader() { @@ -648,7 +677,6 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { current_frame_info.intra_only = true; } else { - std::array<s32, 3> ref_frame_index; if (!current_frame_info.show_frame) { uncomp_writer.WriteBit(current_frame_info.intra_only); @@ -663,9 +691,9 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { } // Last, Golden, Altref frames - ref_frame_index = std::array<s32, 3>{0, 1, 2}; + std::array<s32, 3> ref_frame_index{0, 1, 2}; - // set when next frame is hidden + // Set when next frame is hidden // altref and golden references are swapped if (swap_next_golden) { ref_frame_index = std::array<s32, 3>{0, 2, 1}; @@ -754,17 +782,19 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { const s8 old_deltas = loop_filter_ref_deltas[index]; const s8 new_deltas = current_frame_info.ref_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; - loop_filter_delta_update |= - (update_loop_filter_ref_deltas[index] = old_deltas != new_deltas); + update_loop_filter_ref_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; } for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { const s8 old_deltas = loop_filter_mode_deltas[index]; const s8 new_deltas = current_frame_info.mode_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; - loop_filter_delta_update |= - (update_loop_filter_mode_deltas[index] = old_deltas != new_deltas); + update_loop_filter_mode_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; } uncomp_writer.WriteBit(loop_filter_delta_update); @@ -824,12 +854,12 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { return uncomp_writer; } -std::vector<u8>& VP9::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state) { +const std::vector<u8>& VP9::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state) { std::vector<u8> bitstream; { Vp9FrameContainer curr_frame = GetCurrentFrame(state); current_frame_info = curr_frame.info; - bitstream = curr_frame.bit_stream; + bitstream = std::move(curr_frame.bit_stream); } // The uncompressed header routine sets PrevProb parameters needed for the compressed header diff --git a/src/video_core/command_classes/codecs/vp9.h b/src/video_core/command_classes/codecs/vp9.h index 748e11bae..e2504512c 100644 --- a/src/video_core/command_classes/codecs/vp9.h +++ b/src/video_core/command_classes/codecs/vp9.h @@ -4,9 +4,9 @@ #pragma once -#include <unordered_map> +#include <array> #include <vector> -#include "common/common_funcs.h" + #include "common/common_types.h" #include "common/stream.h" #include "video_core/command_classes/codecs/vp9_types.h" @@ -25,6 +25,12 @@ public: VpxRangeEncoder(); ~VpxRangeEncoder(); + VpxRangeEncoder(const VpxRangeEncoder&) = delete; + VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete; + + VpxRangeEncoder(VpxRangeEncoder&&) = default; + VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default; + /// Writes the rightmost value_size bits from value into the stream void Write(s32 value, s32 value_size); @@ -37,11 +43,11 @@ public: /// Signal the end of the bitstream void End(); - std::vector<u8>& GetBuffer() { + [[nodiscard]] std::vector<u8>& GetBuffer() { return base_stream.GetBuffer(); } - const std::vector<u8>& GetBuffer() const { + [[nodiscard]] const std::vector<u8>& GetBuffer() const { return base_stream.GetBuffer(); } @@ -52,17 +58,6 @@ private: u32 range{0xff}; s32 count{-24}; s32 half_probability{128}; - static constexpr std::array<s32, 256> norm_lut{ - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; }; class VpxBitStreamWriter { @@ -70,6 +65,12 @@ public: VpxBitStreamWriter(); ~VpxBitStreamWriter(); + VpxBitStreamWriter(const VpxBitStreamWriter&) = delete; + VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete; + + VpxBitStreamWriter(VpxBitStreamWriter&&) = default; + VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default; + /// Write an unsigned integer value void WriteU(u32 value, u32 value_size); @@ -86,10 +87,10 @@ public: void Flush(); /// Returns byte_array - std::vector<u8>& GetByteArray(); + [[nodiscard]] std::vector<u8>& GetByteArray(); /// Returns const byte_array - const std::vector<u8>& GetByteArray() const; + [[nodiscard]] const std::vector<u8>& GetByteArray() const; private: /// Write bit_count bits from value into buffer @@ -110,12 +111,18 @@ public: explicit VP9(GPU& gpu); ~VP9(); + VP9(const VP9&) = delete; + VP9& operator=(const VP9&) = delete; + + VP9(VP9&&) = default; + VP9& operator=(VP9&&) = delete; + /// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec /// documentation - std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state); + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state); /// Returns true if the most recent frame was a hidden frame. - bool WasFrameHidden() const { + [[nodiscard]] bool WasFrameHidden() const { return hidden; } @@ -132,12 +139,6 @@ private: /// Generates compressed header probability deltas in the bitstream writer void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); - /// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification - s32 RemapProbability(s32 new_prob, s32 old_prob); - - /// Recenters probability. Based on section 6.3.6 of VP9 Specification - s32 RecenterNonNeg(s32 new_prob, s32 old_prob); - /// Inverse of 6.3.4 Decode term subexp void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value); @@ -157,22 +158,18 @@ private: /// Write motion vector probability updates. 6.3.17 in the spec void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob); - /// 6.2.14 Tile size calculation - s32 CalcMinLog2TileCols(s32 frame_width); - s32 CalcMaxLog2TileCols(s32 frame_width); - /// Returns VP9 information from NVDEC provided offset and size - Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state); + [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state); /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct void InsertEntropy(u64 offset, Vp9EntropyProbs& dst); /// Returns frame to be decoded after buffering - Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state); + [[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state); /// Use NVDEC providied information to compose the headers for the current frame - std::vector<u8> ComposeCompressedHeader(); - VpxBitStreamWriter ComposeUncompressedHeader(); + [[nodiscard]] std::vector<u8> ComposeCompressedHeader(); + [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader(); GPU& gpu; std::vector<u8> frame; @@ -180,7 +177,7 @@ private: std::array<s8, 4> loop_filter_ref_deltas{}; std::array<s8, 2> loop_filter_mode_deltas{}; - bool hidden; + bool hidden = false; s64 current_frame_number = -2; // since we buffer 2 frames s32 grace_period = 6; // frame offsets need to stabilize std::array<FrameContexts, 4> frame_ctxs{}; @@ -193,23 +190,6 @@ private: s32 diff_update_probability = 252; s32 frame_sync_code = 0x498342; - static constexpr std::array<s32, 254> map_lut = { - 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, - 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, - 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, - 67, 68, 69, 70, 71, 72, 73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, - 83, 84, 85, 5, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 6, - 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, - 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, - 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, - 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159, 160, - 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171, 172, 173, 174, 175, 176, - 177, 178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, - 193, 14, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, - 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, - 224, 225, 226, 227, 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, - 240, 241, 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19, - }; }; } // namespace Decoder diff --git a/src/video_core/command_classes/codecs/vp9_types.h b/src/video_core/command_classes/codecs/vp9_types.h index 8688fdac0..4f0b05d22 100644 --- a/src/video_core/command_classes/codecs/vp9_types.h +++ b/src/video_core/command_classes/codecs/vp9_types.h @@ -4,13 +4,11 @@ #pragma once -#include <algorithm> -#include <list> +#include <array> +#include <cstring> #include <vector> -#include "common/cityhash.h" #include "common/common_funcs.h" #include "common/common_types.h" -#include "video_core/command_classes/nvdec_common.h" namespace Tegra { class GPU; @@ -233,9 +231,8 @@ struct PictureInfo { u32 surface_params{}; INSERT_PADDING_WORDS(3); - Vp9PictureInfo Convert() const { - - return Vp9PictureInfo{ + [[nodiscard]] Vp9PictureInfo Convert() const { + return { .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0, .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0, .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0, diff --git a/src/video_core/command_classes/host1x.cpp b/src/video_core/command_classes/host1x.cpp index a5234ee47..c4dd4881a 100644 --- a/src/video_core/command_classes/host1x.cpp +++ b/src/video_core/command_classes/host1x.cpp @@ -15,7 +15,7 @@ void Tegra::Host1x::StateWrite(u32 offset, u32 arguments) { std::memcpy(state_offset, &arguments, sizeof(u32)); } -void Tegra::Host1x::ProcessMethod(Host1x::Method method, const std::vector<u32>& arguments) { +void Tegra::Host1x::ProcessMethod(Method method, const std::vector<u32>& arguments) { StateWrite(static_cast<u32>(method), arguments[0]); switch (method) { case Method::WaitSyncpt: diff --git a/src/video_core/command_classes/host1x.h b/src/video_core/command_classes/host1x.h index 501a5ed2e..013eaa0c1 100644 --- a/src/video_core/command_classes/host1x.h +++ b/src/video_core/command_classes/host1x.h @@ -61,7 +61,7 @@ public: ~Host1x(); /// Writes the method into the state, Invoke Execute() if encountered - void ProcessMethod(Host1x::Method method, const std::vector<u32>& arguments); + void ProcessMethod(Method method, const std::vector<u32>& arguments); private: /// For Host1x, execute is waiting on a syncpoint previously written into the state diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp index ede9466eb..8ca7a7b06 100644 --- a/src/video_core/command_classes/nvdec.cpp +++ b/src/video_core/command_classes/nvdec.cpp @@ -2,13 +2,9 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <bitset> #include "common/assert.h" -#include "common/bit_util.h" -#include "core/memory.h" #include "video_core/command_classes/nvdec.h" #include "video_core/gpu.h" -#include "video_core/memory_manager.h" namespace Tegra { @@ -16,7 +12,7 @@ Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {} Nvdec::~Nvdec() = default; -void Nvdec::ProcessMethod(Nvdec::Method method, const std::vector<u32>& arguments) { +void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) { if (method == Method::SetVideoCodec) { codec->StateWrite(static_cast<u32>(method), arguments[0]); } else { diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h index c1a9d843e..eec4443f9 100644 --- a/src/video_core/command_classes/nvdec.h +++ b/src/video_core/command_classes/nvdec.h @@ -4,8 +4,8 @@ #pragma once +#include <memory> #include <vector> -#include "common/common_funcs.h" #include "common/common_types.h" #include "video_core/command_classes/codecs/codec.h" @@ -23,17 +23,17 @@ public: ~Nvdec(); /// Writes the method into the state, Invoke Execute() if encountered - void ProcessMethod(Nvdec::Method method, const std::vector<u32>& arguments); + void ProcessMethod(Method method, const std::vector<u32>& arguments); /// Return most recently decoded frame - AVFrame* GetFrame(); - const AVFrame* GetFrame() const; + [[nodiscard]] AVFrame* GetFrame(); + [[nodiscard]] const AVFrame* GetFrame() const; private: /// Invoke codec to decode a frame void Execute(); GPU& gpu; - std::unique_ptr<Tegra::Codec> codec; + std::unique_ptr<Codec> codec; }; } // namespace Tegra diff --git a/src/video_core/command_classes/sync_manager.cpp b/src/video_core/command_classes/sync_manager.cpp index a0ab44855..19dc9e0ab 100644 --- a/src/video_core/command_classes/sync_manager.cpp +++ b/src/video_core/command_classes/sync_manager.cpp @@ -27,22 +27,22 @@ SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {} SyncptIncrManager::~SyncptIncrManager() = default; void SyncptIncrManager::Increment(u32 id) { - increments.push_back(SyncptIncr{0, id, true}); + increments.emplace_back(0, 0, id, true); IncrementAllDone(); } u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) { const u32 handle = current_id++; - increments.push_back(SyncptIncr{handle, class_id, id}); + increments.emplace_back(handle, class_id, id); return handle; } void SyncptIncrManager::SignalDone(u32 handle) { - auto done_incr = std::find_if(increments.begin(), increments.end(), - [handle](SyncptIncr incr) { return incr.id == handle; }); - if (done_incr != increments.end()) { - const SyncptIncr incr = *done_incr; - *done_incr = SyncptIncr{incr.id, incr.class_id, incr.syncpt_id, true}; + const auto done_incr = + std::find_if(increments.begin(), increments.end(), + [handle](const SyncptIncr& incr) { return incr.id == handle; }); + if (done_incr != increments.cend()) { + done_incr->complete = true; } IncrementAllDone(); } diff --git a/src/video_core/command_classes/sync_manager.h b/src/video_core/command_classes/sync_manager.h index 353b67573..2c321ec58 100644 --- a/src/video_core/command_classes/sync_manager.h +++ b/src/video_core/command_classes/sync_manager.h @@ -32,8 +32,8 @@ struct SyncptIncr { u32 syncpt_id; bool complete; - SyncptIncr(u32 id, u32 syncpt_id_, u32 class_id_, bool done = false) - : id(id), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {} + SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false) + : id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {} }; class SyncptIncrManager { diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index 66e15a1a8..5b52da277 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp @@ -26,7 +26,7 @@ void Vic::VicStateWrite(u32 offset, u32 arguments) { std::memcpy(state_offset, &arguments, sizeof(u32)); } -void Vic::ProcessMethod(Vic::Method method, const std::vector<u32>& arguments) { +void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) { LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", static_cast<u32>(method)); VicStateWrite(static_cast<u32>(method), arguments[0]); const u64 arg = static_cast<u64>(arguments[0]) << 8; diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h index dd0a2aed8..8c4e284a1 100644 --- a/src/video_core/command_classes/vic.h +++ b/src/video_core/command_classes/vic.h @@ -63,11 +63,11 @@ public: SetOutputSurfaceChromaVOffset = 0x1ca }; - explicit Vic(GPU& gpu, std::shared_ptr<Tegra::Nvdec> nvdec_processor); + explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor); ~Vic(); /// Write to the device state. - void ProcessMethod(Vic::Method method, const std::vector<u32>& arguments); + void ProcessMethod(Method method, const std::vector<u32>& arguments); private: void Execute(); diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index f2f96ac33..105b85a92 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include "common/cityhash.h" #include "common/microprofile.h" #include "core/core.h" #include "core/memory.h" @@ -12,6 +13,20 @@ namespace Tegra { +void CommandList::RefreshIntegrityChecks(GPU& gpu) { + command_list_hashes.resize(command_lists.size()); + + for (std::size_t index = 0; index < command_lists.size(); ++index) { + const CommandListHeader command_list_header = command_lists[index]; + std::vector<CommandHeader> command_headers(command_list_header.size); + gpu.MemoryManager().ReadBlockUnsafe(command_list_header.addr, command_headers.data(), + command_list_header.size * sizeof(u32)); + command_list_hashes[index] = + Common::CityHash64(reinterpret_cast<char*>(command_headers.data()), + command_list_header.size * sizeof(u32)); + } +} + DmaPusher::DmaPusher(Core::System& system, GPU& gpu) : gpu{gpu}, system{system} {} DmaPusher::~DmaPusher() = default; @@ -45,32 +60,51 @@ bool DmaPusher::Step() { return false; } - const CommandList& command_list{dma_pushbuffer.front()}; - ASSERT_OR_EXECUTE(!command_list.empty(), { - // Somehow the command_list is empty, in order to avoid a crash - // We ignore it and assume its size is 0. - dma_pushbuffer.pop(); - dma_pushbuffer_subindex = 0; - return true; - }); - const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]}; - const GPUVAddr dma_get = command_list_header.addr; - - if (dma_pushbuffer_subindex >= command_list.size()) { - // We've gone through the current list, remove it from the queue - dma_pushbuffer.pop(); - dma_pushbuffer_subindex = 0; - } + CommandList& command_list{dma_pushbuffer.front()}; - if (command_list_header.size == 0) { - return true; - } + ASSERT_OR_EXECUTE( + command_list.command_lists.size() || command_list.prefetch_command_list.size(), { + // Somehow the command_list is empty, in order to avoid a crash + // We ignore it and assume its size is 0. + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + return true; + }); - // Push buffer non-empty, read a word - command_headers.resize(command_list_header.size); - gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), - command_list_header.size * sizeof(u32)); + if (command_list.prefetch_command_list.size()) { + // Prefetched command list from nvdrv, used for things like synchronization + command_headers = std::move(command_list.prefetch_command_list); + dma_pushbuffer.pop(); + } else { + const CommandListHeader command_list_header{ + command_list.command_lists[dma_pushbuffer_subindex]}; + const u64 next_hash = command_list.command_list_hashes[dma_pushbuffer_subindex++]; + const GPUVAddr dma_get = command_list_header.addr; + + if (dma_pushbuffer_subindex >= command_list.command_lists.size()) { + // We've gone through the current list, remove it from the queue + dma_pushbuffer.pop(); + dma_pushbuffer_subindex = 0; + } + if (command_list_header.size == 0) { + return true; + } + + // Push buffer non-empty, read a word + command_headers.resize(command_list_header.size); + gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), + command_list_header.size * sizeof(u32)); + + // Integrity check + const u64 new_hash = Common::CityHash64(reinterpret_cast<char*>(command_headers.data()), + command_list_header.size * sizeof(u32)); + if (new_hash != next_hash) { + LOG_CRITICAL(HW_GPU, "CommandList at addr=0x{:X} is corrupt, skipping!", dma_get); + dma_pushbuffer.pop(); + return true; + } + } for (std::size_t index = 0; index < command_headers.size();) { const CommandHeader& command_header = command_headers[index]; diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h index efa90d170..8496ba2da 100644 --- a/src/video_core/dma_pusher.h +++ b/src/video_core/dma_pusher.h @@ -27,6 +27,31 @@ enum class SubmissionMode : u32 { IncreaseOnce = 5 }; +// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence +// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. +// So the values you see in docs might be multiplied by 4. +enum class BufferMethods : u32 { + BindObject = 0x0, + Nop = 0x2, + SemaphoreAddressHigh = 0x4, + SemaphoreAddressLow = 0x5, + SemaphoreSequence = 0x6, + SemaphoreTrigger = 0x7, + NotifyIntr = 0x8, + WrcacheFlush = 0x9, + Unk28 = 0xA, + UnkCacheFlush = 0xB, + RefCnt = 0x14, + SemaphoreAcquire = 0x1A, + SemaphoreRelease = 0x1B, + FenceValue = 0x1C, + FenceAction = 0x1D, + WaitForInterrupt = 0x1E, + Unk7c = 0x1F, + Yield = 0x20, + NonPullerMethods = 0x40, +}; + struct CommandListHeader { union { u64 raw; @@ -49,9 +74,29 @@ union CommandHeader { static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout"); static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); +static constexpr CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, + SubmissionMode mode) { + CommandHeader result{}; + result.method.Assign(static_cast<u32>(method)); + result.arg_count.Assign(arg_count); + result.mode.Assign(mode); + return result; +} + class GPU; -using CommandList = std::vector<Tegra::CommandListHeader>; +struct CommandList final { + CommandList() = default; + explicit CommandList(std::size_t size) : command_lists(size) {} + explicit CommandList(std::vector<Tegra::CommandHeader>&& prefetch_command_list) + : prefetch_command_list{std::move(prefetch_command_list)} {} + + void RefreshIntegrityChecks(GPU& gpu); + + std::vector<Tegra::CommandListHeader> command_lists; + std::vector<u64> command_list_hashes; + std::vector<Tegra::CommandHeader> prefetch_command_list; +}; /** * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the @@ -60,7 +105,7 @@ using CommandList = std::vector<Tegra::CommandListHeader>; * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for * details on this implementation. */ -class DmaPusher { +class DmaPusher final { public: explicit DmaPusher(Core::System& system, GPU& gpu); ~DmaPusher(); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index d374b73cf..a3c05d1b0 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -1893,6 +1893,7 @@ public: ICMP_IMM, FCMP_RR, FCMP_RC, + FCMP_IMMR, MUFU, // Multi-Function Operator RRO_C, // Range Reduction Operator RRO_R, @@ -2205,6 +2206,7 @@ private: INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"), INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"), INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"), + INST("0011011-1010----", Id::FCMP_IMMR, Type::Arithmetic, "FCMP_IMMR"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 171f78183..ebd149c3a 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -194,30 +194,6 @@ void GPU::SyncGuestHost() { void GPU::OnCommandListEnd() { renderer->Rasterizer().ReleaseFences(); } -// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence -// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. -// So the values you see in docs might be multiplied by 4. -enum class BufferMethods { - BindObject = 0x0, - Nop = 0x2, - SemaphoreAddressHigh = 0x4, - SemaphoreAddressLow = 0x5, - SemaphoreSequence = 0x6, - SemaphoreTrigger = 0x7, - NotifyIntr = 0x8, - WrcacheFlush = 0x9, - Unk28 = 0xA, - UnkCacheFlush = 0xB, - RefCnt = 0x14, - SemaphoreAcquire = 0x1A, - SemaphoreRelease = 0x1B, - FenceValue = 0x1C, - FenceAction = 0x1D, - Unk78 = 0x1E, - Unk7c = 0x1F, - Yield = 0x20, - NonPullerMethods = 0x40, -}; enum class GpuSemaphoreOperation { AcquireEqual = 0x1, @@ -277,7 +253,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::UnkCacheFlush: case BufferMethods::WrcacheFlush: case BufferMethods::FenceValue: + break; case BufferMethods::FenceAction: + ProcessFenceActionMethod(); + break; + case BufferMethods::WaitForInterrupt: + ProcessWaitForInterruptMethod(); break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); @@ -391,6 +372,25 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) { } } +void GPU::ProcessFenceActionMethod() { + switch (regs.fence_action.op) { + case FenceOperation::Acquire: + WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); + break; + case FenceOperation::Increment: + IncrementSyncPoint(regs.fence_action.syncpoint_id); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented operation {}", + static_cast<u32>(regs.fence_action.op.Value())); + } +} + +void GPU::ProcessWaitForInterruptMethod() { + // TODO(bunnei) ImplementMe + LOG_WARNING(HW_GPU, "(STUBBED) called"); +} + void GPU::ProcessSemaphoreTriggerMethod() { const auto semaphoreOperationMask = 0xF; const auto op = diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index b8c613b11..5444b49f3 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -263,6 +263,24 @@ public: return use_nvdec; } + enum class FenceOperation : u32 { + Acquire = 0, + Increment = 1, + }; + + union FenceAction { + u32 raw; + BitField<0, 1, FenceOperation> op; + BitField<8, 24, u32> syncpoint_id; + + static constexpr CommandHeader Build(FenceOperation op, u32 syncpoint_id) { + FenceAction result{}; + result.op.Assign(op); + result.syncpoint_id.Assign(syncpoint_id); + return {result.raw}; + } + }; + struct Regs { static constexpr size_t NUM_REGS = 0x40; @@ -291,10 +309,7 @@ public: u32 semaphore_acquire; u32 semaphore_release; u32 fence_value; - union { - BitField<4, 4, u32> operation; - BitField<8, 8, u32> id; - } fence_action; + FenceAction fence_action; INSERT_UNION_PADDING_WORDS(0xE2); // Puller state @@ -342,6 +357,8 @@ protected: private: void ProcessBindMethod(const MethodCall& method_call); + void ProcessFenceActionMethod(); + void ProcessWaitForInterruptMethod(); void ProcessSemaphoreTriggerMethod(); void ProcessSemaphoreRelease(); void ProcessSemaphoreAcquire(); diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index e1217ca83..f34ed6735 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -771,13 +771,18 @@ void VKDevice::CollectTelemetryParameters() { VkPhysicalDeviceDriverPropertiesKHR driver{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR, .pNext = nullptr, + .driverID = {}, + .driverName = {}, + .driverInfo = {}, + .conformanceVersion = {}, }; - VkPhysicalDeviceProperties2KHR properties{ + VkPhysicalDeviceProperties2KHR device_properties{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, .pNext = &driver, + .properties = {}, }; - physical.GetProperties2KHR(properties); + physical.GetProperties2KHR(device_properties); driver_id = driver.driverID; vendor_name = driver.driverName; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 696eaeb5f..0e8f9c352 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -159,6 +159,7 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .pNext = nullptr, .flags = 0, + .codeSize = 0, }; std::vector<vk::ShaderModule> modules; @@ -388,6 +389,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa .logicOp = VK_LOGIC_OP_COPY, .attachmentCount = static_cast<u32>(num_attachments), .pAttachments = cb_attachments.data(), + .blendConstants = {}, }; std::vector dynamic_states{ diff --git a/src/video_core/shader/async_shaders.cpp b/src/video_core/shader/async_shaders.cpp index aabd62c5c..39cc3b869 100644 --- a/src/video_core/shader/async_shaders.cpp +++ b/src/video_core/shader/async_shaders.cpp @@ -20,14 +20,15 @@ AsyncShaders::~AsyncShaders() { } void AsyncShaders::AllocateWorkers() { - // Max worker threads we should allow - constexpr u32 MAX_THREADS = 4; - // Deduce how many threads we can use - const u32 threads_used = std::thread::hardware_concurrency() / 4; - // Always allow at least 1 thread regardless of our settings - const auto max_worker_count = std::max(1U, threads_used); - // Don't use more than MAX_THREADS - const auto num_workers = std::min(max_worker_count, MAX_THREADS); + // Use at least one thread + u32 num_workers = 1; + + // Deduce how many more threads we can use + const u32 thread_count = std::thread::hardware_concurrency(); + if (thread_count >= 8) { + // Increase async workers by 1 for every 2 threads >= 8 + num_workers += 1 + (thread_count - 8) / 2; + } // If we already have workers queued, ignore if (num_workers == worker_threads.size()) { diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 4db329fa5..afef5948d 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -137,7 +137,8 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::FCMP_RR: - case OpCode::Id::FCMP_RC: { + case OpCode::Id::FCMP_RC: + case OpCode::Id::FCMP_IMMR: { UNIMPLEMENTED_IF(instr.fcmp.ftz == 0); Node op_c = GetRegister(instr.gpr39); Node comp = GetPredicateComparisonFloat(instr.fcmp.cond, std::move(op_c), Immediate(0.0f)); diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index e8515321b..13dd16356 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -240,6 +240,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface( .is_tiled = is_tiled, .srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB || config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB, + .is_layered = false, .block_width = is_tiled ? std::min(config.BlockWidth(), 5U) : 0U, .block_height = is_tiled ? std::min(config.BlockHeight(), 5U) : 0U, .block_depth = is_tiled ? std::min(config.BlockDepth(), 5U) : 0U, |
