Diffstat (limited to 'src/video_core')
-rw-r--r--  src/video_core/CMakeLists.txt | 9
-rw-r--r--  src/video_core/buffer_cache/buffer_block.h | 27
-rw-r--r--  src/video_core/buffer_cache/buffer_cache.h | 232
-rw-r--r--  src/video_core/compatible_formats.cpp | 162
-rw-r--r--  src/video_core/compatible_formats.h | 32
-rw-r--r--  src/video_core/engines/const_buffer_engine_interface.h | 1
-rw-r--r--  src/video_core/engines/kepler_compute.cpp | 5
-rw-r--r--  src/video_core/engines/kepler_compute.h | 2
-rw-r--r--  src/video_core/engines/maxwell_3d.cpp | 7
-rw-r--r--  src/video_core/engines/maxwell_3d.h | 11
-rw-r--r--  src/video_core/engines/shader_bytecode.h | 8
-rw-r--r--  src/video_core/gpu.cpp | 5
-rw-r--r--  src/video_core/gpu.h | 6
-rw-r--r--  src/video_core/gpu_asynch.cpp | 9
-rw-r--r--  src/video_core/gpu_asynch.h | 2
-rw-r--r--  src/video_core/gpu_synch.cpp | 8
-rw-r--r--  src/video_core/gpu_synch.h | 2
-rw-r--r--  src/video_core/gpu_thread.cpp | 7
-rw-r--r--  src/video_core/macro/macro.cpp | 60
-rw-r--r--  src/video_core/macro/macro.h | 19
-rw-r--r--  src/video_core/macro/macro_hle.cpp | 113
-rw-r--r--  src/video_core/macro/macro_hle.h | 44
-rw-r--r--  src/video_core/macro/macro_interpreter.cpp | 3
-rw-r--r--  src/video_core/macro/macro_jit_x64.cpp | 137
-rw-r--r--  src/video_core/macro/macro_jit_x64.h | 10
-rw-r--r--  src/video_core/memory_manager.cpp | 40
-rw-r--r--  src/video_core/memory_manager.h | 12
-rw-r--r--  src/video_core/query_cache.h | 10
-rw-r--r--  src/video_core/rasterizer_cache.cpp | 7
-rw-r--r--  src/video_core/rasterizer_cache.h | 253
-rw-r--r--  src/video_core/renderer_opengl/gl_arb_decompiler.cpp | 2073
-rw-r--r--  src/video_core/renderer_opengl/gl_arb_decompiler.h | 29
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.cpp | 71
-rw-r--r--  src/video_core/renderer_opengl/gl_buffer_cache.h | 49
-rw-r--r--  src/video_core/renderer_opengl/gl_device.cpp | 34
-rw-r--r--  src/video_core/renderer_opengl/gl_device.h | 15
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.cpp | 211
-rw-r--r--  src/video_core/renderer_opengl/gl_rasterizer.h | 21
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_cache.cpp | 99
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_cache.h | 52
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 50
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_disk_cache.cpp | 64
-rw-r--r--  src/video_core/renderer_opengl/gl_shader_disk_cache.h | 1
-rw-r--r--  src/video_core/renderer_opengl/gl_stream_buffer.cpp | 64
-rw-r--r--  src/video_core/renderer_opengl/gl_stream_buffer.h | 25
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.cpp | 53
-rw-r--r--  src/video_core/renderer_opengl/gl_texture_cache.h | 6
-rw-r--r--  src/video_core/renderer_opengl/maxwell_to_gl.h | 114
-rw-r--r--  src/video_core/renderer_opengl/renderer_opengl.cpp | 17
-rw-r--r--  src/video_core/renderer_opengl/renderer_opengl.h | 3
-rw-r--r--  src/video_core/renderer_vulkan/maxwell_to_vk.cpp | 176
-rw-r--r--  src/video_core/renderer_vulkan/renderer_vulkan.cpp | 32
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 97
-rw-r--r--  src/video_core/renderer_vulkan/vk_buffer_cache.h | 42
-rw-r--r--  src/video_core/renderer_vulkan/vk_pipeline_cache.cpp | 72
-rw-r--r--  src/video_core/renderer_vulkan/vk_pipeline_cache.h | 33
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.cpp | 122
-rw-r--r--  src/video_core/renderer_vulkan/vk_rasterizer.h | 7
-rw-r--r--  src/video_core/renderer_vulkan/vk_sampler_cache.cpp | 6
-rw-r--r--  src/video_core/renderer_vulkan/vk_scheduler.cpp | 2
-rw-r--r--  src/video_core/renderer_vulkan/vk_stream_buffer.h | 6
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.cpp | 76
-rw-r--r--  src/video_core/renderer_vulkan/vk_texture_cache.h | 33
-rw-r--r--  src/video_core/renderer_vulkan/vk_update_descriptor.cpp | 36
-rw-r--r--  src/video_core/renderer_vulkan/vk_update_descriptor.h | 32
-rw-r--r--  src/video_core/renderer_vulkan/wrapper.cpp | 19
-rw-r--r--  src/video_core/renderer_vulkan/wrapper.h | 6
-rw-r--r--  src/video_core/shader/decode/half_set.cpp | 88
-rw-r--r--  src/video_core/shader/decode/image.cpp | 26
-rw-r--r--  src/video_core/shader/decode/texture.cpp | 55
-rw-r--r--  src/video_core/shader/memory_util.cpp | 4
-rw-r--r--  src/video_core/shader/node.h | 75
-rw-r--r--  src/video_core/shader/node_helper.h | 2
-rw-r--r--  src/video_core/shader/registry.cpp | 20
-rw-r--r--  src/video_core/shader/registry.h | 35
-rw-r--r--  src/video_core/shader/shader_ir.h | 14
-rw-r--r--  src/video_core/shader/track.cpp | 78
-rw-r--r--  src/video_core/shader_cache.h | 228
-rw-r--r--  src/video_core/texture_cache/surface_base.cpp | 10
-rw-r--r--  src/video_core/texture_cache/surface_base.h | 13
-rw-r--r--  src/video_core/texture_cache/surface_params.cpp | 19
-rw-r--r--  src/video_core/texture_cache/texture_cache.h | 144
82 files changed, 4462 insertions, 1440 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 2bf8d68ce..21c46a567 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -3,6 +3,8 @@ add_library(video_core STATIC
buffer_cache/buffer_cache.h
buffer_cache/map_interval.cpp
buffer_cache/map_interval.h
+ compatible_formats.cpp
+ compatible_formats.h
dirty_flags.cpp
dirty_flags.h
dma_pusher.cpp
@@ -27,6 +29,8 @@ add_library(video_core STATIC
engines/shader_type.h
macro/macro.cpp
macro/macro.h
+ macro/macro_hle.cpp
+ macro/macro_hle.h
macro/macro_interpreter.cpp
macro/macro_interpreter.h
macro/macro_jit_x64.cpp
@@ -49,11 +53,11 @@ add_library(video_core STATIC
query_cache.h
rasterizer_accelerated.cpp
rasterizer_accelerated.h
- rasterizer_cache.cpp
- rasterizer_cache.h
rasterizer_interface.h
renderer_base.cpp
renderer_base.h
+ renderer_opengl/gl_arb_decompiler.cpp
+ renderer_opengl/gl_arb_decompiler.h
renderer_opengl/gl_buffer_cache.cpp
renderer_opengl/gl_buffer_cache.h
renderer_opengl/gl_device.cpp
@@ -93,6 +97,7 @@ add_library(video_core STATIC
renderer_opengl/utils.h
sampler_cache.cpp
sampler_cache.h
+ shader_cache.h
shader/decode/arithmetic.cpp
shader/decode/arithmetic_immediate.cpp
shader/decode/bfe.cpp
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
index e35ee0b67..e64170e66 100644
--- a/src/video_core/buffer_cache/buffer_block.h
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -15,48 +15,47 @@ namespace VideoCommon {
class BufferBlock {
public:
- bool Overlaps(const VAddr start, const VAddr end) const {
+ bool Overlaps(VAddr start, VAddr end) const {
return (cpu_addr < end) && (cpu_addr_end > start);
}
- bool IsInside(const VAddr other_start, const VAddr other_end) const {
+ bool IsInside(VAddr other_start, VAddr other_end) const {
return cpu_addr <= other_start && other_end <= cpu_addr_end;
}
- std::size_t GetOffset(const VAddr in_addr) {
+ std::size_t Offset(VAddr in_addr) const {
return static_cast<std::size_t>(in_addr - cpu_addr);
}
- VAddr GetCpuAddr() const {
+ VAddr CpuAddr() const {
return cpu_addr;
}
- VAddr GetCpuAddrEnd() const {
+ VAddr CpuAddrEnd() const {
return cpu_addr_end;
}
- void SetCpuAddr(const VAddr new_addr) {
+ void SetCpuAddr(VAddr new_addr) {
cpu_addr = new_addr;
cpu_addr_end = new_addr + size;
}
- std::size_t GetSize() const {
+ std::size_t Size() const {
return size;
}
- void SetEpoch(u64 new_epoch) {
- epoch = new_epoch;
+ u64 Epoch() const {
+ return epoch;
}
- u64 GetEpoch() {
- return epoch;
+ void SetEpoch(u64 new_epoch) {
+ epoch = new_epoch;
}
protected:
- explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} {
- SetCpuAddr(cpu_addr);
+ explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
+ SetCpuAddr(cpu_addr_);
}
- ~BufferBlock() = default;
private:
VAddr cpu_addr{};
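For context, a minimal sketch of how a caller sees the renamed BufferBlock accessors; the surrounding function is hypothetical, not part of this change:

    // Hypothetical call site; only the accessor names come from the diff above.
    void UploadWord(VideoCommon::BufferBlock& block, VAddr addr, u64 current_epoch) {
        if (!block.IsInside(addr, addr + sizeof(u32))) {
            return;
        }
        const std::size_t offset = block.Offset(addr); // formerly GetOffset()
        // ... write sizeof(u32) bytes at `offset` in the backend buffer ...
        block.SetEpoch(current_epoch); // paired with Epoch() for deferred destruction
    }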
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index b88fce2cd..cf8bdd021 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -30,23 +30,31 @@
namespace VideoCommon {
-template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
+template <typename Buffer, typename BufferType, typename StreamBuffer>
class BufferCache {
using IntervalSet = boost::icl::interval_set<VAddr>;
using IntervalType = typename IntervalSet::interval_type;
using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
+ static constexpr u64 WRITE_PAGE_BIT = 11;
+ static constexpr u64 BLOCK_PAGE_BITS = 21;
+ static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
+
public:
- using BufferInfo = std::pair<BufferType, u64>;
+ struct BufferInfo {
+ BufferType handle;
+ u64 offset;
+ u64 address;
+ };
BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
bool is_written = false, bool use_fast_cbuf = false) {
std::lock_guard lock{mutex};
- const auto& memory_manager = system.GPU().MemoryManager();
+ auto& memory_manager = system.GPU().MemoryManager();
const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
if (!cpu_addr_opt) {
- return {GetEmptyBuffer(size), 0};
+ return GetEmptyBuffer(size);
}
const VAddr cpu_addr = *cpu_addr_opt;
@@ -55,7 +63,6 @@ public:
constexpr std::size_t max_stream_size = 0x800;
if (use_fast_cbuf || size < max_stream_size) {
if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) {
- auto& memory_manager = system.GPU().MemoryManager();
const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);
if (use_fast_cbuf) {
u8* dest;
@@ -82,10 +89,10 @@ public:
}
}
- OwnerBuffer block = GetBlock(cpu_addr, size);
+ Buffer* const block = GetBlock(cpu_addr, size);
MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
if (!map) {
- return {GetEmptyBuffer(size), 0};
+ return GetEmptyBuffer(size);
}
if (is_written) {
map->MarkAsModified(true, GetModifiedTicks());
@@ -98,7 +105,7 @@ public:
}
}
- return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))};
+ return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()};
}
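UploadMemory now returns the BufferInfo aggregate instead of a std::pair, so callers name the fields they use. A sketch of the updated call site; the bind function is hypothetical:

    // Hypothetical consumer of the new BufferInfo return value.
    const auto info = buffer_cache.UploadMemory(gpu_addr, size);
    BindVertexBuffer(info.handle, info.offset); // previously: auto [handle, offset] = ...
    // info.address carries the buffer's GPU address for backends that need it.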
/// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
@@ -110,31 +117,37 @@ public:
});
}
- void Map(std::size_t max_size) {
+ /// Prepares the buffer cache for data uploading
+ /// @param max_size Maximum number of bytes that will be uploaded
+ /// @return True when a stream buffer invalidation was required, false otherwise
+ bool Map(std::size_t max_size) {
std::lock_guard lock{mutex};
+ bool invalidated;
std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
buffer_offset = buffer_offset_base;
+
+ return invalidated;
}
- /// Finishes the upload stream, returns true on bindings invalidation.
- bool Unmap() {
+ /// Finishes the upload stream
+ void Unmap() {
std::lock_guard lock{mutex};
-
stream_buffer->Unmap(buffer_offset - buffer_offset_base);
- return std::exchange(invalidated, false);
}
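Note the inverted contract: the invalidation flag moves from Unmap() to Map(), so a renderer learns about a stream buffer reset before recording uploads rather than after. A sketch of the adjusted call pattern, with hypothetical names:

    const bool invalidated = buffer_cache.Map(max_upload_size);
    if (invalidated) {
        RebindAllCachedBindings(); // old bindings point into the discarded stream buffer
    }
    // ... UploadMemory()/UploadHostMemory() calls go here ...
    buffer_cache.Unmap();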
+ /// Function called at the end of each frame, intended for deferred operations
void TickFrame() {
++epoch;
+
while (!pending_destruction.empty()) {
// Delay at least 4 frames before destruction.
// This is due to triple buffering happening on some drivers.
static constexpr u64 epochs_to_destroy = 5;
- if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) {
+ if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
break;
}
- pending_destruction.pop_front();
+ pending_destruction.pop();
}
}
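A worked example of the epoch arithmetic above, assuming a block queued at epoch 10 with epochs_to_destroy = 5:

    // TickFrame() increments `epoch` once per frame.
    // epoch 14: 10 + 5 > 14 -> front block survives, loop breaks
    // epoch 15: 10 + 5 > 15 is false -> block is popped and destroyed
    // i.e. a queued block lives for at least 5 frames after QueueDestruction().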
@@ -245,28 +258,16 @@ public:
committed_flushes.pop_front();
}
- virtual BufferType GetEmptyBuffer(std::size_t size) = 0;
+ virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;
protected:
explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
std::unique_ptr<StreamBuffer> stream_buffer)
- : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
- stream_buffer_handle{this->stream_buffer->GetHandle()} {}
+ : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {}
~BufferCache() = default;
- virtual BufferType ToHandle(const OwnerBuffer& storage) = 0;
-
- virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
-
- virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
- const u8* data) = 0;
-
- virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
- u8* data) = 0;
-
- virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset,
- std::size_t dst_offset, std::size_t size) = 0;
+ virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
return {};
@@ -321,7 +322,7 @@ protected:
}
private:
- MapInterval* MapAddress(const OwnerBuffer& block, GPUVAddr gpu_addr, VAddr cpu_addr,
+ MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr,
std::size_t size) {
const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
if (overlaps.empty()) {
@@ -329,11 +330,11 @@ private:
const VAddr cpu_addr_end = cpu_addr + size;
if (memory_manager.IsGranularRange(gpu_addr, size)) {
u8* host_ptr = memory_manager.GetPointer(gpu_addr);
- UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr);
+ block->Upload(block->Offset(cpu_addr), size, host_ptr);
} else {
staging_buffer.resize(size);
memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
- UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data());
+ block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
}
return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
}
@@ -376,7 +377,7 @@ private:
return map;
}
- void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end,
+ void UpdateBlock(const Buffer* block, VAddr start, VAddr end,
const VectorMapInterval& overlaps) {
const IntervalType base_interval{start, end};
IntervalSet interval_set{};
@@ -386,13 +387,13 @@ private:
interval_set.subtract(subtract);
}
for (auto& interval : interval_set) {
- std::size_t size = interval.upper() - interval.lower();
- if (size > 0) {
- staging_buffer.resize(size);
- system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
- UploadBlockData(block, block->GetOffset(interval.lower()), size,
- staging_buffer.data());
+ const std::size_t size = interval.upper() - interval.lower();
+ if (size == 0) {
+ continue;
}
+ staging_buffer.resize(size);
+ system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
+ block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
}
}
@@ -422,10 +423,14 @@ private:
}
void FlushMap(MapInterval* map) {
+ const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
+ ASSERT_OR_EXECUTE(it != blocks.end(), return;);
+
+ std::shared_ptr<Buffer> block = it->second;
+
const std::size_t size = map->end - map->start;
- OwnerBuffer block = blocks[map->start >> block_page_bits];
staging_buffer.resize(size);
- DownloadBlockData(block, block->GetOffset(map->start), size, staging_buffer.data());
+ block->Download(block->Offset(map->start), size, staging_buffer.data());
system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
map->MarkAsModified(false, 0);
}
@@ -438,7 +443,7 @@ private:
buffer_ptr += size;
buffer_offset += size;
- return {stream_buffer_handle, uploaded_offset};
+ return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()};
}
void AlignBuffer(std::size_t alignment) {
@@ -448,97 +453,89 @@ private:
buffer_offset = offset_aligned;
}
- OwnerBuffer EnlargeBlock(OwnerBuffer buffer) {
- const std::size_t old_size = buffer->GetSize();
- const std::size_t new_size = old_size + block_page_size;
- const VAddr cpu_addr = buffer->GetCpuAddr();
- OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size);
- CopyBlock(buffer, new_buffer, 0, 0, old_size);
- buffer->SetEpoch(epoch);
- pending_destruction.push_back(buffer);
+ std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
+ const std::size_t old_size = buffer->Size();
+ const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
+ const VAddr cpu_addr = buffer->CpuAddr();
+ std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
+ new_buffer->CopyFrom(*buffer, 0, 0, old_size);
+ QueueDestruction(std::move(buffer));
+
const VAddr cpu_addr_end = cpu_addr + new_size - 1;
- u64 page_start = cpu_addr >> block_page_bits;
- const u64 page_end = cpu_addr_end >> block_page_bits;
- while (page_start <= page_end) {
- blocks[page_start] = new_buffer;
- ++page_start;
+ const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+ for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+ blocks.insert_or_assign(page_start, new_buffer);
}
+
return new_buffer;
}
- OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) {
- const std::size_t size_1 = first->GetSize();
- const std::size_t size_2 = second->GetSize();
- const VAddr first_addr = first->GetCpuAddr();
- const VAddr second_addr = second->GetCpuAddr();
+ std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
+ std::shared_ptr<Buffer> second) {
+ const std::size_t size_1 = first->Size();
+ const std::size_t size_2 = second->Size();
+ const VAddr first_addr = first->CpuAddr();
+ const VAddr second_addr = second->CpuAddr();
const VAddr new_addr = std::min(first_addr, second_addr);
const std::size_t new_size = size_1 + size_2;
- OwnerBuffer new_buffer = CreateBlock(new_addr, new_size);
- CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
- CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
- first->SetEpoch(epoch);
- second->SetEpoch(epoch);
- pending_destruction.push_back(first);
- pending_destruction.push_back(second);
+
+ std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
+ new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
+ new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
+ QueueDestruction(std::move(first));
+ QueueDestruction(std::move(second));
+
const VAddr cpu_addr_end = new_addr + new_size - 1;
- u64 page_start = new_addr >> block_page_bits;
- const u64 page_end = cpu_addr_end >> block_page_bits;
- while (page_start <= page_end) {
- blocks[page_start] = new_buffer;
- ++page_start;
+ const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+ for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+ blocks.insert_or_assign(page_start, new_buffer);
}
return new_buffer;
}
- OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) {
- OwnerBuffer found;
+ Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
+ std::shared_ptr<Buffer> found;
+
const VAddr cpu_addr_end = cpu_addr + size - 1;
- u64 page_start = cpu_addr >> block_page_bits;
- const u64 page_end = cpu_addr_end >> block_page_bits;
- while (page_start <= page_end) {
+ const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+ for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
auto it = blocks.find(page_start);
if (it == blocks.end()) {
if (found) {
found = EnlargeBlock(found);
- } else {
- const VAddr start_addr = (page_start << block_page_bits);
- found = CreateBlock(start_addr, block_page_size);
- blocks[page_start] = found;
- }
- } else {
- if (found) {
- if (found == it->second) {
- ++page_start;
- continue;
- }
- found = MergeBlocks(found, it->second);
- } else {
- found = it->second;
+ continue;
}
+ const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
+ found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
+ blocks.insert_or_assign(page_start, found);
+ continue;
+ }
+ if (!found) {
+ found = it->second;
+ continue;
+ }
+ if (found != it->second) {
+ found = MergeBlocks(std::move(found), it->second);
}
- ++page_start;
}
- return found;
+ return found.get();
}
- void MarkRegionAsWritten(const VAddr start, const VAddr end) {
- u64 page_start = start >> write_page_bit;
- const u64 page_end = end >> write_page_bit;
- while (page_start <= page_end) {
+ void MarkRegionAsWritten(VAddr start, VAddr end) {
+ const u64 page_end = end >> WRITE_PAGE_BIT;
+ for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
auto it = written_pages.find(page_start);
if (it != written_pages.end()) {
it->second = it->second + 1;
} else {
- written_pages[page_start] = 1;
+ written_pages.insert_or_assign(page_start, 1);
}
- ++page_start;
}
}
- void UnmarkRegionAsWritten(const VAddr start, const VAddr end) {
- u64 page_start = start >> write_page_bit;
- const u64 page_end = end >> write_page_bit;
- while (page_start <= page_end) {
+ void UnmarkRegionAsWritten(VAddr start, VAddr end) {
+ const u64 page_end = end >> WRITE_PAGE_BIT;
+ for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
auto it = written_pages.find(page_start);
if (it != written_pages.end()) {
if (it->second > 1) {
@@ -547,22 +544,24 @@ private:
written_pages.erase(it);
}
}
- ++page_start;
}
}
- bool IsRegionWritten(const VAddr start, const VAddr end) const {
- u64 page_start = start >> write_page_bit;
- const u64 page_end = end >> write_page_bit;
- while (page_start <= page_end) {
+ bool IsRegionWritten(VAddr start, VAddr end) const {
+ const u64 page_end = end >> WRITE_PAGE_BIT;
+ for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
if (written_pages.count(page_start) > 0) {
return true;
}
- ++page_start;
}
return false;
}
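These helpers reference-count written pages at WRITE_PAGE_BIT granularity (1 << 11 = 2 KiB). A short worked example of the expected mark/unmark pairing:

    MarkRegionAsWritten(0x1000, 0x2FFF);   // pages 2-5 -> count 1
    MarkRegionAsWritten(0x1800, 0x1FFF);   // page 3    -> count 2
    UnmarkRegionAsWritten(0x1000, 0x2FFF); // pages 2, 4, 5 erased; page 3 -> count 1
    IsRegionWritten(0x1800, 0x1FFF);       // true: page 3 is still counted
    IsRegionWritten(0x1000, 0x17FF);       // false: page 2 was erased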
+ void QueueDestruction(std::shared_ptr<Buffer> buffer) {
+ buffer->SetEpoch(epoch);
+ pending_destruction.push(std::move(buffer));
+ }
+
void MarkForAsyncFlush(MapInterval* map) {
if (!uncommitted_flushes) {
uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
@@ -574,9 +573,7 @@ private:
Core::System& system;
std::unique_ptr<StreamBuffer> stream_buffer;
- BufferType stream_buffer_handle{};
-
- bool invalidated = false;
+ BufferType stream_buffer_handle;
u8* buffer_ptr = nullptr;
u64 buffer_offset = 0;
@@ -586,18 +583,15 @@ private:
boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
mapped_addresses;
- static constexpr u64 write_page_bit = 11;
std::unordered_map<u64, u32> written_pages;
+ std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;
- static constexpr u64 block_page_bits = 21;
- static constexpr u64 block_page_size = 1ULL << block_page_bits;
- std::unordered_map<u64, OwnerBuffer> blocks;
-
- std::list<OwnerBuffer> pending_destruction;
+ std::queue<std::shared_ptr<Buffer>> pending_destruction;
u64 epoch = 0;
u64 modified_ticks = 0;
std::vector<u8> staging_buffer;
+
std::list<MapInterval*> marked_for_unregister;
std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp
new file mode 100644
index 000000000..6c426b035
--- /dev/null
+++ b/src/video_core/compatible_formats.cpp
@@ -0,0 +1,162 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+#include "video_core/compatible_formats.h"
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+namespace {
+
+// Compatibility table taken from Table 3.X.2 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt
+
+constexpr std::array VIEW_CLASS_128_BITS = {
+ PixelFormat::RGBA32F,
+ PixelFormat::RGBA32UI,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+
+constexpr std::array VIEW_CLASS_96_BITS = {
+ PixelFormat::RGB32F,
+};
+// Missing formats:
+// PixelFormat::RGB32UI,
+// PixelFormat::RGB32I,
+
+constexpr std::array VIEW_CLASS_64_BITS = {
+ PixelFormat::RGBA16F, PixelFormat::RG32F, PixelFormat::RGBA16UI, PixelFormat::RG32UI,
+ PixelFormat::RGBA16U, PixelFormat::RGBA16S,
+};
+// Missing formats:
+// PixelFormat::RGBA16I
+// PixelFormat::RG32I
+
+// TODO: How should we handle 48 bits?
+
+constexpr std::array VIEW_CLASS_32_BITS = {
+ PixelFormat::RG16F, PixelFormat::R11FG11FB10F, PixelFormat::R32F,
+ PixelFormat::A2B10G10R10U, PixelFormat::RG16UI, PixelFormat::R32UI,
+ PixelFormat::RG16I, PixelFormat::R32I, PixelFormat::ABGR8U,
+ PixelFormat::RG16, PixelFormat::ABGR8S, PixelFormat::RG16S,
+ PixelFormat::RGBA8_SRGB, PixelFormat::E5B9G9R9F, PixelFormat::BGRA8,
+ PixelFormat::BGRA8_SRGB,
+};
+// Missing formats:
+// PixelFormat::RGBA8UI
+// PixelFormat::RGBA8I
+// PixelFormat::RGB10_A2_UI
+
+// TODO: How should we handle 24 bits?
+
+constexpr std::array VIEW_CLASS_16_BITS = {
+ PixelFormat::R16F, PixelFormat::RG8UI, PixelFormat::R16UI, PixelFormat::R16I,
+ PixelFormat::RG8U, PixelFormat::R16U, PixelFormat::RG8S, PixelFormat::R16S,
+};
+// Missing formats:
+// PixelFormat::RG8I
+
+constexpr std::array VIEW_CLASS_8_BITS = {
+ PixelFormat::R8UI,
+ PixelFormat::R8U,
+};
+// Missing formats:
+// PixelFormat::R8I
+// PixelFormat::R8S
+
+constexpr std::array VIEW_CLASS_RGTC1_RED = {
+ PixelFormat::DXN1,
+};
+// Missing formats:
+// COMPRESSED_SIGNED_RED_RGTC1
+
+constexpr std::array VIEW_CLASS_RGTC2_RG = {
+ PixelFormat::DXN2UNORM,
+ PixelFormat::DXN2SNORM,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_UNORM = {
+ PixelFormat::BC7U,
+ PixelFormat::BC7U_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_BPTC_FLOAT = {
+ PixelFormat::BC6H_SF16,
+ PixelFormat::BC6H_UF16,
+};
+
+// Compatibility table taken from Table 4.X.1 in:
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt
+
+constexpr std::array COPY_CLASS_128_BITS = {
+ PixelFormat::RGBA32UI, PixelFormat::RGBA32F, PixelFormat::DXT23,
+ PixelFormat::DXT23_SRGB, PixelFormat::DXT45, PixelFormat::DXT45_SRGB,
+ PixelFormat::DXN2SNORM, PixelFormat::BC7U, PixelFormat::BC7U_SRGB,
+ PixelFormat::BC6H_SF16, PixelFormat::BC6H_UF16,
+};
+// Missing formats:
+// PixelFormat::RGBA32I
+// COMPRESSED_RG_RGTC2
+
+constexpr std::array COPY_CLASS_64_BITS = {
+ PixelFormat::RGBA16F, PixelFormat::RG32F, PixelFormat::RGBA16UI, PixelFormat::RG32UI,
+ PixelFormat::RGBA16U, PixelFormat::RGBA16S, PixelFormat::DXT1_SRGB, PixelFormat::DXT1,
+
+};
+// Missing formats:
+// PixelFormat::RGBA16I
+// PixelFormat::RG32I,
+// COMPRESSED_RGB_S3TC_DXT1_EXT
+// COMPRESSED_SRGB_S3TC_DXT1_EXT
+// COMPRESSED_RGBA_S3TC_DXT1_EXT
+// COMPRESSED_SIGNED_RED_RGTC1
+
+void Enable(FormatCompatibility::Table& compatibility, size_t format_a, size_t format_b) {
+ compatibility[format_a][format_b] = true;
+ compatibility[format_b][format_a] = true;
+}
+
+void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) {
+ Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
+}
+
+template <typename Range>
+void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) {
+ for (auto it_a = range.begin(); it_a != range.end(); ++it_a) {
+ for (auto it_b = it_a; it_b != range.end(); ++it_b) {
+ Enable(compatibility, *it_a, *it_b);
+ }
+ }
+}
+
+} // Anonymous namespace
+
+FormatCompatibility::FormatCompatibility() {
+ for (size_t i = 0; i < MaxPixelFormat; ++i) {
+ // Identity is allowed
+ Enable(view, i, i);
+ }
+
+ EnableRange(view, VIEW_CLASS_128_BITS);
+ EnableRange(view, VIEW_CLASS_96_BITS);
+ EnableRange(view, VIEW_CLASS_64_BITS);
+ EnableRange(view, VIEW_CLASS_32_BITS);
+ EnableRange(view, VIEW_CLASS_16_BITS);
+ EnableRange(view, VIEW_CLASS_8_BITS);
+ EnableRange(view, VIEW_CLASS_RGTC1_RED);
+ EnableRange(view, VIEW_CLASS_RGTC2_RG);
+ EnableRange(view, VIEW_CLASS_BPTC_UNORM);
+ EnableRange(view, VIEW_CLASS_BPTC_FLOAT);
+
+ copy = view;
+ EnableRange(copy, COPY_CLASS_128_BITS);
+ EnableRange(copy, COPY_CLASS_64_BITS);
+}
+
+} // namespace VideoCore::Surface
diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h
new file mode 100644
index 000000000..d1082566d
--- /dev/null
+++ b/src/video_core/compatible_formats.h
@@ -0,0 +1,34 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <cstddef>
+
+#include "video_core/surface.h"
+
+namespace VideoCore::Surface {
+
+class FormatCompatibility {
+public:
+ using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>;
+
+ explicit FormatCompatibility();
+
+ bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept {
+ return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
+ }
+
+ bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept {
+ return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
+ }
+
+private:
+ Table view;
+ Table copy;
+};
+
+} // namespace VideoCore::Surface
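A sketch of querying the table, using two formats from the 128-bit view class defined in compatible_formats.cpp above:

    // Both formats sit in VIEW_CLASS_128_BITS, so a view alias is legal.
    const VideoCore::Surface::FormatCompatibility compatibility;
    const bool can_view =
        compatibility.TestView(PixelFormat::RGBA32F, PixelFormat::RGBA32UI); // true
    // `copy` starts as a copy of `view`, so TestCopy() also returns true here.
    const bool can_copy =
        compatibility.TestCopy(PixelFormat::RGBA32F, PixelFormat::RGBA32UI); // true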
diff --git a/src/video_core/engines/const_buffer_engine_interface.h b/src/video_core/engines/const_buffer_engine_interface.h
index ebe139504..f46e81bb7 100644
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -93,6 +93,7 @@ public:
virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;
virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
u64 offset) const = 0;
+ virtual SamplerDescriptor AccessSampler(u32 handle) const = 0;
virtual u32 GetBoundBuffer() const = 0;
virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0;
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index f6237fc6a..a82b06a38 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con
ASSERT(stage == ShaderType::Compute);
const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer];
const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset;
+ return AccessSampler(memory_manager.Read<u32>(tex_info_address));
+}
- const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
+ const Texture::TextureHandle tex_handle{handle};
const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 18ceedfaf..b7f668d88 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -219,6 +219,8 @@ public:
SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
u64 offset) const override;
+ SamplerDescriptor AccessSampler(u32 handle) const override;
+
u32 GetBoundBuffer() const override {
return regs.tex_cb_index;
}
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index e46b153f9..c01436295 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -128,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters)
((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
// Execute the current macro.
- macro_engine->Execute(macro_positions[entry], parameters);
+ macro_engine->Execute(*this, macro_positions[entry], parameters);
if (mme_draw.current_mode != MMEDrawMode::Undefined) {
FlushMMEInlineDraw();
}
@@ -740,8 +740,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b
const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
const auto& tex_info_buffer = shader.const_buffers[const_buffer];
const GPUVAddr tex_info_address = tex_info_buffer.address + offset;
+ return AccessSampler(memory_manager.Read<u32>(tex_info_address));
+}
- const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
+ const Texture::TextureHandle tex_handle{handle};
const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index b827b112f..ef1618990 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -598,6 +598,7 @@ public:
BitField<4, 3, u32> block_height;
BitField<8, 3, u32> block_depth;
BitField<12, 1, InvMemoryLayout> type;
+ BitField<16, 1, u32> is_3d;
} memory_layout;
union {
BitField<0, 16, u32> layers;
@@ -1403,6 +1404,8 @@ public:
SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
u64 offset) const override;
+ SamplerDescriptor AccessSampler(u32 handle) const override;
+
u32 GetBoundBuffer() const override {
return regs.tex_cb_index;
}
@@ -1415,6 +1418,14 @@ public:
return execute_on;
}
+ VideoCore::RasterizerInterface& GetRasterizer() {
+ return rasterizer;
+ }
+
+ const VideoCore::RasterizerInterface& GetRasterizer() const {
+ return rasterizer;
+ }
+
/// Notify a memory write has happened.
void OnMemoryWrite() {
dirty.flags |= dirty.on_write_stores;
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index e7cb87589..d374b73cf 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -661,6 +661,10 @@ union Instruction {
constexpr Instruction(u64 value) : value{value} {}
constexpr Instruction(const Instruction& instr) : value(instr.value) {}
+ constexpr bool Bit(u64 offset) const {
+ return ((value >> offset) & 1) != 0;
+ }
+
BitField<0, 8, Register> gpr0;
BitField<8, 8, Register> gpr8;
union {
@@ -1874,7 +1878,9 @@ public:
HSETP2_C,
HSETP2_R,
HSETP2_IMM,
+ HSET2_C,
HSET2_R,
+ HSET2_IMM,
POPC_C,
POPC_R,
POPC_IMM,
@@ -2194,7 +2200,9 @@ private:
INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
+ INST("0111110-1-------", Id::HSET2_C, Type::HalfSet, "HSET2_C"),
INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
+ INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"),
INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"),
INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"),
INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 8eb017f65..482e49711 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -2,6 +2,8 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include <chrono>
+
#include "common/assert.h"
#include "common/microprofile.h"
#include "core/core.h"
@@ -154,8 +156,7 @@ u64 GPU::GetTicks() const {
constexpr u64 gpu_ticks_num = 384;
constexpr u64 gpu_ticks_den = 625;
- const u64 cpu_ticks = system.CoreTiming().GetTicks();
- u64 nanoseconds = Core::Timing::CyclesToNs(cpu_ticks).count();
+ u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count();
if (Settings::values.use_fast_gpu_time) {
nanoseconds /= 256;
}
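For reference, the surrounding GetTicks() scales host nanoseconds by 384/625, i.e. a 614.4 MHz GPU clock:

    // ticks = nanoseconds * 384 / 625
    // e.g. 1'000'000 ns (1 ms) -> 1'000'000 * 384 / 625 = 614'400 ticks
    // With use_fast_gpu_time, nanoseconds is divided by 256 first, which
    // makes GPU-time-based waits in games elapse much faster.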
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index a1b4c305c..2c42483bd 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -284,6 +284,12 @@ public:
/// core timing events.
virtual void Start() = 0;
+ /// Obtain the CPU Context
+ virtual void ObtainContext() = 0;
+
+ /// Release the CPU Context
+ virtual void ReleaseContext() = 0;
+
/// Push GPU command entries to be processed
virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
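The context calls are split out of Start() so the frontend chooses which host thread owns the graphics context. A sketch of the expected ordering on the emulation thread, inferred from the gpu_asynch/gpu_synch implementations below:

    gpu.ObtainContext();  // MakeCurrent() on this thread
    gpu.Start();          // spawn (async) or prepare (sync) the command processor
    // ... PushGPUEntries() while emulating ...
    gpu.ReleaseContext(); // DoneCurrent() before another thread takes the context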
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index 53305ab43..7b855f63e 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -19,10 +19,17 @@ GPUAsynch::GPUAsynch(Core::System& system, std::unique_ptr<VideoCore::RendererBa
GPUAsynch::~GPUAsynch() = default;
void GPUAsynch::Start() {
- cpu_context->MakeCurrent();
gpu_thread.StartThread(*renderer, *gpu_context, *dma_pusher);
}
+void GPUAsynch::ObtainContext() {
+ cpu_context->MakeCurrent();
+}
+
+void GPUAsynch::ReleaseContext() {
+ cpu_context->DoneCurrent();
+}
+
void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
gpu_thread.SubmitList(std::move(entries));
}
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 517658612..15e9f1d38 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -25,6 +25,8 @@ public:
~GPUAsynch() override;
void Start() override;
+ void ObtainContext() override;
+ void ReleaseContext() override;
void PushGPUEntries(Tegra::CommandList&& entries) override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
void FlushRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 6f38a672a..aaeb9811d 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -13,10 +13,16 @@ GPUSynch::GPUSynch(Core::System& system, std::unique_ptr<VideoCore::RendererBase
GPUSynch::~GPUSynch() = default;
-void GPUSynch::Start() {
+void GPUSynch::Start() {}
+
+void GPUSynch::ObtainContext() {
context->MakeCurrent();
}
+void GPUSynch::ReleaseContext() {
+ context->DoneCurrent();
+}
+
void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
dma_pusher->Push(std::move(entries));
dma_pusher->DispatchCalls();
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 4a6e9a01d..762c20aa5 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -24,6 +24,8 @@ public:
~GPUSynch() override;
void Start() override;
+ void ObtainContext() override;
+ void ReleaseContext() override;
void PushGPUEntries(Tegra::CommandList&& entries) override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
void FlushRegion(VAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index c3bb4fe06..738c6f0c1 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -4,6 +4,7 @@
#include "common/assert.h"
#include "common/microprofile.h"
+#include "common/thread.h"
#include "core/core.h"
#include "core/frontend/emu_window.h"
#include "core/settings.h"
@@ -18,7 +19,11 @@ namespace VideoCommon::GPUThread {
static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher,
SynchState& state) {
- MicroProfileOnThreadCreate("GpuThread");
+ std::string name = "yuzu:GPU";
+ MicroProfileOnThreadCreate(name.c_str());
+ Common::SetCurrentThreadName(name.c_str());
+ Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
+ system.RegisterHostThread();
// Wait for first GPU command before acquiring the window context
while (state.queue.Empty())
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
index 89077a2d8..a50e7b4e0 100644
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -2,32 +2,78 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
+#include <optional>
+#include <boost/container_hash/hash.hpp>
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/settings.h"
+#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro.h"
+#include "video_core/macro/macro_hle.h"
#include "video_core/macro/macro_interpreter.h"
#include "video_core/macro/macro_jit_x64.h"
namespace Tegra {
+MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
+ : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {}
+
+MacroEngine::~MacroEngine() = default;
+
void MacroEngine::AddCode(u32 method, u32 data) {
uploaded_macro_code[method].push_back(data);
}
-void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
+void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
+ const std::vector<u32>& parameters) {
auto compiled_macro = macro_cache.find(method);
if (compiled_macro != macro_cache.end()) {
- compiled_macro->second->Execute(parameters, method);
+ const auto& cache_info = compiled_macro->second;
+ if (cache_info.has_hle_program) {
+ cache_info.hle_program->Execute(parameters, method);
+ } else {
+ cache_info.lle_program->Execute(parameters, method);
+ }
} else {
// Macro not compiled, check if it's uploaded and if so, compile it
- auto macro_code = uploaded_macro_code.find(method);
+ std::optional<u32> mid_method = std::nullopt;
+ const auto macro_code = uploaded_macro_code.find(method);
if (macro_code == uploaded_macro_code.end()) {
- UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
- return;
+ for (const auto& [method_base, code] : uploaded_macro_code) {
+ if (method >= method_base && (method - method_base) < code.size()) {
+ mid_method = method_base;
+ break;
+ }
+ }
+ if (!mid_method.has_value()) {
+ UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+ return;
+ }
+ }
+ auto& cache_info = macro_cache[method];
+
+ if (!mid_method.has_value()) {
+ cache_info.lle_program = Compile(macro_code->second);
+ cache_info.hash = boost::hash_value(macro_code->second);
+ } else {
+ const auto& macro_cached = uploaded_macro_code[mid_method.value()];
+ const auto rebased_method = method - mid_method.value();
+ auto& code = uploaded_macro_code[method];
+ code.resize(macro_cached.size() - rebased_method);
+ std::memcpy(code.data(), macro_cached.data() + rebased_method,
+ code.size() * sizeof(u32));
+ cache_info.hash = boost::hash_value(code);
+ cache_info.lle_program = Compile(code);
+ }
+
+ auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
+ if (hle_program.has_value()) {
+ cache_info.has_hle_program = true;
+ cache_info.hle_program = std::move(hle_program.value());
+ cache_info.hle_program->Execute(parameters, method);
+ } else {
+ cache_info.lle_program->Execute(parameters, method);
}
- macro_cache[method] = Compile(macro_code->second);
- macro_cache[method]->Execute(parameters, method);
}
}
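In outline, the rewritten Execute() resolves a macro in four steps; a condensed sketch of the control flow above, not verbatim code:

    // 1. Cache hit: run cache_info.hle_program if tagged, else lle_program.
    // 2. Cache miss: look up an exact upload; failing that, scan for a base
    //    method whose code range contains `method` (a mid-method entry point).
    // 3. Compile the LLE program and hash its code words.
    // 4. Ask HLEMacro for a replacement by hash; execute the HLE program when
    //    found, otherwise the freshly compiled LLE program.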
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
index b76ed891f..4d00b84b0 100644
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro/macro.h
@@ -11,9 +11,11 @@
#include "common/common_types.h"
namespace Tegra {
+
namespace Engines {
class Maxwell3D;
}
+
namespace Macro {
constexpr std::size_t NUM_MACRO_REGISTERS = 8;
enum class Operation : u32 {
@@ -94,6 +96,8 @@ union MethodAddress {
} // namespace Macro
+class HLEMacro;
+
class CachedMacro {
public:
virtual ~CachedMacro() = default;
@@ -107,20 +111,29 @@ public:
class MacroEngine {
public:
- virtual ~MacroEngine() = default;
+ explicit MacroEngine(Engines::Maxwell3D& maxwell3d);
+ virtual ~MacroEngine();
// Store the uploaded macro code to compile them when they're called.
void AddCode(u32 method, u32 data);
// Compiles the macro if its not in the cache, and executes the compiled macro
- void Execute(u32 method, const std::vector<u32>& parameters);
+ void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);
protected:
virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
private:
- std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache;
+ struct CacheInfo {
+ std::unique_ptr<CachedMacro> lle_program{};
+ std::unique_ptr<CachedMacro> hle_program{};
+ u64 hash{};
+ bool has_hle_program{};
+ };
+
+ std::unordered_map<u32, CacheInfo> macro_cache;
std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
+ std::unique_ptr<HLEMacro> hle_macros;
};
std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
new file mode 100644
index 000000000..410f99018
--- /dev/null
+++ b/src/video_core/macro/macro_hle.cpp
@@ -0,0 +1,113 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <vector>
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_hle.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Tegra {
+
+namespace {
+// HLE'd functions
+static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d,
+ const std::vector<u32>& parameters) {
+ const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
+
+ maxwell3d.regs.draw.topology.Assign(
+ static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] &
+ ~(0x3ffffff << 26)));
+ maxwell3d.regs.vb_base_instance = parameters[5];
+ maxwell3d.mme_draw.instance_count = instance_count;
+ maxwell3d.regs.vb_element_base = parameters[3];
+ maxwell3d.regs.index_array.count = parameters[1];
+ maxwell3d.regs.index_array.first = parameters[4];
+
+ if (maxwell3d.ShouldExecute()) {
+ maxwell3d.GetRasterizer().Draw(true, true);
+ }
+ maxwell3d.regs.index_array.count = 0;
+ maxwell3d.mme_draw.instance_count = 0;
+ maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d,
+ const std::vector<u32>& parameters) {
+ const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+
+ maxwell3d.regs.vertex_buffer.first = parameters[3];
+ maxwell3d.regs.vertex_buffer.count = parameters[1];
+ maxwell3d.regs.vb_base_instance = parameters[4];
+ maxwell3d.regs.draw.topology.Assign(
+ static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+ maxwell3d.mme_draw.instance_count = count;
+
+ if (maxwell3d.ShouldExecute()) {
+ maxwell3d.GetRasterizer().Draw(false, true);
+ }
+ maxwell3d.regs.vertex_buffer.count = 0;
+ maxwell3d.mme_draw.instance_count = 0;
+ maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
+ const std::vector<u32>& parameters) {
+ const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+ const u32 element_base = parameters[4];
+ const u32 base_instance = parameters[5];
+ maxwell3d.regs.index_array.first = parameters[3];
+ maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base?
+ maxwell3d.regs.index_array.count = parameters[1];
+ maxwell3d.regs.vb_element_base = element_base;
+ maxwell3d.regs.vb_base_instance = base_instance;
+ maxwell3d.mme_draw.instance_count = instance_count;
+ maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+ maxwell3d.CallMethodFromMME(0x8e4, element_base);
+ maxwell3d.CallMethodFromMME(0x8e5, base_instance);
+ maxwell3d.regs.draw.topology.Assign(
+ static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+ if (maxwell3d.ShouldExecute()) {
+ maxwell3d.GetRasterizer().Draw(true, true);
+ }
+ maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base?
+ maxwell3d.regs.index_array.count = 0;
+ maxwell3d.regs.vb_element_base = 0x0;
+ maxwell3d.regs.vb_base_instance = 0x0;
+ maxwell3d.mme_draw.instance_count = 0;
+ maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+ maxwell3d.CallMethodFromMME(0x8e4, 0x0);
+ maxwell3d.CallMethodFromMME(0x8e5, 0x0);
+ maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+} // namespace
+
+constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
+ std::make_pair<u64, HLEFunction>(0x771BB18C62444DA0, &HLE_771BB18C62444DA0),
+ std::make_pair<u64, HLEFunction>(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD),
+ std::make_pair<u64, HLEFunction>(0x0217920100488FF7, &HLE_0217920100488FF7),
+}};
+
+HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+HLEMacro::~HLEMacro() = default;
+
+std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
+ const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(),
+ [hash](const auto& pair) { return pair.first == hash; });
+ if (it == hle_funcs.end()) {
+ return std::nullopt;
+ }
+ return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
+}
+
+HLEMacroImpl::~HLEMacroImpl() = default;
+
+HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func)
+ : maxwell3d(maxwell3d), func(func) {}
+
+void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) {
+ func(maxwell3d, parameters);
+}
+
+} // namespace Tegra
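A sketch of how the engine consumes this table (mirroring MacroEngine::Execute in macro.cpp above): hash the uploaded code words, then swap in the native handler on a match.

    const u64 hash = boost::hash_value(code); // code is the macro's std::vector<u32> body
    if (auto hle = hle_macros->GetHLEProgram(hash)) {
        (*hle)->Execute(parameters, method);  // HLE replacement found
    }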
diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h
new file mode 100644
index 000000000..37af875a0
--- /dev/null
+++ b/src/video_core/macro/macro_hle.h
@@ -0,0 +1,44 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <vector>
+#include "common/common_types.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
+
+class HLEMacro {
+public:
+ explicit HLEMacro(Engines::Maxwell3D& maxwell3d);
+ ~HLEMacro();
+
+ std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
+
+private:
+ Engines::Maxwell3D& maxwell3d;
+};
+
+class HLEMacroImpl : public CachedMacro {
+public:
+ explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func);
+ ~HLEMacroImpl();
+
+ void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+private:
+ Engines::Maxwell3D& maxwell3d;
+ HLEFunction func;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
index 5edff27aa..aa5256419 100644
--- a/src/video_core/macro/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -11,7 +11,8 @@
MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
namespace Tegra {
-MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d)
+ : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
index 11c1cc3be..07292702f 100644
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -14,27 +14,22 @@ MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255
MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
namespace Tegra {
-static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9;
-static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10;
-static const Xbyak::Reg64 STATE = Xbyak::util::r11;
-static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12;
-static const Xbyak::Reg32 RESULT = Xbyak::util::r13d;
-static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13;
+static const Xbyak::Reg64 STATE = Xbyak::util::rbx;
+static const Xbyak::Reg32 RESULT = Xbyak::util::ebp;
+static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12;
static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
-static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14;
static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
- PARAMETERS,
- REGISTERS,
STATE,
- NEXT_PARAMETER,
RESULT,
+ PARAMETERS,
METHOD_ADDRESS,
BRANCH_HOLDER,
});
-MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d)
+ : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
@@ -53,32 +48,32 @@ void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
JITState state{};
state.maxwell3d = &maxwell3d;
state.registers = {};
- state.parameters = parameters.data();
- program(&state);
+ program(&state, parameters.data());
}
void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
const bool is_a_zero = opcode.src_a == 0;
const bool is_b_zero = opcode.src_b == 0;
const bool valid_operation = !is_a_zero && !is_b_zero;
- const bool is_move_operation = !is_a_zero && is_b_zero;
+ [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero;
const bool has_zero_register = is_a_zero || is_b_zero;
+ const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry ||
+ opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow;
- Xbyak::Reg64 src_a;
+ Xbyak::Reg32 src_a;
Xbyak::Reg32 src_b;
- if (!optimizer.zero_reg_skip) {
- src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
- src_b = Compile_GetRegister(opcode.src_b, ebx);
+ if (!optimizer.zero_reg_skip || no_zero_reg_skip) {
+ src_a = Compile_GetRegister(opcode.src_a, RESULT);
+ src_b = Compile_GetRegister(opcode.src_b, eax);
} else {
if (!is_a_zero) {
- src_a = Compile_GetRegister(opcode.src_a, RESULT_64);
+ src_a = Compile_GetRegister(opcode.src_a, RESULT);
}
if (!is_b_zero) {
- src_b = Compile_GetRegister(opcode.src_b, ebx);
+ src_b = Compile_GetRegister(opcode.src_b, eax);
}
}
- Xbyak::Label skip_carry{};
bool has_emitted = false;
@@ -190,7 +185,8 @@ void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
if (next_opcode.has_value()) {
const auto next = *next_opcode;
- if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+ if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod &&
+ opcode.dst == next.dst) {
return;
}
}
@@ -244,10 +240,10 @@ void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
}
void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
- auto dst = Compile_GetRegister(opcode.src_a, eax);
- auto src = Compile_GetRegister(opcode.src_b, RESULT);
+ const auto dst = Compile_GetRegister(opcode.src_a, ecx);
+ const auto src = Compile_GetRegister(opcode.src_b, RESULT);
- shr(src, al);
+ shr(src, dst.cvt8());
if (opcode.bf_size != 0 && opcode.bf_size != 31) {
and_(src, opcode.GetBitfieldMask());
} else if (opcode.bf_size == 0) {
@@ -263,8 +259,8 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
}
void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
- auto dst = Compile_GetRegister(opcode.src_a, eax);
- auto src = Compile_GetRegister(opcode.src_b, RESULT);
+ const auto dst = Compile_GetRegister(opcode.src_a, ecx);
+ const auto src = Compile_GetRegister(opcode.src_b, RESULT);
if (opcode.bf_src_bit != 0) {
shr(src, opcode.bf_src_bit);
@@ -273,16 +269,9 @@ void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
if (opcode.bf_size != 31) {
and_(src, opcode.GetBitfieldMask());
}
- shl(src, al);
- Compile_ProcessResult(opcode.result_operation, opcode.dst);
-}
+ shl(src, dst.cvt8());
-static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) {
- return maxwell3d->GetRegisterValue(method);
-}
-
-static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
- maxwell3d->CallMethodFromMME(method_address.address, value);
+ Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
@@ -302,22 +291,34 @@ void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
sub(result, opcode.immediate * -1);
}
}
- Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
- mov(Common::X64::ABI_PARAM1, qword[STATE]);
- mov(Common::X64::ABI_PARAM2, RESULT);
- Common::X64::CallFarFunction(*this, &Read);
- Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
- mov(RESULT, Common::X64::ABI_RETURN.cvt32());
+
+ // Equivalent to Engines::Maxwell3D::GetRegisterValue:
+ if (optimizer.enable_asserts) {
+ Xbyak::Label pass_range_check;
+ cmp(RESULT, static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS));
+ jb(pass_range_check);
+ int3();
+ L(pass_range_check);
+ }
+ mov(rax, qword[STATE]);
+ mov(RESULT,
+ dword[rax + offsetof(Engines::Maxwell3D, regs) +
+ offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]);
+
Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
+static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
+ maxwell3d->CallMethodFromMME(method_address.address, value);
+}
+
void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
- Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+ Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
mov(Common::X64::ABI_PARAM1, qword[STATE]);
mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
mov(Common::X64::ABI_PARAM3, value);
Common::X64::CallFarFunction(*this, &Send);
- Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0);
+ Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
Xbyak::Label dont_process{};
// Get increment
@@ -329,7 +330,7 @@ void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
and_(METHOD_ADDRESS, 0xfff);
shr(ecx, 12);
and_(ecx, 0x3f);
- lea(eax, ptr[rcx + METHOD_ADDRESS_64]);
+ lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]);
sal(ecx, 12);
or_(eax, ecx);
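+ // METHOD_ADDRESS packs a 12-bit method in bits 0-11 and a 6-bit increment in
+ // bits 12-17; eax now holds method + increment with the increment field preserved.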
@@ -421,19 +422,15 @@ void MacroJITx64Impl::Compile() {
bool keep_executing = true;
labels.fill(Xbyak::Label());
- Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+ Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
// JIT state
mov(STATE, Common::X64::ABI_PARAM1);
- mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 +
- static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]);
- mov(REGISTERS, Common::X64::ABI_PARAM1);
- add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers)));
+ mov(PARAMETERS, Common::X64::ABI_PARAM2);
xor_(RESULT, RESULT);
xor_(METHOD_ADDRESS, METHOD_ADDRESS);
- xor_(NEXT_PARAMETER, NEXT_PARAMETER);
xor_(BRANCH_HOLDER, BRANCH_HOLDER);
- mov(dword[REGISTERS + 4], Compile_FetchParameter());
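+ // Macro register 1 (offset 4 into the register file) is seeded with the first parameter.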
+ mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter());
// Treat fetches of the zero register as no-ops and skip emitting them
optimizer.zero_reg_skip = true;
@@ -446,6 +443,9 @@ void MacroJITx64Impl::Compile() {
// one if our register isn't "dirty"
optimizer.optimize_for_method_move = true;
+ // Whether to emit run-time assertions into the JITted code (disabled by default)
+ optimizer.enable_asserts = false;
+
// Check to see if we can skip emitting certain instructions
Optimizer_ScanFlags();
@@ -463,7 +463,7 @@ void MacroJITx64Impl::Compile() {
L(end_of_code);
- Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+ Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
ret();
ready();
program = getCode<ProgramType>();
@@ -537,8 +537,8 @@ bool MacroJITx64Impl::Compile_NextInstruction() {
}
Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
- mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]);
- inc(NEXT_PARAMETER);
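+ // PARAMETERS is now a cursor into the parameter array, so fetching a parameter
+ // just bumps the pointer instead of maintaining a separate NEXT_PARAMETER index.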
+ mov(eax, dword[PARAMETERS]);
+ add(PARAMETERS, sizeof(u32));
return eax;
}
@@ -547,41 +547,22 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
// Register 0 is always zero
xor_(dst, dst);
} else {
- mov(dst, dword[REGISTERS + index * sizeof(u32)]);
- }
-
- return dst;
-}
-
-Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) {
- if (index == 0) {
- // Register 0 is always zero
- xor_(dst, dst);
- } else {
- mov(dst, dword[REGISTERS + index * sizeof(u32)]);
+ mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]);
}
return dst;
}
-void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) {
- Xbyak::Label zero{}, end{};
- xor_(ecx, ecx);
- shr(dst, 32);
- setne(cl);
- mov(dword[STATE + offsetof(JITState, carry_flag)], ecx);
-}
-
void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
- auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) {
+ const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) {
// Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
// register.
if (reg == 0) {
return;
}
- mov(dword[REGISTERS + reg * sizeof(u32)], result);
+ mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result);
};
- auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); };
+ const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); };
switch (operation) {
case Macro::ResultOperation::IgnoreAndFetch:
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h
index 21ee157cf..a180e7428 100644
--- a/src/video_core/macro/macro_jit_x64.h
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -55,8 +55,6 @@ private:
Xbyak::Reg32 Compile_FetchParameter();
Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
- Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst);
- void Compile_WriteCarry(Xbyak::Reg64 dst);
void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
void Compile_Send(Xbyak::Reg32 value);
@@ -67,11 +65,10 @@ private:
struct JITState {
Engines::Maxwell3D* maxwell3d{};
std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
- const u32* parameters{};
u32 carry_flag{};
};
static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
- using ProgramType = void (*)(JITState*);
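+ // The parameter array is passed as a second argument instead of living in JITState,
+ // e.g. (call-site sketch): program(&state, parameters);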
+ using ProgramType = void (*)(JITState*, const u32*);
struct OptimizerState {
bool can_skip_carry{};
@@ -79,14 +76,15 @@ private:
bool zero_reg_skip{};
bool skip_dummy_addimmediate{};
bool optimize_for_method_move{};
+ bool enable_asserts{};
};
OptimizerState optimizer{};
std::optional<Macro::Opcode> next_opcode{};
ProgramType program{nullptr};
- std::array<Xbyak::Label, MAX_CODE_SIZE> labels{};
- std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip{};
+ std::array<Xbyak::Label, MAX_CODE_SIZE> labels;
+ std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
Xbyak::Label end_of_code{};
bool is_delay_slot{};
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index dbee9f634..ff5505d12 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -210,10 +210,11 @@ bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t si
return range == inner_size;
}
-void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const {
+void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer,
+ const std::size_t size) const {
std::size_t remaining_size{size};
- std::size_t page_index{src_addr >> page_bits};
- std::size_t page_offset{src_addr & page_mask};
+ std::size_t page_index{gpu_src_addr >> page_bits};
+ std::size_t page_offset{gpu_src_addr & page_mask};
auto& memory = system.Memory();
@@ -234,11 +235,11 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s
}
}
-void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
+void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
const std::size_t size) const {
std::size_t remaining_size{size};
- std::size_t page_index{src_addr >> page_bits};
- std::size_t page_offset{src_addr & page_mask};
+ std::size_t page_index{gpu_src_addr >> page_bits};
+ std::size_t page_offset{gpu_src_addr & page_mask};
auto& memory = system.Memory();
@@ -259,10 +260,11 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
}
}
-void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) {
+void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer,
+ const std::size_t size) {
std::size_t remaining_size{size};
- std::size_t page_index{dest_addr >> page_bits};
- std::size_t page_offset{dest_addr & page_mask};
+ std::size_t page_index{gpu_dest_addr >> page_bits};
+ std::size_t page_offset{gpu_dest_addr & page_mask};
auto& memory = system.Memory();
@@ -283,11 +285,11 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const
}
}
-void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
+void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer,
const std::size_t size) {
std::size_t remaining_size{size};
- std::size_t page_index{dest_addr >> page_bits};
- std::size_t page_offset{dest_addr & page_mask};
+ std::size_t page_index{gpu_dest_addr >> page_bits};
+ std::size_t page_offset{gpu_dest_addr & page_mask};
auto& memory = system.Memory();
@@ -306,16 +308,18 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
}
}
-void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+ const std::size_t size) {
std::vector<u8> tmp_buffer(size);
- ReadBlock(src_addr, tmp_buffer.data(), size);
- WriteBlock(dest_addr, tmp_buffer.data(), size);
+ ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
+ WriteBlock(gpu_dest_addr, tmp_buffer.data(), size);
}
-void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+ const std::size_t size) {
std::vector<u8> tmp_buffer(size);
- ReadBlockUnsafe(src_addr, tmp_buffer.data(), size);
- WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size);
+ ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size);
+ WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size);
}
bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) {
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 0ddd52d5a..87658e87a 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -79,9 +79,9 @@ public:
* in the Host Memory counterpart. Note: These functions cause Host GPU Memory
* Flushes and Invalidations, respectively, for each operation.
*/
- void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
- void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
- void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+ void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+ void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+ void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
/**
* ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
@@ -93,9 +93,9 @@ public:
* WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
* being flushed.
*/
- void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
- void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
- void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+ void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+ void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+ void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);
/**
* IsGranularRange checks if a gpu region can be simply read with a pointer
diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h
index 2f75f8801..e12dab899 100644
--- a/src/video_core/query_cache.h
+++ b/src/video_core/query_cache.h
@@ -220,8 +220,8 @@ private:
return cache_begin < addr_end && addr_begin < cache_end;
};
- const u64 page_end = addr_end >> PAGE_SHIFT;
- for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) {
+ const u64 page_end = addr_end >> PAGE_BITS;
+ for (u64 page = addr_begin >> PAGE_BITS; page <= page_end; ++page) {
const auto& it = cached_queries.find(page);
if (it == std::end(cached_queries)) {
continue;
@@ -242,14 +242,14 @@ private:
/// Registers the passed parameters as cached and returns a pointer to the stored cached query.
CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) {
rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1);
- const u64 page = static_cast<u64>(cpu_addr) >> PAGE_SHIFT;
+ const u64 page = static_cast<u64>(cpu_addr) >> PAGE_BITS;
return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr,
host_ptr);
}
/// Tries to get a cached query. Returns nullptr on failure.
CachedQuery* TryGet(VAddr addr) {
- const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT;
+ const u64 page = static_cast<u64>(addr) >> PAGE_BITS;
const auto it = cached_queries.find(page);
if (it == std::end(cached_queries)) {
return nullptr;
@@ -268,7 +268,7 @@ private:
}
static constexpr std::uintptr_t PAGE_SIZE = 4096;
- static constexpr unsigned PAGE_SHIFT = 12;
+ static constexpr unsigned PAGE_BITS = 12;
Core::System& system;
VideoCore::RasterizerInterface& rasterizer;
diff --git a/src/video_core/rasterizer_cache.cpp b/src/video_core/rasterizer_cache.cpp
deleted file mode 100644
index 093b2cdf4..000000000
--- a/src/video_core/rasterizer_cache.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include "video_core/rasterizer_cache.h"
-
-RasterizerCacheObject::~RasterizerCacheObject() = default;
diff --git a/src/video_core/rasterizer_cache.h b/src/video_core/rasterizer_cache.h
deleted file mode 100644
index 096ee337c..000000000
--- a/src/video_core/rasterizer_cache.h
+++ /dev/null
@@ -1,253 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <mutex>
-#include <set>
-#include <unordered_map>
-
-#include <boost/icl/interval_map.hpp>
-#include <boost/range/iterator_range_core.hpp>
-
-#include "common/common_types.h"
-#include "core/settings.h"
-#include "video_core/gpu.h"
-#include "video_core/rasterizer_interface.h"
-
-class RasterizerCacheObject {
-public:
- explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {}
-
- virtual ~RasterizerCacheObject();
-
- VAddr GetCpuAddr() const {
- return cpu_addr;
- }
-
- /// Gets the size of the shader in guest memory, required for cache management
- virtual std::size_t GetSizeInBytes() const = 0;
-
- /// Sets whether the cached object should be considered registered
- void SetIsRegistered(bool registered) {
- is_registered = registered;
- }
-
- /// Returns true if the cached object is registered
- bool IsRegistered() const {
- return is_registered;
- }
-
- /// Returns true if the cached object is dirty
- bool IsDirty() const {
- return is_dirty;
- }
-
- /// Returns ticks from when this cached object was last modified
- u64 GetLastModifiedTicks() const {
- return last_modified_ticks;
- }
-
- /// Marks an object as recently modified, used to specify whether it is clean or dirty
- template <class T>
- void MarkAsModified(bool dirty, T& cache) {
- is_dirty = dirty;
- last_modified_ticks = cache.GetModifiedTicks();
- }
-
- void SetMemoryMarked(bool is_memory_marked_) {
- is_memory_marked = is_memory_marked_;
- }
-
- bool IsMemoryMarked() const {
- return is_memory_marked;
- }
-
- void SetSyncPending(bool is_sync_pending_) {
- is_sync_pending = is_sync_pending_;
- }
-
- bool IsSyncPending() const {
- return is_sync_pending;
- }
-
-private:
- bool is_registered{}; ///< Whether the object is currently registered with the cache
- bool is_dirty{}; ///< Whether the object is dirty (out of sync with guest memory)
- bool is_memory_marked{}; ///< Whether the object is marking rasterizer memory.
- bool is_sync_pending{}; ///< Whether the object is pending deletion.
- u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
- VAddr cpu_addr{}; ///< Cpu address memory, unique from emulated virtual address space
-};
-
-template <class T>
-class RasterizerCache : NonCopyable {
- friend class RasterizerCacheObject;
-
-public:
- explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
-
- /// Write any cached resources overlapping the specified region back to memory
- void FlushRegion(VAddr addr, std::size_t size) {
- std::lock_guard lock{mutex};
-
- const auto& objects{GetSortedObjectsFromRegion(addr, size)};
- for (auto& object : objects) {
- FlushObject(object);
- }
- }
-
- /// Mark the specified region as being invalidated
- void InvalidateRegion(VAddr addr, u64 size) {
- std::lock_guard lock{mutex};
-
- const auto& objects{GetSortedObjectsFromRegion(addr, size)};
- for (auto& object : objects) {
- if (!object->IsRegistered()) {
- // Skip duplicates
- continue;
- }
- Unregister(object);
- }
- }
-
- void OnCPUWrite(VAddr addr, std::size_t size) {
- std::lock_guard lock{mutex};
-
- for (const auto& object : GetSortedObjectsFromRegion(addr, size)) {
- if (object->IsRegistered()) {
- UnmarkMemory(object);
- object->SetSyncPending(true);
- marked_for_unregister.emplace_back(object);
- }
- }
- }
-
- void SyncGuestHost() {
- std::lock_guard lock{mutex};
-
- for (const auto& object : marked_for_unregister) {
- if (object->IsRegistered()) {
- object->SetSyncPending(false);
- Unregister(object);
- }
- }
- marked_for_unregister.clear();
- }
-
- /// Invalidates everything in the cache
- void InvalidateAll() {
- std::lock_guard lock{mutex};
-
- while (interval_cache.begin() != interval_cache.end()) {
- Unregister(*interval_cache.begin()->second.begin());
- }
- }
-
-protected:
- /// Tries to get an object from the cache with the specified cache address
- T TryGet(VAddr addr) const {
- const auto iter = map_cache.find(addr);
- if (iter != map_cache.end())
- return iter->second;
- return nullptr;
- }
-
- /// Register an object into the cache
- virtual void Register(const T& object) {
- std::lock_guard lock{mutex};
-
- object->SetIsRegistered(true);
- interval_cache.add({GetInterval(object), ObjectSet{object}});
- map_cache.insert({object->GetCpuAddr(), object});
- rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
- object->SetMemoryMarked(true);
- }
-
- /// Unregisters an object from the cache
- virtual void Unregister(const T& object) {
- std::lock_guard lock{mutex};
-
- UnmarkMemory(object);
- object->SetIsRegistered(false);
- if (object->IsSyncPending()) {
- marked_for_unregister.remove(object);
- object->SetSyncPending(false);
- }
- const VAddr addr = object->GetCpuAddr();
- interval_cache.subtract({GetInterval(object), ObjectSet{object}});
- map_cache.erase(addr);
- }
-
- void UnmarkMemory(const T& object) {
- if (!object->IsMemoryMarked()) {
- return;
- }
- rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
- object->SetMemoryMarked(false);
- }
-
- /// Returns a ticks counter used for tracking when cached objects were last modified
- u64 GetModifiedTicks() {
- std::lock_guard lock{mutex};
-
- return ++modified_ticks;
- }
-
- virtual void FlushObjectInner(const T& object) = 0;
-
- /// Flushes the specified object, updating appropriate cache state as needed
- void FlushObject(const T& object) {
- std::lock_guard lock{mutex};
-
- if (!object->IsDirty()) {
- return;
- }
- FlushObjectInner(object);
- object->MarkAsModified(false, *this);
- }
-
- std::recursive_mutex mutex;
-
-private:
- /// Returns a list of cached objects from the specified memory region, ordered by access time
- std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) {
- if (size == 0) {
- return {};
- }
-
- std::vector<T> objects;
- const ObjectInterval interval{addr, addr + size};
- for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) {
- for (auto& cached_object : pair.second) {
- if (!cached_object) {
- continue;
- }
- objects.push_back(cached_object);
- }
- }
-
- std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool {
- return a->GetLastModifiedTicks() < b->GetLastModifiedTicks();
- });
-
- return objects;
- }
-
- using ObjectSet = std::set<T>;
- using ObjectCache = std::unordered_map<VAddr, T>;
- using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>;
- using ObjectInterval = typename IntervalCache::interval_type;
-
- static auto GetInterval(const T& object) {
- return ObjectInterval::right_open(object->GetCpuAddr(),
- object->GetCpuAddr() + object->GetSizeInBytes());
- }
-
- ObjectCache map_cache;
- IntervalCache interval_cache; ///< Cache of objects
- u64 modified_ticks{}; ///< Counter of cache state ticks, used for in-order flushing
- VideoCore::RasterizerInterface& rasterizer;
- std::list<T> marked_for_unregister;
-};
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
new file mode 100644
index 000000000..eb5158407
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
@@ -0,0 +1,2073 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <variant>
+
+#include <fmt/format.h>
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
+#include "video_core/renderer_opengl/gl_device.h"
+#include "video_core/shader/registry.h"
+#include "video_core/shader/shader_ir.h"
+
+// Predicates in the decompiled code follow the convention that -1 means true and 0 means false.
+// GLASM lacks booleans, so they have to be implemented as integers.
+// Using -1 for true is useful because both CMP.S and NOT.U can negate it, and CMP.S can be used to
+// select between two values, because -1 will be evaluated as true and 0 as false.
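+// For example, "CMP.S dst, pred, a, b;" writes a when pred is -1 (negative, i.e. true)
+// and b when it is 0 (false).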
+
+namespace OpenGL {
+
+namespace {
+
+using Tegra::Engines::ShaderType;
+using Tegra::Shader::Attribute;
+using Tegra::Shader::PixelImap;
+using Tegra::Shader::Register;
+using namespace VideoCommon::Shader;
+using Operation = const OperationNode&;
+
+constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"};
+
+char Swizzle(std::size_t component) {
+ ASSERT(component < 4);
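+ // component["xyzw"] is just "xyzw"[component]; C++ allows the operands of [] to swap.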
+ return component["xyzw"];
+}
+
+constexpr bool IsGenericAttribute(Attribute::Index index) {
+ return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31;
+}
+
+u32 GetGenericAttributeIndex(Attribute::Index index) {
+ ASSERT(IsGenericAttribute(index));
+ return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
+}
+
+std::string_view Modifiers(Operation operation) {
+ const auto meta = std::get_if<MetaArithmetic>(&operation.GetMeta());
+ if (meta && meta->precise) {
+ return ".PREC";
+ }
+ return "";
+}
+
+std::string_view GetInputFlags(PixelImap attribute) {
+ switch (attribute) {
+ case PixelImap::Perspective:
+ return "";
+ case PixelImap::Constant:
+ return "FLAT ";
+ case PixelImap::ScreenLinear:
+ return "NOPERSPECTIVE ";
+ case PixelImap::Unused:
+ break;
+ }
+ UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute));
+ return {};
+}
+
+std::string_view ImageType(Tegra::Shader::ImageType image_type) {
+ switch (image_type) {
+ case Tegra::Shader::ImageType::Texture1D:
+ return "1D";
+ case Tegra::Shader::ImageType::TextureBuffer:
+ return "BUFFER";
+ case Tegra::Shader::ImageType::Texture1DArray:
+ return "ARRAY1D";
+ case Tegra::Shader::ImageType::Texture2D:
+ return "2D";
+ case Tegra::Shader::ImageType::Texture2DArray:
+ return "ARRAY2D";
+ case Tegra::Shader::ImageType::Texture3D:
+ return "3D";
+ }
+ UNREACHABLE();
+ return {};
+}
+
+std::string_view StackName(MetaStackClass stack) {
+ switch (stack) {
+ case MetaStackClass::Ssy:
+ return "SSY";
+ case MetaStackClass::Pbk:
+ return "PBK";
+ }
+ UNREACHABLE();
+ return "";
+}
+
+std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) {
+ switch (topology) {
+ case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points:
+ return "POINTS";
+ case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines:
+ case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip:
+ return "LINES";
+ case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency:
+ case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency:
+ return "LINES_ADJACENCY";
+ case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles:
+ case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
+ case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
+ return "TRIANGLES";
+ case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
+ case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
+ return "TRIANGLES_ADJACENCY";
+ default:
+ UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
+ return "POINTS";
+ }
+}
+
+std::string_view TopologyName(Tegra::Shader::OutputTopology topology) {
+ switch (topology) {
+ case Tegra::Shader::OutputTopology::PointList:
+ return "POINTS";
+ case Tegra::Shader::OutputTopology::LineStrip:
+ return "LINE_STRIP";
+ case Tegra::Shader::OutputTopology::TriangleStrip:
+ return "TRIANGLE_STRIP";
+ default:
+ UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology));
+ return "points";
+ }
+}
+
+std::string_view StageInputName(ShaderType stage) {
+ switch (stage) {
+ case ShaderType::Vertex:
+ case ShaderType::Geometry:
+ return "vertex";
+ case ShaderType::Fragment:
+ return "fragment";
+ case ShaderType::Compute:
+ return "invocation";
+ default:
+ UNREACHABLE();
+ return "";
+ }
+}
+
+std::string TextureType(const MetaTexture& meta) {
+ if (meta.sampler.is_buffer) {
+ return "BUFFER";
+ }
+ std::string type;
+ if (meta.sampler.is_shadow) {
+ type += "SHADOW";
+ }
+ if (meta.sampler.is_array) {
+ type += "ARRAY";
+ }
+ type += [&meta] {
+ switch (meta.sampler.type) {
+ case Tegra::Shader::TextureType::Texture1D:
+ return "1D";
+ case Tegra::Shader::TextureType::Texture2D:
+ return "2D";
+ case Tegra::Shader::TextureType::Texture3D:
+ return "3D";
+ case Tegra::Shader::TextureType::TextureCube:
+ return "CUBE";
+ }
+ UNREACHABLE();
+ return "2D";
+ }();
+ return type;
+}
+
+std::string GlobalMemoryName(const GlobalMemoryBase& base) {
+ return fmt::format("gmem{}_{}", base.cbuf_index, base.cbuf_offset);
+}
+
+class ARBDecompiler final {
+public:
+ explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+ ShaderType stage, std::string_view identifier);
+
+ std::string Code() const {
+ return shader_source;
+ }
+
+private:
+ void DeclareHeader();
+ void DeclareVertex();
+ void DeclareGeometry();
+ void DeclareFragment();
+ void DeclareCompute();
+ void DeclareInputAttributes();
+ void DeclareOutputAttributes();
+ void DeclareLocalMemory();
+ void DeclareGlobalMemory();
+ void DeclareConstantBuffers();
+ void DeclareRegisters();
+ void DeclareTemporaries();
+ void DeclarePredicates();
+ void DeclareInternalFlags();
+
+ void InitializeVariables();
+
+ void DecompileAST();
+ void DecompileBranchMode();
+
+ void VisitAST(const ASTNode& node);
+ std::string VisitExpression(const Expr& node);
+
+ void VisitBlock(const NodeBlock& bb);
+
+ std::string Visit(const Node& node);
+
+ std::pair<std::string, std::size_t> BuildCoords(Operation);
+ std::string BuildAoffi(Operation);
+ void Exit();
+
+ std::string Assign(Operation);
+ std::string Select(Operation);
+ std::string FClamp(Operation);
+ std::string FCastHalf0(Operation);
+ std::string FCastHalf1(Operation);
+ std::string FSqrt(Operation);
+ std::string FSwizzleAdd(Operation);
+ std::string HAdd2(Operation);
+ std::string HMul2(Operation);
+ std::string HFma2(Operation);
+ std::string HAbsolute(Operation);
+ std::string HNegate(Operation);
+ std::string HClamp(Operation);
+ std::string HCastFloat(Operation);
+ std::string HUnpack(Operation);
+ std::string HMergeF32(Operation);
+ std::string HMergeH0(Operation);
+ std::string HMergeH1(Operation);
+ std::string HPack2(Operation);
+ std::string LogicalAssign(Operation);
+ std::string LogicalPick2(Operation);
+ std::string LogicalAnd2(Operation);
+ std::string FloatOrdered(Operation);
+ std::string FloatUnordered(Operation);
+ std::string LogicalAddCarry(Operation);
+ std::string Texture(Operation);
+ std::string TextureGather(Operation);
+ std::string TextureQueryDimensions(Operation);
+ std::string TextureQueryLod(Operation);
+ std::string TexelFetch(Operation);
+ std::string TextureGradient(Operation);
+ std::string ImageLoad(Operation);
+ std::string ImageStore(Operation);
+ std::string Branch(Operation);
+ std::string BranchIndirect(Operation);
+ std::string PushFlowStack(Operation);
+ std::string PopFlowStack(Operation);
+ std::string Exit(Operation);
+ std::string Discard(Operation);
+ std::string EmitVertex(Operation);
+ std::string EndPrimitive(Operation);
+ std::string InvocationId(Operation);
+ std::string YNegate(Operation);
+ std::string ThreadId(Operation);
+ std::string ShuffleIndexed(Operation);
+ std::string Barrier(Operation);
+ std::string MemoryBarrierGroup(Operation);
+ std::string MemoryBarrierGlobal(Operation);
+
+ template <const std::string_view& op>
+ std::string Unary(Operation operation) {
+ std::string temporary = AllocTemporary();
+ AddLine("{}{} {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]));
+ return temporary;
+ }
+
+ template <const std::string_view& op>
+ std::string Binary(Operation operation) {
+ std::string temporary = AllocTemporary();
+ AddLine("{}{} {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
+ Visit(operation[1]));
+ return temporary;
+ }
+
+ template <const std::string_view& op>
+ std::string Trinary(Operation operation) {
+ std::string temporary = AllocTemporary();
+ AddLine("{}{} {}, {}, {}, {};", op, Modifiers(operation), temporary, Visit(operation[0]),
+ Visit(operation[1]), Visit(operation[2]));
+ return temporary;
+ }
+
+ template <const std::string_view& op, bool unordered>
+ std::string FloatComparison(Operation operation) {
+ std::string temporary = AllocTemporary();
+ AddLine("TRUNC.U.CC RC.x, {};", Binary<op>(operation));
+ AddLine("MOV.S {}, 0;", temporary);
+ AddLine("MOV.S {} (NE.x), -1;", temporary);
+
+ const std::string op_a = Visit(operation[0]);
+ const std::string op_b = Visit(operation[1]);
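+ // IEEE-754: an unordered comparison is true when either operand is NaN, while an
+ // ordered SNE must be false in that case; "x != x" below is the NaN test.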
+ if constexpr (unordered) {
+ AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
+ AddLine("TRUNC.U.CC RC.x, RC.x;");
+ AddLine("MOV.S {} (NE.x), -1;", temporary);
+ AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
+ AddLine("TRUNC.U.CC RC.x, RC.x;");
+ AddLine("MOV.S {} (NE.x), -1;", temporary);
+ } else if (op == SNE_F) {
+ AddLine("SNE.F RC.x, {}, {};", op_a, op_a);
+ AddLine("TRUNC.U.CC RC.x, RC.x;");
+ AddLine("MOV.S {} (NE.x), 0;", temporary);
+ AddLine("SNE.F RC.x, {}, {};", op_b, op_b);
+ AddLine("TRUNC.U.CC RC.x, RC.x;");
+ AddLine("MOV.S {} (NE.x), 0;", temporary);
+ }
+ return temporary;
+ }
+
+ template <const std::string_view& op, bool is_nan>
+ std::string HalfComparison(Operation operation) {
+ std::string tmp1 = AllocVectorTemporary();
+ const std::string tmp2 = AllocVectorTemporary();
+ const std::string op_a = Visit(operation[0]);
+ const std::string op_b = Visit(operation[1]);
+ AddLine("UP2H.F {}, {};", tmp1, op_a);
+ AddLine("UP2H.F {}, {};", tmp2, op_b);
+ AddLine("{} {}, {}, {};", op, tmp1, tmp1, tmp2);
+ AddLine("TRUNC.U.CC RC.xy, {};", tmp1);
+ AddLine("MOV.S {}.xy, {{0, 0, 0, 0}};", tmp1);
+ AddLine("MOV.S {}.x (NE.x), -1;", tmp1);
+ AddLine("MOV.S {}.y (NE.y), -1;", tmp1);
+ if constexpr (is_nan) {
+ AddLine("MOVC.F RC.x, {};", op_a);
+ AddLine("MOV.S {}.x (NAN.x), -1;", tmp1);
+ AddLine("MOVC.F RC.x, {};", op_b);
+ AddLine("MOV.S {}.y (NAN.x), -1;", tmp1);
+ }
+ return tmp1;
+ }
+
+ template <const std::string_view& op, const std::string_view& type>
+ std::string AtomicImage(Operation operation) {
+ const auto& meta = std::get<MetaImage>(operation.GetMeta());
+ const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+ const std::size_t num_coords = operation.GetOperandsCount();
+ const std::size_t num_values = meta.values.size();
+
+ const std::string coord = AllocVectorTemporary();
+ const std::string value = AllocVectorTemporary();
+ for (std::size_t i = 0; i < num_coords; ++i) {
+ AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i]));
+ }
+ for (std::size_t i = 0; i < num_values; ++i) {
+ AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i]));
+ }
+
+ AddLine("ATOMIM.{}.{} {}.x, {}, {}, image[{}], {};", op, type, coord, value, coord,
+ image_id, ImageType(meta.image.type));
+ return fmt::format("{}.x", coord);
+ }
+
+ template <const std::string_view& op, const std::string_view& type>
+ std::string Atomic(Operation operation) {
+ std::string temporary = AllocTemporary();
+ std::string address;
+ std::string_view opname;
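+ // ATOMB performs the atomic on buffer (global) memory, ATOMS on shared memory.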
+ if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) {
+ AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+ Visit(gmem->GetBaseAddress()));
+ address = fmt::format("{}[{}]", GlobalMemoryName(gmem->GetDescriptor()), temporary);
+ opname = "ATOMB";
+ } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) {
+ address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress()));
+ opname = "ATOMS";
+ } else {
+ UNREACHABLE();
+ return "{0, 0, 0, 0}";
+ }
+ AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address);
+ return temporary;
+ }
+
+ template <char type>
+ std::string Negate(Operation operation) {
+ std::string temporary = AllocTemporary();
+ if constexpr (type == 'F') {
+ AddLine("MOV.F32 {}, -{};", temporary, Visit(operation[0]));
+ } else {
+ AddLine("MOV.{} {}, -{};", type, temporary, Visit(operation[0]));
+ }
+ return temporary;
+ }
+
+ template <char type>
+ std::string Absolute(Operation operation) {
+ std::string temporary = AllocTemporary();
+ AddLine("MOV.{} {}, |{}|;", type, temporary, Visit(operation[0]));
+ return temporary;
+ }
+
+ template <char type>
+ std::string BitfieldInsert(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[3]));
+ AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[2]));
+ AddLine("BFI.{} {}.x, {}, {}, {};", type, temporary, temporary, Visit(operation[1]),
+ Visit(operation[0]));
+ return fmt::format("{}.x", temporary);
+ }
+
+ template <char type>
+ std::string BitfieldExtract(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("MOV.{} {}.x, {};", type, temporary, Visit(operation[2]));
+ AddLine("MOV.{} {}.y, {};", type, temporary, Visit(operation[1]));
+ AddLine("BFE.{} {}.x, {}, {};", type, temporary, temporary, Visit(operation[0]));
+ return fmt::format("{}.x", temporary);
+ }
+
+ template <char swizzle>
+ std::string LocalInvocationId(Operation) {
+ return fmt::format("invocation.localid.{}", swizzle);
+ }
+
+ template <char swizzle>
+ std::string WorkGroupId(Operation) {
+ return fmt::format("invocation.groupid.{}", swizzle);
+ }
+
+ template <char c1, char c2>
+ std::string ThreadMask(Operation) {
+ return fmt::format("{}.thread{}{}mask", StageInputName(stage), c1, c2);
+ }
+
+ template <typename... Args>
+ void AddExpression(std::string_view text, Args&&... args) {
+ shader_source += fmt::format(text, std::forward<Args>(args)...);
+ }
+
+ template <typename... Args>
+ void AddLine(std::string_view text, Args&&... args) {
+ AddExpression(text, std::forward<Args>(args)...);
+ shader_source += '\n';
+ }
+
+ std::string AllocTemporary() {
+ max_temporaries = std::max(max_temporaries, num_temporaries + 1);
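+ // Scalar temporaries share the vector register pool; only the .x component is used.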
+ return fmt::format("T{}.x", num_temporaries++);
+ }
+
+ std::string AllocVectorTemporary() {
+ max_temporaries = std::max(max_temporaries, num_temporaries + 1);
+ return fmt::format("T{}", num_temporaries++);
+ }
+
+ void ResetTemporaries() noexcept {
+ num_temporaries = 0;
+ }
+
+ const Device& device;
+ const ShaderIR& ir;
+ const Registry& registry;
+ const ShaderType stage;
+
+ std::size_t num_temporaries = 0;
+ std::size_t max_temporaries = 0;
+
+ std::string shader_source;
+
+ static constexpr std::string_view ADD_F32 = "ADD.F32";
+ static constexpr std::string_view ADD_S = "ADD.S";
+ static constexpr std::string_view ADD_U = "ADD.U";
+ static constexpr std::string_view MUL_F32 = "MUL.F32";
+ static constexpr std::string_view MUL_S = "MUL.S";
+ static constexpr std::string_view MUL_U = "MUL.U";
+ static constexpr std::string_view DIV_F32 = "DIV.F32";
+ static constexpr std::string_view DIV_S = "DIV.S";
+ static constexpr std::string_view DIV_U = "DIV.U";
+ static constexpr std::string_view MAD_F32 = "MAD.F32";
+ static constexpr std::string_view RSQ_F32 = "RSQ.F32";
+ static constexpr std::string_view COS_F32 = "COS.F32";
+ static constexpr std::string_view SIN_F32 = "SIN.F32";
+ static constexpr std::string_view EX2_F32 = "EX2.F32";
+ static constexpr std::string_view LG2_F32 = "LG2.F32";
+ static constexpr std::string_view SLT_F = "SLT.F32";
+ static constexpr std::string_view SLT_S = "SLT.S";
+ static constexpr std::string_view SLT_U = "SLT.U";
+ static constexpr std::string_view SEQ_F = "SEQ.F32";
+ static constexpr std::string_view SEQ_S = "SEQ.S";
+ static constexpr std::string_view SEQ_U = "SEQ.U";
+ static constexpr std::string_view SLE_F = "SLE.F32";
+ static constexpr std::string_view SLE_S = "SLE.S";
+ static constexpr std::string_view SLE_U = "SLE.U";
+ static constexpr std::string_view SGT_F = "SGT.F32";
+ static constexpr std::string_view SGT_S = "SGT.S";
+ static constexpr std::string_view SGT_U = "SGT.U";
+ static constexpr std::string_view SNE_F = "SNE.F32";
+ static constexpr std::string_view SNE_S = "SNE.S";
+ static constexpr std::string_view SNE_U = "SNE.U";
+ static constexpr std::string_view SGE_F = "SGE.F32";
+ static constexpr std::string_view SGE_S = "SGE.S";
+ static constexpr std::string_view SGE_U = "SGE.U";
+ static constexpr std::string_view AND_S = "AND.S";
+ static constexpr std::string_view AND_U = "AND.U";
+ static constexpr std::string_view TRUNC_F = "TRUNC.F";
+ static constexpr std::string_view TRUNC_S = "TRUNC.S";
+ static constexpr std::string_view TRUNC_U = "TRUNC.U";
+ static constexpr std::string_view SHL_S = "SHL.S";
+ static constexpr std::string_view SHL_U = "SHL.U";
+ static constexpr std::string_view SHR_S = "SHR.S";
+ static constexpr std::string_view SHR_U = "SHR.U";
+ static constexpr std::string_view OR_S = "OR.S";
+ static constexpr std::string_view OR_U = "OR.U";
+ static constexpr std::string_view XOR_S = "XOR.S";
+ static constexpr std::string_view XOR_U = "XOR.U";
+ static constexpr std::string_view NOT_S = "NOT.S";
+ static constexpr std::string_view NOT_U = "NOT.U";
+ static constexpr std::string_view BTC_S = "BTC.S";
+ static constexpr std::string_view BTC_U = "BTC.U";
+ static constexpr std::string_view BTFM_S = "BTFM.S";
+ static constexpr std::string_view BTFM_U = "BTFM.U";
+ static constexpr std::string_view ROUND_F = "ROUND.F";
+ static constexpr std::string_view CEIL_F = "CEIL.F";
+ static constexpr std::string_view FLR_F = "FLR.F";
+ static constexpr std::string_view I2F_S = "I2F.S";
+ static constexpr std::string_view I2F_U = "I2F.U";
+ static constexpr std::string_view MIN_F = "MIN.F";
+ static constexpr std::string_view MIN_S = "MIN.S";
+ static constexpr std::string_view MIN_U = "MIN.U";
+ static constexpr std::string_view MAX_F = "MAX.F";
+ static constexpr std::string_view MAX_S = "MAX.S";
+ static constexpr std::string_view MAX_U = "MAX.U";
+ static constexpr std::string_view MOV_U = "MOV.U";
+ static constexpr std::string_view TGBALLOT_U = "TGBALLOT.U";
+ static constexpr std::string_view TGALL_U = "TGALL.U";
+ static constexpr std::string_view TGANY_U = "TGANY.U";
+ static constexpr std::string_view TGEQ_U = "TGEQ.U";
+ static constexpr std::string_view EXCH = "EXCH";
+ static constexpr std::string_view ADD = "ADD";
+ static constexpr std::string_view MIN = "MIN";
+ static constexpr std::string_view MAX = "MAX";
+ static constexpr std::string_view AND = "AND";
+ static constexpr std::string_view OR = "OR";
+ static constexpr std::string_view XOR = "XOR";
+ static constexpr std::string_view U32 = "U32";
+ static constexpr std::string_view S32 = "S32";
+
+ static constexpr std::size_t NUM_ENTRIES = static_cast<std::size_t>(OperationCode::Amount);
+ using DecompilerType = std::string (ARBDecompiler::*)(Operation);
+ static constexpr std::array<DecompilerType, NUM_ENTRIES> OPERATION_DECOMPILERS = {
+ &ARBDecompiler::Assign,
+
+ &ARBDecompiler::Select,
+
+ &ARBDecompiler::Binary<ADD_F32>,
+ &ARBDecompiler::Binary<MUL_F32>,
+ &ARBDecompiler::Binary<DIV_F32>,
+ &ARBDecompiler::Trinary<MAD_F32>,
+ &ARBDecompiler::Negate<'F'>,
+ &ARBDecompiler::Absolute<'F'>,
+ &ARBDecompiler::FClamp,
+ &ARBDecompiler::FCastHalf0,
+ &ARBDecompiler::FCastHalf1,
+ &ARBDecompiler::Binary<MIN_F>,
+ &ARBDecompiler::Binary<MAX_F>,
+ &ARBDecompiler::Unary<COS_F32>,
+ &ARBDecompiler::Unary<SIN_F32>,
+ &ARBDecompiler::Unary<EX2_F32>,
+ &ARBDecompiler::Unary<LG2_F32>,
+ &ARBDecompiler::Unary<RSQ_F32>,
+ &ARBDecompiler::FSqrt,
+ &ARBDecompiler::Unary<ROUND_F>,
+ &ARBDecompiler::Unary<FLR_F>,
+ &ARBDecompiler::Unary<CEIL_F>,
+ &ARBDecompiler::Unary<TRUNC_F>,
+ &ARBDecompiler::Unary<I2F_S>,
+ &ARBDecompiler::Unary<I2F_U>,
+ &ARBDecompiler::FSwizzleAdd,
+
+ &ARBDecompiler::Binary<ADD_S>,
+ &ARBDecompiler::Binary<MUL_S>,
+ &ARBDecompiler::Binary<DIV_S>,
+ &ARBDecompiler::Negate<'S'>,
+ &ARBDecompiler::Absolute<'S'>,
+ &ARBDecompiler::Binary<MIN_S>,
+ &ARBDecompiler::Binary<MAX_S>,
+
+ &ARBDecompiler::Unary<TRUNC_S>,
+ &ARBDecompiler::Unary<MOV_U>,
+ &ARBDecompiler::Binary<SHL_S>,
+ &ARBDecompiler::Binary<SHR_U>,
+ &ARBDecompiler::Binary<SHR_S>,
+ &ARBDecompiler::Binary<AND_S>,
+ &ARBDecompiler::Binary<OR_S>,
+ &ARBDecompiler::Binary<XOR_S>,
+ &ARBDecompiler::Unary<NOT_S>,
+ &ARBDecompiler::BitfieldInsert<'S'>,
+ &ARBDecompiler::BitfieldExtract<'S'>,
+ &ARBDecompiler::Unary<BTC_S>,
+ &ARBDecompiler::Unary<BTFM_S>,
+
+ &ARBDecompiler::Binary<ADD_U>,
+ &ARBDecompiler::Binary<MUL_U>,
+ &ARBDecompiler::Binary<DIV_U>,
+ &ARBDecompiler::Binary<MIN_U>,
+ &ARBDecompiler::Binary<MAX_U>,
+ &ARBDecompiler::Unary<TRUNC_U>,
+ &ARBDecompiler::Unary<MOV_U>,
+ &ARBDecompiler::Binary<SHL_U>,
+ &ARBDecompiler::Binary<SHR_U>,
+ &ARBDecompiler::Binary<SHR_U>,
+ &ARBDecompiler::Binary<AND_U>,
+ &ARBDecompiler::Binary<OR_U>,
+ &ARBDecompiler::Binary<XOR_U>,
+ &ARBDecompiler::Unary<NOT_U>,
+ &ARBDecompiler::BitfieldInsert<'U'>,
+ &ARBDecompiler::BitfieldExtract<'U'>,
+ &ARBDecompiler::Unary<BTC_U>,
+ &ARBDecompiler::Unary<BTFM_U>,
+
+ &ARBDecompiler::HAdd2,
+ &ARBDecompiler::HMul2,
+ &ARBDecompiler::HFma2,
+ &ARBDecompiler::HAbsolute,
+ &ARBDecompiler::HNegate,
+ &ARBDecompiler::HClamp,
+ &ARBDecompiler::HCastFloat,
+ &ARBDecompiler::HUnpack,
+ &ARBDecompiler::HMergeF32,
+ &ARBDecompiler::HMergeH0,
+ &ARBDecompiler::HMergeH1,
+ &ARBDecompiler::HPack2,
+
+ &ARBDecompiler::LogicalAssign,
+ &ARBDecompiler::Binary<AND_U>,
+ &ARBDecompiler::Binary<OR_U>,
+ &ARBDecompiler::Binary<XOR_U>,
+ &ARBDecompiler::Unary<NOT_U>,
+ &ARBDecompiler::LogicalPick2,
+ &ARBDecompiler::LogicalAnd2,
+
+ &ARBDecompiler::FloatComparison<SLT_F, false>,
+ &ARBDecompiler::FloatComparison<SEQ_F, false>,
+ &ARBDecompiler::FloatComparison<SLE_F, false>,
+ &ARBDecompiler::FloatComparison<SGT_F, false>,
+ &ARBDecompiler::FloatComparison<SNE_F, false>,
+ &ARBDecompiler::FloatComparison<SGE_F, false>,
+ &ARBDecompiler::FloatOrdered,
+ &ARBDecompiler::FloatUnordered,
+ &ARBDecompiler::FloatComparison<SLT_F, true>,
+ &ARBDecompiler::FloatComparison<SEQ_F, true>,
+ &ARBDecompiler::FloatComparison<SLE_F, true>,
+ &ARBDecompiler::FloatComparison<SGT_F, true>,
+ &ARBDecompiler::FloatComparison<SNE_F, true>,
+ &ARBDecompiler::FloatComparison<SGE_F, true>,
+
+ &ARBDecompiler::Binary<SLT_S>,
+ &ARBDecompiler::Binary<SEQ_S>,
+ &ARBDecompiler::Binary<SLE_S>,
+ &ARBDecompiler::Binary<SGT_S>,
+ &ARBDecompiler::Binary<SNE_S>,
+ &ARBDecompiler::Binary<SGE_S>,
+
+ &ARBDecompiler::Binary<SLT_U>,
+ &ARBDecompiler::Binary<SEQ_U>,
+ &ARBDecompiler::Binary<SLE_U>,
+ &ARBDecompiler::Binary<SGT_U>,
+ &ARBDecompiler::Binary<SNE_U>,
+ &ARBDecompiler::Binary<SGE_U>,
+
+ &ARBDecompiler::LogicalAddCarry,
+
+ &ARBDecompiler::HalfComparison<SLT_F, false>,
+ &ARBDecompiler::HalfComparison<SEQ_F, false>,
+ &ARBDecompiler::HalfComparison<SLE_F, false>,
+ &ARBDecompiler::HalfComparison<SGT_F, false>,
+ &ARBDecompiler::HalfComparison<SNE_F, false>,
+ &ARBDecompiler::HalfComparison<SGE_F, false>,
+ &ARBDecompiler::HalfComparison<SLT_F, true>,
+ &ARBDecompiler::HalfComparison<SEQ_F, true>,
+ &ARBDecompiler::HalfComparison<SLE_F, true>,
+ &ARBDecompiler::HalfComparison<SGT_F, true>,
+ &ARBDecompiler::HalfComparison<SNE_F, true>,
+ &ARBDecompiler::HalfComparison<SGE_F, true>,
+
+ &ARBDecompiler::Texture,
+ &ARBDecompiler::Texture,
+ &ARBDecompiler::TextureGather,
+ &ARBDecompiler::TextureQueryDimensions,
+ &ARBDecompiler::TextureQueryLod,
+ &ARBDecompiler::TexelFetch,
+ &ARBDecompiler::TextureGradient,
+
+ &ARBDecompiler::ImageLoad,
+ &ARBDecompiler::ImageStore,
+
+ &ARBDecompiler::AtomicImage<ADD, U32>,
+ &ARBDecompiler::AtomicImage<AND, U32>,
+ &ARBDecompiler::AtomicImage<OR, U32>,
+ &ARBDecompiler::AtomicImage<XOR, U32>,
+ &ARBDecompiler::AtomicImage<EXCH, U32>,
+
+ &ARBDecompiler::Atomic<EXCH, U32>,
+ &ARBDecompiler::Atomic<ADD, U32>,
+ &ARBDecompiler::Atomic<MIN, U32>,
+ &ARBDecompiler::Atomic<MAX, U32>,
+ &ARBDecompiler::Atomic<AND, U32>,
+ &ARBDecompiler::Atomic<OR, U32>,
+ &ARBDecompiler::Atomic<XOR, U32>,
+
+ &ARBDecompiler::Atomic<EXCH, S32>,
+ &ARBDecompiler::Atomic<ADD, S32>,
+ &ARBDecompiler::Atomic<MIN, S32>,
+ &ARBDecompiler::Atomic<MAX, S32>,
+ &ARBDecompiler::Atomic<AND, S32>,
+ &ARBDecompiler::Atomic<OR, S32>,
+ &ARBDecompiler::Atomic<XOR, S32>,
+
+ &ARBDecompiler::Atomic<ADD, U32>,
+ &ARBDecompiler::Atomic<MIN, U32>,
+ &ARBDecompiler::Atomic<MAX, U32>,
+ &ARBDecompiler::Atomic<AND, U32>,
+ &ARBDecompiler::Atomic<OR, U32>,
+ &ARBDecompiler::Atomic<XOR, U32>,
+
+ &ARBDecompiler::Atomic<ADD, S32>,
+ &ARBDecompiler::Atomic<MIN, S32>,
+ &ARBDecompiler::Atomic<MAX, S32>,
+ &ARBDecompiler::Atomic<AND, S32>,
+ &ARBDecompiler::Atomic<OR, S32>,
+ &ARBDecompiler::Atomic<XOR, S32>,
+
+ &ARBDecompiler::Branch,
+ &ARBDecompiler::BranchIndirect,
+ &ARBDecompiler::PushFlowStack,
+ &ARBDecompiler::PopFlowStack,
+ &ARBDecompiler::Exit,
+ &ARBDecompiler::Discard,
+
+ &ARBDecompiler::EmitVertex,
+ &ARBDecompiler::EndPrimitive,
+
+ &ARBDecompiler::InvocationId,
+ &ARBDecompiler::YNegate,
+ &ARBDecompiler::LocalInvocationId<'x'>,
+ &ARBDecompiler::LocalInvocationId<'y'>,
+ &ARBDecompiler::LocalInvocationId<'z'>,
+ &ARBDecompiler::WorkGroupId<'x'>,
+ &ARBDecompiler::WorkGroupId<'y'>,
+ &ARBDecompiler::WorkGroupId<'z'>,
+
+ &ARBDecompiler::Unary<TGBALLOT_U>,
+ &ARBDecompiler::Unary<TGALL_U>,
+ &ARBDecompiler::Unary<TGANY_U>,
+ &ARBDecompiler::Unary<TGEQ_U>,
+
+ &ARBDecompiler::ThreadId,
+ &ARBDecompiler::ThreadMask<'e', 'q'>,
+ &ARBDecompiler::ThreadMask<'g', 'e'>,
+ &ARBDecompiler::ThreadMask<'g', 't'>,
+ &ARBDecompiler::ThreadMask<'l', 'e'>,
+ &ARBDecompiler::ThreadMask<'l', 't'>,
+ &ARBDecompiler::ShuffleIndexed,
+
+ &ARBDecompiler::Barrier,
+ &ARBDecompiler::MemoryBarrierGroup,
+ &ARBDecompiler::MemoryBarrierGlobal,
+ };
+};
+
+ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+ ShaderType stage, std::string_view identifier)
+ : device{device}, ir{ir}, registry{registry}, stage{stage} {
+ AddLine("TEMP RC;");
+ AddLine("TEMP FSWZA[4];");
+ AddLine("TEMP FSWZB[4];");
+ if (ir.IsDecompiled()) {
+ DecompileAST();
+ } else {
+ DecompileBranchMode();
+ }
+ AddLine("END");
+
+ const std::string code = std::move(shader_source);
+ DeclareHeader();
+ DeclareVertex();
+ DeclareGeometry();
+ DeclareFragment();
+ DeclareCompute();
+ DeclareInputAttributes();
+ DeclareOutputAttributes();
+ DeclareLocalMemory();
+ DeclareGlobalMemory();
+ DeclareConstantBuffers();
+ DeclareRegisters();
+ DeclareTemporaries();
+ DeclarePredicates();
+ DeclareInternalFlags();
+
+ shader_source += code;
+}
+
+std::string_view HeaderStageName(ShaderType stage) {
+ switch (stage) {
+ case ShaderType::Vertex:
+ return "vp";
+ case ShaderType::Geometry:
+ return "gp";
+ case ShaderType::Fragment:
+ return "fp";
+ case ShaderType::Compute:
+ return "cp";
+ default:
+ UNREACHABLE();
+ return "";
+ }
+}
+
+void ARBDecompiler::DeclareHeader() {
+ AddLine("!!NV{}5.0", HeaderStageName(stage));
+ // Enabling this allows us to cheat on some instructions like TXL with SHADOWARRAY2D
+ AddLine("OPTION NV_internal;");
+ AddLine("OPTION NV_gpu_program_fp64;");
+ AddLine("OPTION NV_shader_storage_buffer;");
+ AddLine("OPTION NV_shader_thread_group;");
+ if (ir.UsesWarps() && device.HasWarpIntrinsics()) {
+ AddLine("OPTION NV_shader_thread_shuffle;");
+ }
+ if (stage == ShaderType::Vertex) {
+ if (device.HasNvViewportArray2()) {
+ AddLine("OPTION NV_viewport_array2;");
+ }
+ }
+ if (stage == ShaderType::Fragment) {
+ AddLine("OPTION ARB_draw_buffers;");
+ }
+ if (device.HasImageLoadFormatted()) {
+ AddLine("OPTION EXT_shader_image_load_formatted;");
+ }
+}
+
+void ARBDecompiler::DeclareVertex() {
+ if (stage != ShaderType::Vertex) {
+ return;
+ }
+ AddLine("OUTPUT result_clip[] = {{ result.clip[0..7] }};");
+}
+
+void ARBDecompiler::DeclareGeometry() {
+ if (stage != ShaderType::Geometry) {
+ return;
+ }
+ const auto& info = registry.GetGraphicsInfo();
+ const auto& header = ir.GetHeader();
+ AddLine("PRIMITIVE_IN {};", PrimitiveDescription(info.primitive_topology));
+ AddLine("PRIMITIVE_OUT {};", TopologyName(header.common3.output_topology));
+ AddLine("VERTICES_OUT {};", header.common4.max_output_vertices.Value());
+ AddLine("ATTRIB vertex_position = vertex.position;");
+}
+
+void ARBDecompiler::DeclareFragment() {
+ if (stage != ShaderType::Fragment) {
+ return;
+ }
+ AddLine("OUTPUT result_color7 = result.color[7];");
+ AddLine("OUTPUT result_color6 = result.color[6];");
+ AddLine("OUTPUT result_color5 = result.color[5];");
+ AddLine("OUTPUT result_color4 = result.color[4];");
+ AddLine("OUTPUT result_color3 = result.color[3];");
+ AddLine("OUTPUT result_color2 = result.color[2];");
+ AddLine("OUTPUT result_color1 = result.color[1];");
+ AddLine("OUTPUT result_color0 = result.color;");
+}
+
+void ARBDecompiler::DeclareCompute() {
+ if (stage != ShaderType::Compute) {
+ return;
+ }
+ const ComputeInfo& info = registry.GetComputeInfo();
+ AddLine("GROUP_SIZE {} {} {};", info.workgroup_size[0], info.workgroup_size[1],
+ info.workgroup_size[2]);
+ if (info.shared_memory_size_in_words > 0) {
+ const u32 size_in_bytes = info.shared_memory_size_in_words * 4;
+ AddLine("SHARED_MEMORY {};", size_in_bytes);
+ AddLine("SHARED shared_mem[] = {{program.sharedmem}};");
+ }
+}
+
+void ARBDecompiler::DeclareInputAttributes() {
+ if (stage == ShaderType::Compute) {
+ return;
+ }
+ const std::string_view stage_name = StageInputName(stage);
+ for (const auto attribute : ir.GetInputAttributes()) {
+ if (!IsGenericAttribute(attribute)) {
+ continue;
+ }
+ const u32 index = GetGenericAttributeIndex(attribute);
+
+ std::string_view suffix;
+ if (stage == ShaderType::Fragment) {
+ const auto input_mode{ir.GetHeader().ps.GetPixelImap(index)};
+ if (input_mode == PixelImap::Unused) {
+ continue;
+ }
+ suffix = GetInputFlags(input_mode);
+ }
+ AddLine("{}ATTRIB in_attr{}[] = {{ {}.attrib[{}..{}] }};", suffix, index, stage_name, index,
+ index);
+ }
+}
+
+void ARBDecompiler::DeclareOutputAttributes() {
+ if (stage == ShaderType::Compute) {
+ return;
+ }
+ for (const auto attribute : ir.GetOutputAttributes()) {
+ if (!IsGenericAttribute(attribute)) {
+ continue;
+ }
+ const u32 index = GetGenericAttributeIndex(attribute);
+ AddLine("OUTPUT out_attr{}[] = {{ result.attrib[{}..{}] }};", index, index, index);
+ }
+}
+
+void ARBDecompiler::DeclareLocalMemory() {
+ u64 size = 0;
+ if (stage == ShaderType::Compute) {
+ size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL;
+ } else {
+ size = ir.GetHeader().GetLocalMemorySize();
+ }
+ if (size == 0) {
+ return;
+ }
+ const u64 element_count = Common::AlignUp(size, 4) / 4;
+ AddLine("TEMP lmem[{}];", element_count);
+}
+
+void ARBDecompiler::DeclareGlobalMemory() {
+ u32 binding = 0; // device.GetBaseBindings(stage).shader_storage_buffer;
+ for (const auto& pair : ir.GetGlobalMemory()) {
+ const auto& base = pair.first;
+ AddLine("STORAGE {}[] = {{ program.storage[{}] }};", GlobalMemoryName(base), binding);
+ ++binding;
+ }
+}
+
+void ARBDecompiler::DeclareConstantBuffers() {
+ u32 binding = 0;
+ for (const auto& cbuf : ir.GetConstantBuffers()) {
+ AddLine("CBUFFER cbuf{}[] = {{ program.buffer[{}] }};", cbuf.first, binding);
+ ++binding;
+ }
+}
+
+void ARBDecompiler::DeclareRegisters() {
+ for (const u32 gpr : ir.GetRegisters()) {
+ AddLine("TEMP R{};", gpr);
+ }
+}
+
+void ARBDecompiler::DeclareTemporaries() {
+ for (std::size_t i = 0; i < max_temporaries; ++i) {
+ AddLine("TEMP T{};", i);
+ }
+}
+
+void ARBDecompiler::DeclarePredicates() {
+ for (const Tegra::Shader::Pred pred : ir.GetPredicates()) {
+ AddLine("TEMP P{};", static_cast<u64>(pred));
+ }
+}
+
+void ARBDecompiler::DeclareInternalFlags() {
+ for (const char* name : INTERNAL_FLAG_NAMES) {
+ AddLine("TEMP {};", name);
+ }
+}
+
+void ARBDecompiler::InitializeVariables() {
+ AddLine("MOV.F32 FSWZA[0], -1;");
+ AddLine("MOV.F32 FSWZA[1], 1;");
+ AddLine("MOV.F32 FSWZA[2], -1;");
+ AddLine("MOV.F32 FSWZA[3], 0;");
+ AddLine("MOV.F32 FSWZB[0], -1;");
+ AddLine("MOV.F32 FSWZB[1], -1;");
+ AddLine("MOV.F32 FSWZB[2], 1;");
+ AddLine("MOV.F32 FSWZB[3], -1;");
+
+ if (stage == ShaderType::Vertex || stage == ShaderType::Geometry) {
+ AddLine("MOV.F result.position, {{0, 0, 0, 1}};");
+ }
+ for (const auto attribute : ir.GetOutputAttributes()) {
+ if (!IsGenericAttribute(attribute)) {
+ continue;
+ }
+ const u32 index = GetGenericAttributeIndex(attribute);
+ AddLine("MOV.F result.attrib[{}], {{0, 0, 0, 1}};", index);
+ }
+ for (const u32 gpr : ir.GetRegisters()) {
+ AddLine("MOV.F R{}, {{0, 0, 0, 0}};", gpr);
+ }
+ for (const Tegra::Shader::Pred pred : ir.GetPredicates()) {
+ AddLine("MOV.U P{}, {{0, 0, 0, 0}};", static_cast<u64>(pred));
+ }
+}
+
+void ARBDecompiler::DecompileAST() {
+ const u32 num_flow_variables = ir.GetASTNumVariables();
+ for (u32 i = 0; i < num_flow_variables; ++i) {
+ AddLine("TEMP F{};", i);
+ }
+ for (u32 i = 0; i < num_flow_variables; ++i) {
+ AddLine("MOV.U F{}, {{0, 0, 0, 0}};", i);
+ }
+
+ InitializeVariables();
+
+ VisitAST(ir.GetASTProgram());
+}
+
+void ARBDecompiler::DecompileBranchMode() {
+ static constexpr u32 FLOW_STACK_SIZE = 20;
+ if (!ir.IsFlowStackDisabled()) {
+ AddLine("TEMP SSY[{}];", FLOW_STACK_SIZE);
+ AddLine("TEMP PBK[{}];", FLOW_STACK_SIZE);
+ AddLine("TEMP SSY_TOP;");
+ AddLine("TEMP PBK_TOP;");
+ }
+
+ AddLine("TEMP PC;");
+
+ if (!ir.IsFlowStackDisabled()) {
+ AddLine("MOV.U SSY_TOP.x, 0;");
+ AddLine("MOV.U PBK_TOP.x, 0;");
+ }
+
+ InitializeVariables();
+
+ const auto basic_block_end = ir.GetBasicBlocks().end();
+ auto basic_block_it = ir.GetBasicBlocks().begin();
+ const u32 first_address = basic_block_it->first;
+ AddLine("MOV.U PC.x, {};", first_address);
+
+ AddLine("REP;");
+
+ std::size_t num_blocks = 0;
+ while (basic_block_it != basic_block_end) {
+ const auto& [address, bb] = *basic_block_it;
+ ++num_blocks;
+
+ AddLine("SEQ.S.CC RC.x, PC.x, {};", address);
+ AddLine("IF NE.x;");
+
+ VisitBlock(bb);
+
+ ++basic_block_it;
+
+ if (basic_block_it != basic_block_end) {
+ const auto op = std::get_if<OperationNode>(&*bb[bb.size() - 1]);
+ if (!op || op->GetCode() != OperationCode::Branch) {
+ const u32 next_address = basic_block_it->first;
+ AddLine("MOV.U PC.x, {};", next_address);
+ AddLine("CONT;");
+ }
+ }
+
+ AddLine("ELSE;");
+ }
+ AddLine("RET;");
+ while (num_blocks--) {
+ AddLine("ENDIF;");
+ }
+
+ AddLine("ENDREP;");
+}
+
+void ARBDecompiler::VisitAST(const ASTNode& node) {
+ if (const auto ast = std::get_if<ASTProgram>(&*node->GetInnerData())) {
+ for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+ VisitAST(current);
+ }
+ } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) {
+ const std::string condition = VisitExpression(ast->condition);
+ ResetTemporaries();
+
+ AddLine("MOVC.U RC.x, {};", condition);
+ AddLine("IF NE.x;");
+ for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+ VisitAST(current);
+ }
+ AddLine("ENDIF;");
+ } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) {
+ AddLine("ELSE;");
+ for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+ VisitAST(current);
+ }
+ } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) {
+ VisitBlock(ast->nodes);
+ } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) {
+ AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition));
+ ResetTemporaries();
+ } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) {
+ const std::string condition = VisitExpression(ast->condition);
+ ResetTemporaries();
+ AddLine("REP;");
+ for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) {
+ VisitAST(current);
+ }
+ AddLine("MOVC.U RC.x, {};", condition);
+ AddLine("BRK (NE.x);");
+ AddLine("ENDREP;");
+ } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) {
+ const bool is_true = ExprIsTrue(ast->condition);
+ if (!is_true) {
+ AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
+ AddLine("IF NE.x;");
+ ResetTemporaries();
+ }
+ if (ast->kills) {
+ AddLine("KIL TR;");
+ } else {
+ Exit();
+ }
+ if (!is_true) {
+ AddLine("ENDIF;");
+ }
+ } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) {
+ if (ExprIsTrue(ast->condition)) {
+ AddLine("BRK;");
+ } else {
+ AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition));
+ AddLine("BRK (NE.x);");
+ ResetTemporaries();
+ }
+ } else if (std::holds_alternative<ASTLabel>(*node->GetInnerData())) {
+ // Nothing to do
+ } else {
+ UNREACHABLE();
+ }
+}
+
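+// Lowers a flow-control expression to a scalar integer operand where nonzero
+// (-1/0xffffffff) means true and 0 means false. For example, (P0 && !F1)
+// would emit roughly the following, with T0/T1 standing for temporaries:
+//   CMP.S T0, F1.x, 0, -1;  # !F1: CMP selects 0 when F1.x < 0, else -1
+//   AND.U T1, P0.x, T0;     # P0 && !F1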
+std::string ARBDecompiler::VisitExpression(const Expr& node) {
+ if (const auto expr = std::get_if<ExprAnd>(&*node)) {
+ std::string result = AllocTemporary();
+ AddLine("AND.U {}, {}, {};", result, VisitExpression(expr->operand1),
+ VisitExpression(expr->operand2));
+ return result;
+ }
+ if (const auto expr = std::get_if<ExprOr>(&*node)) {
+ std::string result = AllocTemporary();
+ AddLine("OR.U {}, {}, {};", result, VisitExpression(expr->operand1),
+ VisitExpression(expr->operand2));
+ return result;
+ }
+ if (const auto expr = std::get_if<ExprNot>(&*node)) {
+ std::string result = AllocTemporary();
+ AddLine("CMP.S {}, {}, 0, -1;", result, VisitExpression(expr->operand1));
+ return result;
+ }
+ if (const auto expr = std::get_if<ExprPredicate>(&*node)) {
+ return fmt::format("P{}.x", static_cast<u64>(expr->predicate));
+ }
+ if (const auto expr = std::get_if<ExprCondCode>(&*node)) {
+ return Visit(ir.GetConditionCode(expr->cc));
+ }
+ if (const auto expr = std::get_if<ExprVar>(&*node)) {
+ return fmt::format("F{}.x", expr->var_index);
+ }
+ if (const auto expr = std::get_if<ExprBoolean>(&*node)) {
+ return expr->value ? "0xffffffff" : "0";
+ }
+ if (const auto expr = std::get_if<ExprGprEqual>(&*node)) {
+ std::string result = AllocTemporary();
+ AddLine("SEQ.U {}, R{}.x, {};", result, expr->gpr, expr->value);
+ return result;
+ }
+ UNREACHABLE();
+ return "0";
+}
+
+void ARBDecompiler::VisitBlock(const NodeBlock& bb) {
+ for (const auto& node : bb) {
+ Visit(node);
+ }
+}
+
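+// Central node dispatch: returns a scalar operand string for value nodes and
+// an empty string for statement-like nodes. Operations are routed through the
+// OPERATION_DECOMPILERS table, indexed by their OperationCode.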
+std::string ARBDecompiler::Visit(const Node& node) {
+ if (const auto operation = std::get_if<OperationNode>(&*node)) {
+ if (const auto amend_index = operation->GetAmendIndex()) {
+ Visit(ir.GetAmendNode(*amend_index));
+ }
+ const std::size_t index = static_cast<std::size_t>(operation->GetCode());
+ if (index >= OPERATION_DECOMPILERS.size()) {
+ UNREACHABLE_MSG("Out of bounds operation: {}", index);
+ return {};
+ }
+ const auto decompiler = OPERATION_DECOMPILERS[index];
+ if (decompiler == nullptr) {
+ UNREACHABLE_MSG("Undefined operation: {}", index);
+ return {};
+ }
+ return (this->*decompiler)(*operation);
+ }
+
+ if (const auto gpr = std::get_if<GprNode>(&*node)) {
+ const u32 index = gpr->GetIndex();
+ if (index == Register::ZeroIndex) {
+ return "{0, 0, 0, 0}.x";
+ }
+ return fmt::format("R{}.x", index);
+ }
+
+ if (const auto cv = std::get_if<CustomVarNode>(&*node)) {
+ return fmt::format("CV{}.x", cv->GetIndex());
+ }
+
+ if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
+ std::string temporary = AllocTemporary();
+ AddLine("MOV.U {}, {};", temporary, immediate->GetValue());
+ return temporary;
+ }
+
+ if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
+ std::string temporary = AllocTemporary();
+ switch (const auto index = predicate->GetIndex(); index) {
+ case Tegra::Shader::Pred::UnusedIndex:
+ AddLine("MOV.S {}, -1;", temporary);
+ break;
+ case Tegra::Shader::Pred::NeverExecute:
+ AddLine("MOV.S {}, 0;", temporary);
+ break;
+ default:
+ AddLine("MOV.S {}, P{}.x;", temporary, static_cast<u64>(index));
+ break;
+ }
+ if (predicate->IsNegated()) {
+ AddLine("CMP.S {}, {}, 0, -1;", temporary, temporary);
+ }
+ return temporary;
+ }
+
+ if (const auto abuf = std::get_if<AbufNode>(&*node)) {
+ if (abuf->IsPhysicalBuffer()) {
+ UNIMPLEMENTED_MSG("Physical buffers are not implemented");
+ return "{0, 0, 0, 0}.x";
+ }
+
+ const auto buffer_index = [this, &abuf]() -> std::string {
+ if (stage != ShaderType::Geometry) {
+ return "";
+ }
+ return fmt::format("[{}]", Visit(abuf->GetBuffer()));
+ };
+
+ const Attribute::Index index = abuf->GetIndex();
+ const u32 element = abuf->GetElement();
+ const char swizzle = Swizzle(element);
+ switch (index) {
+ case Attribute::Index::Position: {
+ if (stage == ShaderType::Geometry) {
+ return fmt::format("{}_position[{}].{}", StageInputName(stage),
+ Visit(abuf->GetBuffer()), swizzle);
+ } else {
+ return fmt::format("{}.position.{}", StageInputName(stage), swizzle);
+ }
+ }
+ case Attribute::Index::TessCoordInstanceIDVertexID:
+ ASSERT(stage == ShaderType::Vertex);
+ switch (element) {
+ case 2:
+ return "vertex.instance";
+ case 3:
+ return "vertex.id";
+ }
+ UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
+ break;
+ case Attribute::Index::PointCoord:
+ switch (element) {
+ case 0:
+ return "fragment.pointcoord.x";
+ case 1:
+ return "fragment.pointcoord.y";
+ }
+ UNIMPLEMENTED();
+ break;
+ case Attribute::Index::FrontFacing: {
+ ASSERT(stage == ShaderType::Fragment);
+ ASSERT(element == 3);
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("SGT.S RC.x, fragment.facing, {{0, 0, 0, 0}};");
+ AddLine("MOV.U.CC RC.x, -RC;");
+ AddLine("MOV.S {}.x, 0;", temporary);
+ AddLine("MOV.S {}.x (NE.x), -1;", temporary);
+ return fmt::format("{}.x", temporary);
+ }
+ default:
+ if (IsGenericAttribute(index)) {
+ if (stage == ShaderType::Geometry) {
+ return fmt::format("in_attr{}[{}][0].{}", GetGenericAttributeIndex(index),
+ Visit(abuf->GetBuffer()), swizzle);
+ } else {
+ return fmt::format("{}.attrib[{}].{}", StageInputName(stage),
+ GetGenericAttributeIndex(index), swizzle);
+ }
+ }
+ UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index));
+ break;
+ }
+ return "{0, 0, 0, 0}.x";
+ }
+
+ if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
+ std::string offset_string;
+ const auto& offset = cbuf->GetOffset();
+ if (const auto imm = std::get_if<ImmediateNode>(&*offset)) {
+ offset_string = std::to_string(imm->GetValue());
+ } else {
+ offset_string = Visit(offset);
+ }
+ std::string temporary = AllocTemporary();
+ AddLine("LDC.F32 {}, cbuf{}[{}];", temporary, cbuf->GetIndex(), offset_string);
+ return temporary;
+ }
+
+ if (const auto gmem = std::get_if<GmemNode>(&*node)) {
+ std::string temporary = AllocTemporary();
+ AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+ Visit(gmem->GetBaseAddress()));
+ AddLine("LDB.U32 {}, {}[{}];", temporary, GlobalMemoryName(gmem->GetDescriptor()),
+ temporary);
+ return temporary;
+ }
+
+ if (const auto lmem = std::get_if<LmemNode>(&*node)) {
+ std::string temporary = Visit(lmem->GetAddress());
+ AddLine("SHR.U {}, {}, 2;", temporary, temporary);
+ AddLine("MOV.U {}, lmem[{}].x;", temporary, temporary);
+ return temporary;
+ }
+
+ if (const auto smem = std::get_if<SmemNode>(&*node)) {
+ std::string temporary = Visit(smem->GetAddress());
+ AddLine("LDS.U32 {}, shared_mem[{}];", temporary, temporary);
+ return temporary;
+ }
+
+ if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
+ const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag());
+ return fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]);
+ }
+
+ if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
+ if (const auto amend_index = conditional->GetAmendIndex()) {
+ Visit(ir.GetAmendNode(*amend_index));
+ }
+ AddLine("MOVC.U RC.x, {};", Visit(conditional->GetCondition()));
+ AddLine("IF NE.x;");
+ VisitBlock(conditional->GetCode());
+ AddLine("ENDIF;");
+ return {};
+ }
+
+ if (const auto cmt = std::get_if<CommentNode>(&*node)) {
+ // Uncommenting this would generate invalid code: ARB assembly comments
+ // start with '#', not '//'.
+ // AddLine("// {}", cmt->GetText());
+ return {};
+ }
+
+ UNIMPLEMENTED();
+ return {};
+}
+
+std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
+ const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+ UNIMPLEMENTED_IF(meta.sampler.is_indexed);
+ UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array &&
+ meta.sampler.type == Tegra::Shader::TextureType::TextureCube);
+
+ const std::size_t count = operation.GetOperandsCount();
+ std::string temporary = AllocVectorTemporary();
+ std::size_t i = 0;
+ for (; i < count; ++i) {
+ AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
+ }
+ if (meta.sampler.is_array) {
+ AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array));
+ }
+ if (meta.sampler.is_shadow) {
+ AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare));
+ }
+ return {std::move(temporary), i};
+}
+
+std::string ARBDecompiler::BuildAoffi(Operation operation) {
+ const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+ if (meta.aoffi.empty()) {
+ return {};
+ }
+ const std::string temporary = AllocVectorTemporary();
+ std::size_t i = 0;
+ for (auto& node : meta.aoffi) {
+ AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i++), Visit(node));
+ }
+ return fmt::format(", offset({})", temporary);
+}
+
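+// Fragment outputs are read from consecutive registers, one per enabled
+// component of each render target. The depth output, when enabled, is two
+// registers past the last color register, hence the current_reg + 1 below.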
+void ARBDecompiler::Exit() {
+ if (stage != ShaderType::Fragment) {
+ AddLine("RET;");
+ return;
+ }
+
+ const auto safe_get_register = [this](u32 reg) -> std::string {
+ // TODO(Rodrigo): Replace with contains once C++20 releases
+ const auto& used_registers = ir.GetRegisters();
+ if (used_registers.find(reg) != used_registers.end()) {
+ return fmt::format("R{}.x", reg);
+ }
+ return "{0, 0, 0, 0}.x";
+ };
+
+ const auto& header = ir.GetHeader();
+ u32 current_reg = 0;
+ for (u32 rt = 0; rt < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; ++rt) {
+ for (u32 component = 0; component < 4; ++component) {
+ if (!header.ps.IsColorComponentOutputEnabled(rt, component)) {
+ continue;
+ }
+ AddLine("MOV.F result_color{}.{}, {};", rt, Swizzle(component),
+ safe_get_register(current_reg));
+ ++current_reg;
+ }
+ }
+ if (header.ps.omap.depth) {
+ AddLine("MOV.F result.depth.z, {};", safe_get_register(current_reg + 1));
+ }
+
+ AddLine("RET;");
+}
+
+std::string ARBDecompiler::Assign(Operation operation) {
+ const Node& dest = operation[0];
+ const Node& src = operation[1];
+
+ std::string dest_name;
+ if (const auto gpr = std::get_if<GprNode>(&*dest)) {
+ if (gpr->GetIndex() == Register::ZeroIndex) {
+ // Writing to Register::ZeroIndex is a no-op
+ return {};
+ }
+ dest_name = fmt::format("R{}.x", gpr->GetIndex());
+ } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
+ const u32 element = abuf->GetElement();
+ const char swizzle = Swizzle(element);
+ switch (const Attribute::Index index = abuf->GetIndex()) {
+ case Attribute::Index::Position:
+ dest_name = fmt::format("result.position.{}", swizzle);
+ break;
+ case Attribute::Index::LayerViewportPointSize:
+ switch (element) {
+ case 0:
+ UNIMPLEMENTED();
+ return {};
+ case 1:
+ case 2:
+ if (!device.HasNvViewportArray2()) {
+ LOG_ERROR(
+ Render_OpenGL,
+ "NV_viewport_array2 is missing. Maxwell gen 2 or better is required.");
+ return {};
+ }
+ dest_name = element == 1 ? "result.layer.x" : "result.viewport.x";
+ break;
+ case 3:
+ dest_name = "result.pointsize.x";
+ break;
+ }
+ break;
+ case Attribute::Index::ClipDistances0123:
+ dest_name = fmt::format("result.clip[{}].x", element);
+ break;
+ case Attribute::Index::ClipDistances4567:
+ dest_name = fmt::format("result.clip[{}].x", element + 4);
+ break;
+ default:
+ if (!IsGenericAttribute(index)) {
+ UNREACHABLE();
+ return {};
+ }
+ dest_name =
+ fmt::format("result.attrib[{}].{}", GetGenericAttributeIndex(index), swizzle);
+ break;
+ }
+ } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
+ const std::string address = Visit(lmem->GetAddress());
+ AddLine("SHR.U {}, {}, 2;", address, address);
+ dest_name = fmt::format("lmem[{}].x", address);
+ } else if (const auto smem = std::get_if<SmemNode>(&*dest)) {
+ AddLine("STS.U32 {}, shared_mem[{}];", Visit(src), Visit(smem->GetAddress()));
+ ResetTemporaries();
+ return {};
+ } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
+ const std::string temporary = AllocTemporary();
+ AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem->GetRealAddress()),
+ Visit(gmem->GetBaseAddress()));
+ AddLine("STB.U32 {}, {}[{}];", Visit(src), GlobalMemoryName(gmem->GetDescriptor()),
+ temporary);
+ ResetTemporaries();
+ return {};
+ } else {
+ UNREACHABLE();
+ ResetTemporaries();
+ return {};
+ }
+
+ AddLine("MOV.U {}, {};", dest_name, Visit(src));
+ ResetTemporaries();
+ return {};
+}
+
+std::string ARBDecompiler::Select(Operation operation) {
+ std::string temporary = AllocTemporary();
+ AddLine("CMP.S {}, {}, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]),
+ Visit(operation[2]));
+ return temporary;
+}
+
+std::string ARBDecompiler::FClamp(Operation operation) {
+ // 1.0f as raw bits; replace this with std::bit_cast once C++20 is available
+ static constexpr u32 POSITIVE_ONE = 0x3f800000;
+
+ std::string temporary = AllocTemporary();
+ const Node& value = operation[0];
+ const Node& low = operation[1];
+ const Node& high = operation[2];
+ const auto* const imm_low = std::get_if<ImmediateNode>(&*low);
+ const auto* const imm_high = std::get_if<ImmediateNode>(&*high);
+ if (imm_low && imm_high && imm_low->GetValue() == 0 && imm_high->GetValue() == POSITIVE_ONE) {
+ AddLine("MOV.F32.SAT {}, {};", temporary, Visit(value));
+ } else {
+ AddLine("MIN.F {}, {}, {};", temporary, Visit(value), Visit(high));
+ AddLine("MAX.F {}, {}, {};", temporary, temporary, Visit(low));
+ }
+ return temporary;
+}
+
+std::string ARBDecompiler::FCastHalf0(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("UP2H.F {}.x, {};", temporary, Visit(operation[0]));
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::FCastHalf1(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("UP2H.F {}.y, {};", temporary, Visit(operation[0]));
+ AddLine("MOV {}.x, {}.y;", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+}
+
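+// sqrt(x) is synthesized as RCP(RSQ(x)), i.e. 1 / (1 / sqrt(x)).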
+std::string ARBDecompiler::FSqrt(Operation operation) {
+ std::string temporary = AllocTemporary();
+ AddLine("RSQ.F32 {}, {};", temporary, Visit(operation[0]));
+ AddLine("RCP.F32 {}, {};", temporary, temporary);
+ return temporary;
+}
+
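+// FSWZA and FSWZB are the sign lookup tables initialized in
+// InitializeVariables. The mask operand is shifted by twice the thread id's
+// low bits; the resulting 2-bit value selects the sign applied to each input.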
+std::string ARBDecompiler::FSwizzleAdd(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ if (!device.HasWarpIntrinsics()) {
+ LOG_ERROR(Render_OpenGL,
+ "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+ AddLine("ADD.F {}.x, {}, {};", temporary, Visit(operation[0]), Visit(operation[1]));
+ return fmt::format("{}.x", temporary);
+ }
+
+ AddLine("AND.U {}.z, {}.threadid, 3;", temporary, StageInputName(stage));
+ AddLine("SHL.U {}.z, {}.z, 1;", temporary, temporary);
+ AddLine("SHR.U {}.z, {}, {}.z;", temporary, Visit(operation[2]), temporary);
+ AddLine("AND.U {}.z, {}.z, 3;", temporary, temporary);
+ AddLine("MUL.F32 {}.x, {}, FSWZA[{}.z];", temporary, Visit(operation[0]), temporary);
+ AddLine("MUL.F32 {}.y, {}, FSWZB[{}.z];", temporary, Visit(operation[1]), temporary);
+ AddLine("ADD.F32 {}.x, {}.x, {}.y;", temporary, temporary, temporary);
+ return fmt::format("{}.x", temporary);
+}
+
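+// The packed half operations share a pattern: UP2H unpacks two f16 values
+// from one component into .xy, the arithmetic runs with an .F16 modifier, and
+// PK2H packs the result back into a single component.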
+std::string ARBDecompiler::HAdd2(Operation operation) {
+ const std::string tmp1 = AllocVectorTemporary();
+ const std::string tmp2 = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
+ AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1]));
+ AddLine("ADD.F16 {}, {}, {};", tmp1, tmp1, tmp2);
+ AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
+ return fmt::format("{}.x", tmp1);
+}
+
+std::string ARBDecompiler::HMul2(Operation operation) {
+ const std::string tmp1 = AllocVectorTemporary();
+ const std::string tmp2 = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
+ AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1]));
+ AddLine("MUL.F16 {}, {}, {};", tmp1, tmp1, tmp2);
+ AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
+ return fmt::format("{}.x", tmp1);
+}
+
+std::string ARBDecompiler::HFma2(Operation operation) {
+ const std::string tmp1 = AllocVectorTemporary();
+ const std::string tmp2 = AllocVectorTemporary();
+ const std::string tmp3 = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
+ AddLine("UP2H.F {}.xy, {};", tmp2, Visit(operation[1]));
+ AddLine("UP2H.F {}.xy, {};", tmp3, Visit(operation[2]));
+ AddLine("MAD.F16 {}, {}, {}, {};", tmp1, tmp1, tmp2, tmp3);
+ AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
+ return fmt::format("{}.x", tmp1);
+}
+
+std::string ARBDecompiler::HAbsolute(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
+ AddLine("PK2H.F {}.x, |{}|;", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HNegate(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
+ AddLine("MOVC.S RC.x, {};", Visit(operation[1]));
+ AddLine("MOV.F {}.x (NE.x), -{}.x;", temporary, temporary);
+ AddLine("MOVC.S RC.x, {};", Visit(operation[2]));
+ AddLine("MOV.F {}.y (NE.x), -{}.y;", temporary, temporary);
+ AddLine("PK2H.F {}.x, {};", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HClamp(Operation operation) {
+ const std::string tmp1 = AllocVectorTemporary();
+ const std::string tmp2 = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", tmp1, Visit(operation[0]));
+ AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[1]));
+ AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2);
+ AddLine("MAX.F {}, {}, {};", tmp1, tmp1, tmp2);
+ AddLine("MOV.U {}.x, {};", tmp2, Visit(operation[2]));
+ AddLine("MOV.U {}.y, {}.x;", tmp2, tmp2);
+ AddLine("MIN.F {}, {}, {};", tmp1, tmp1, tmp2);
+ AddLine("PK2H.F {}.x, {};", tmp1, tmp1);
+ return fmt::format("{}.x", tmp1);
+}
+
+std::string ARBDecompiler::HCastFloat(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("MOV.F {}.y, {{0, 0, 0, 0}};", temporary);
+ AddLine("MOV.F {}.x, {};", temporary, Visit(operation[0]));
+ AddLine("PK2H.F {}.x, {};", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HUnpack(Operation operation) {
+ const std::string operand = Visit(operation[0]);
+ switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) {
+ case Tegra::Shader::HalfType::H0_H1:
+ return operand;
+ case Tegra::Shader::HalfType::F32: {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("MOV.U {}.x, {};", temporary, operand);
+ AddLine("MOV.U {}.y, {}.x;", temporary, temporary);
+ AddLine("PK2H.F {}.x, {};", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+ }
+ case Tegra::Shader::HalfType::H0_H0: {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", temporary, operand);
+ AddLine("MOV.U {}.y, {}.x;", temporary, temporary);
+ AddLine("PK2H.F {}.x, {};", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+ }
+ case Tegra::Shader::HalfType::H1_H1: {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", temporary, operand);
+ AddLine("MOV.U {}.x, {}.y;", temporary, temporary);
+ AddLine("PK2H.F {}.x, {};", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+ }
+ }
+ UNREACHABLE();
+ return "{0, 0, 0, 0}.x";
+}
+
+std::string ARBDecompiler::HMergeF32(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HMergeH0(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
+ AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1]));
+ AddLine("MOV.U {}.x, {}.z;", temporary, temporary);
+ AddLine("PK2H.F {}.x, {};", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HMergeH1(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("UP2H.F {}.xy, {};", temporary, Visit(operation[0]));
+ AddLine("UP2H.F {}.zw, {};", temporary, Visit(operation[1]));
+ AddLine("MOV.U {}.y, {}.w;", temporary, temporary);
+ AddLine("PK2H.F {}.x, {};", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::HPack2(Operation operation) {
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("MOV.U {}.x, {};", temporary, Visit(operation[0]));
+ AddLine("MOV.U {}.y, {};", temporary, Visit(operation[1]));
+ AddLine("PK2H.F {}.x, {};", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::LogicalAssign(Operation operation) {
+ const Node& dest = operation[0];
+ const Node& src = operation[1];
+
+ std::string target;
+
+ if (const auto pred = std::get_if<PredicateNode>(&*dest)) {
+ ASSERT_MSG(!pred->IsNegated(), "Negating logical assignment");
+
+ const Tegra::Shader::Pred index = pred->GetIndex();
+ switch (index) {
+ case Tegra::Shader::Pred::NeverExecute:
+ case Tegra::Shader::Pred::UnusedIndex:
+ // Writing to these predicates is a no-op
+ return {};
+ }
+ target = fmt::format("P{}.x", static_cast<u64>(index));
+ } else if (const auto internal_flag = std::get_if<InternalFlagNode>(&*dest)) {
+ const std::size_t index = static_cast<std::size_t>(internal_flag->GetFlag());
+ target = fmt::format("{}.x", INTERNAL_FLAG_NAMES[index]);
+ } else {
+ UNREACHABLE();
+ ResetTemporaries();
+ return {};
+ }
+
+ AddLine("MOV.U {}, {};", target, Visit(src));
+ ResetTemporaries();
+ return {};
+}
+
+std::string ARBDecompiler::LogicalPick2(Operation operation) {
+ std::string temporary = AllocTemporary();
+ const u32 index = std::get<ImmediateNode>(*operation[1]).GetValue();
+ AddLine("MOV.U {}, {}.{};", temporary, Visit(operation[0]), Swizzle(index));
+ return temporary;
+}
+
+std::string ARBDecompiler::LogicalAnd2(Operation operation) {
+ std::string temporary = AllocTemporary();
+ const std::string op = Visit(operation[0]);
+ AddLine("AND.U {}, {}.x, {}.y;", temporary, op, op);
+ return temporary;
+}
+
+std::string ARBDecompiler::FloatOrdered(Operation operation) {
+ std::string temporary = AllocTemporary();
+ AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
+ AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
+ AddLine("MOV.S {}, -1;", temporary);
+ AddLine("MOV.S {} (NAN.x), 0;", temporary);
+ AddLine("MOV.S {} (NAN.y), 0;", temporary);
+ return temporary;
+}
+
+std::string ARBDecompiler::FloatUnordered(Operation operation) {
+ std::string temporary = AllocTemporary();
+ AddLine("MOVC.F32 RC.x, {};", Visit(operation[0]));
+ AddLine("MOVC.F32 RC.y, {};", Visit(operation[1]));
+ AddLine("MOV.S {}, 0;", temporary);
+ AddLine("MOV.S {} (NAN.x), -1;", temporary);
+ AddLine("MOV.S {} (NAN.y), -1;", temporary);
+ return temporary;
+}
+
+std::string ARBDecompiler::LogicalAddCarry(Operation operation) {
+ std::string temporary = AllocTemporary();
+ AddLine("ADDC.U RC, {}, {};", Visit(operation[0]), Visit(operation[1]));
+ AddLine("MOV.S {}, 0;", temporary);
+ AddLine("IF CF.x;");
+ AddLine("MOV.S {}, -1;", temporary);
+ AddLine("ENDIF;");
+ return temporary;
+}
+
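+// When a coordinate slot is free, the bias or LOD rides in the .w component
+// of the coordinate vector; otherwise it is passed as an extra scalar operand.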
+std::string ARBDecompiler::Texture(Operation operation) {
+ const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+ const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+ const auto [temporary, coord_count] = BuildCoords(operation);
+
+ std::string_view opcode = "TEX";
+ std::string extra;
+ if (meta.bias) {
+ ASSERT(!meta.lod);
+ opcode = "TXB";
+
+ if (coord_count < 4) {
+ AddLine("MOV.F {}.w, {};", temporary, Visit(meta.bias));
+ } else {
+ const std::string bias = AllocTemporary();
+ AddLine("MOV.F {}, {};", bias, Visit(meta.bias));
+ extra = fmt::format(" {},", bias);
+ }
+ }
+ if (meta.lod) {
+ ASSERT(!meta.bias);
+ opcode = "TXL";
+
+ if (coord_count < 4) {
+ AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod));
+ } else {
+ const std::string lod = AllocTemporary();
+ AddLine("MOV.F {}, {};", lod, Visit(meta.lod));
+ extra = fmt::format(" {},", lod);
+ }
+ }
+
+ AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id,
+ TextureType(meta), BuildAoffi(operation));
+ AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::TextureGather(Operation operation) {
+ const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+ const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+ const auto [temporary, coord_count] = BuildCoords(operation);
+
+ std::string comp;
+ if (!meta.sampler.is_shadow) {
+ const auto& immediate = std::get<ImmediateNode>(*meta.component);
+ comp = fmt::format(".{}", Swizzle(immediate.GetValue()));
+ }
+
+ AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp,
+ TextureType(meta), BuildAoffi(operation));
+ AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::TextureQueryDimensions(Operation operation) {
+ const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+ const std::string temporary = AllocVectorTemporary();
+ const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+
+ ASSERT(!meta.sampler.is_array);
+
+ const std::string lod = operation.GetOperandsCount() > 0 ? Visit(operation[0]) : "0";
+ AddLine("TXQ {}, {}, texture[{}], {};", temporary, lod, sampler_id, TextureType(meta));
+ AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::TextureQueryLod(Operation operation) {
+ const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+ const std::string temporary = AllocVectorTemporary();
+ const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+
+ ASSERT(!meta.sampler.is_array);
+
+ const std::size_t count = operation.GetOperandsCount();
+ for (std::size_t i = 0; i < count; ++i) {
+ AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
+ }
+ AddLine("LOD.F {}, {}, texture[{}], {};", temporary, temporary, sampler_id, TextureType(meta));
+ AddLine("MUL.F32 {}, {}, {{256, 256, 0, 0}};", temporary, temporary);
+ AddLine("TRUNC.S {}, {};", temporary, temporary);
+ AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::TexelFetch(Operation operation) {
+ const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+ const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+ const auto [temporary, coord_count] = BuildCoords(operation);
+
+ if (!meta.sampler.is_buffer) {
+ ASSERT(coord_count < 4);
+ AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod));
+ }
+ AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, temporary, sampler_id, TextureType(meta),
+ BuildAoffi(operation));
+ AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::TextureGradient(Operation operation) {
+ const auto& meta = std::get<MetaTexture>(operation.GetMeta());
+ const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
+ const std::string ddx = AllocVectorTemporary();
+ const std::string ddy = AllocVectorTemporary();
+ const std::string coord = BuildCoords(operation).first;
+
+ const std::size_t num_components = meta.derivates.size() / 2;
+ for (std::size_t index = 0; index < num_components; ++index) {
+ const char swizzle = Swizzle(index);
+ AddLine("MOV.F {}.{}, {};", ddx, swizzle, Visit(meta.derivates[index * 2]));
+ AddLine("MOV.F {}.{}, {};", ddy, swizzle, Visit(meta.derivates[index * 2 + 1]));
+ }
+
+ const std::string_view result = coord;
+ AddLine("TXD.F {}, {}, {}, {}, texture[{}], {}{};", result, coord, ddx, ddy, sampler_id,
+ TextureType(meta), BuildAoffi(operation));
+ AddLine("MOV.F {}.x, {}.{};", result, result, Swizzle(meta.element));
+ return fmt::format("{}.x", result);
+}
+
+std::string ARBDecompiler::ImageLoad(Operation operation) {
+ const auto& meta = std::get<MetaImage>(operation.GetMeta());
+ const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+ const std::size_t count = operation.GetOperandsCount();
+ const std::string_view type = ImageType(meta.image.type);
+
+ const std::string temporary = AllocVectorTemporary();
+ for (std::size_t i = 0; i < count; ++i) {
+ AddLine("MOV.S {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
+ }
+ AddLine("LOADIM.F {}, {}, image[{}], {};", temporary, temporary, image_id, type);
+ AddLine("MOV.F {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::ImageStore(Operation operation) {
+ const auto& meta = std::get<MetaImage>(operation.GetMeta());
+ const u32 image_id = device.GetBaseBindings(stage).image + meta.image.index;
+ const std::size_t num_coords = operation.GetOperandsCount();
+ const std::size_t num_values = meta.values.size();
+ const std::string_view type = ImageType(meta.image.type);
+
+ const std::string coord = AllocVectorTemporary();
+ const std::string value = AllocVectorTemporary();
+ for (std::size_t i = 0; i < num_coords; ++i) {
+ AddLine("MOV.S {}.{}, {};", coord, Swizzle(i), Visit(operation[i]));
+ }
+ for (std::size_t i = 0; i < num_values; ++i) {
+ AddLine("MOV.F {}.{}, {};", value, Swizzle(i), Visit(meta.values[i]));
+ }
+ AddLine("STOREIM.F image[{}], {}, {}, {};", image_id, value, coord, type);
+ return {};
+}
+
+std::string ARBDecompiler::Branch(Operation operation) {
+ const auto target = std::get<ImmediateNode>(*operation[0]);
+ AddLine("MOV.U PC.x, {};", target.GetValue());
+ AddLine("CONT;");
+ return {};
+}
+
+std::string ARBDecompiler::BranchIndirect(Operation operation) {
+ AddLine("MOV.U PC.x, {};", Visit(operation[0]));
+ AddLine("CONT;");
+ return {};
+}
+
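+// SSY (sync) and PBK (break) targets are emulated with the software stacks
+// declared in DecompileBranchMode: a push stores the target PC and increments
+// *_TOP, a pop restores PC and re-enters the dispatch loop with CONT.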
+std::string ARBDecompiler::PushFlowStack(Operation operation) {
+ const auto stack = std::get<MetaStackClass>(operation.GetMeta());
+ const u32 target = std::get<ImmediateNode>(*operation[0]).GetValue();
+ const std::string_view stack_name = StackName(stack);
+ AddLine("MOV.U {}[{}_TOP.x].x, {};", stack_name, stack_name, target);
+ AddLine("ADD.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name);
+ return {};
+}
+
+std::string ARBDecompiler::PopFlowStack(Operation operation) {
+ const auto stack = std::get<MetaStackClass>(operation.GetMeta());
+ const std::string_view stack_name = StackName(stack);
+ AddLine("SUB.S {}_TOP.x, {}_TOP.x, 1;", stack_name, stack_name);
+ AddLine("MOV.U PC.x, {}[{}_TOP.x].x;", stack_name, stack_name);
+ AddLine("CONT;");
+ return {};
+}
+
+std::string ARBDecompiler::Exit(Operation) {
+ Exit();
+ return {};
+}
+
+std::string ARBDecompiler::Discard(Operation) {
+ AddLine("KIL TR;");
+ return {};
+}
+
+std::string ARBDecompiler::EmitVertex(Operation) {
+ AddLine("EMIT;");
+ return {};
+}
+
+std::string ARBDecompiler::EndPrimitive(Operation) {
+ AddLine("ENDPRIM;");
+ return {};
+}
+
+std::string ARBDecompiler::InvocationId(Operation) {
+ return "primitive.invocation";
+}
+
+std::string ARBDecompiler::YNegate(Operation) {
+ LOG_WARNING(Render_OpenGL, "(STUBBED)");
+ const std::string temporary = AllocTemporary();
+ AddLine("MOV.F {}, 1;", temporary);
+ return temporary;
+}
+
+std::string ARBDecompiler::ThreadId(Operation) {
+ return fmt::format("{}.threadid", StageInputName(stage));
+}
+
+std::string ARBDecompiler::ShuffleIndexed(Operation operation) {
+ if (!device.HasWarpIntrinsics()) {
+ LOG_ERROR(Render_OpenGL,
+ "NV_shader_thread_shuffle is missing. Kepler or better is required.");
+ return Visit(operation[0]);
+ }
+ const std::string temporary = AllocVectorTemporary();
+ AddLine("SHFIDX.U {}, {}, {}, {{31, 0, 0, 0}};", temporary, Visit(operation[0]),
+ Visit(operation[1]));
+ AddLine("MOV.U {}.x, {}.y;", temporary, temporary);
+ return fmt::format("{}.x", temporary);
+}
+
+std::string ARBDecompiler::Barrier(Operation) {
+ if (!ir.IsDecompiled()) {
+ LOG_ERROR(Render_OpenGL, "BAR used but shader is not decompiled");
+ return {};
+ }
+ AddLine("BAR;");
+ return {};
+}
+
+std::string ARBDecompiler::MemoryBarrierGroup(Operation) {
+ AddLine("MEMBAR.CTA;");
+ return {};
+}
+
+std::string ARBDecompiler::MemoryBarrierGlobal(Operation) {
+ AddLine("MEMBAR;");
+ return {};
+}
+
+} // Anonymous namespace
+
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+ const VideoCommon::Shader::Registry& registry,
+ Tegra::Engines::ShaderType stage, std::string_view identifier) {
+ return ARBDecompiler(device, ir, registry, stage, identifier).Code();
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.h b/src/video_core/renderer_opengl/gl_arb_decompiler.h
new file mode 100644
index 000000000..6afc87220
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+enum class ShaderType : u32;
+}
+
+namespace VideoCommon::Shader {
+class ShaderIR;
+class Registry;
+} // namespace VideoCommon::Shader
+
+namespace OpenGL {
+
+class Device;
+
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+ const VideoCommon::Shader::Registry& registry,
+ Tegra::Engines::ShaderType stage, std::string_view identifier);
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 9964ea894..d9f7b4cc6 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -22,22 +22,46 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;
MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
-CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size)
+Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
: VideoCommon::BufferBlock{cpu_addr, size} {
gl_buffer.Create();
glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
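+ // Make the buffer resident and query its GPU virtual address so vertex
+ // fetch can use NV_vertex_buffer_unified_memory addressing.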
+ if (device.HasVertexBufferUnifiedMemory()) {
+ glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
+ glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+ }
}
-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;
+
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
+ glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+ data);
+}
+
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
+ MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+ glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+ glGetNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+ data);
+}
+
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+ std::size_t size) const {
+ glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
+ static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+}
OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
- const Device& device, std::size_t stream_size)
- : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+ const Device& device_, std::size_t stream_size)
+ : GenericBufferCache{rasterizer, system,
+ std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
+ device{device_} {
if (!device.HasFastBufferSubData()) {
return;
}
- static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+ static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
for (const GLuint cbuf : cbufs) {
glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
@@ -48,44 +72,21 @@ OGLBufferCache::~OGLBufferCache() {
glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
}
-Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
- return std::make_shared<CachedBufferBlock>(cpu_addr, size);
-}
-
-GLuint OGLBufferCache::ToHandle(const Buffer& buffer) {
- return buffer->GetHandle();
-}
-
-GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
- return 0;
+std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+ return std::make_shared<Buffer>(device, cpu_addr, size);
}
-void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- const u8* data) {
- glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
- static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- u8* data) {
- MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
- glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
- glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
- static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
- std::size_t dst_offset, std::size_t size) {
- glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset),
- static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
+ return {0, 0, 0};
}
OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
std::size_t size) {
DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
- const GLuint& cbuf = cbufs[cbuf_cursor++];
+ const GLuint cbuf = cbufs[cbuf_cursor++];
+
glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
- return {cbuf, 0};
+ return {cbuf, 0, 0};
}
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index a9e86cfc7..59d95adbc 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -10,7 +10,6 @@
#include "common/common_types.h"
#include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
@@ -24,57 +23,57 @@ class Device;
class OGLStreamBuffer;
class RasterizerOpenGL;
-class CachedBufferBlock;
+class Buffer : public VideoCommon::BufferBlock {
+public:
+ explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
+ ~Buffer();
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
+ void Upload(std::size_t offset, std::size_t size, const u8* data) const;
-class CachedBufferBlock : public VideoCommon::BufferBlock {
-public:
- explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size);
- ~CachedBufferBlock();
+ void Download(std::size_t offset, std::size_t size, u8* data) const;
+
+ void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+ std::size_t size) const;
- GLuint GetHandle() const {
+ GLuint Handle() const noexcept {
return gl_buffer.handle;
}
+ u64 Address() const noexcept {
+ return gpu_address;
+ }
+
private:
OGLBuffer gl_buffer;
+ u64 gpu_address = 0;
};
+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
class OGLBufferCache final : public GenericBufferCache {
public:
explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
const Device& device, std::size_t stream_size);
~OGLBufferCache();
- GLuint GetEmptyBuffer(std::size_t) override;
+ BufferInfo GetEmptyBuffer(std::size_t) override;
void Acquire() noexcept {
cbuf_cursor = 0;
}
protected:
- Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
- GLuint ToHandle(const Buffer& buffer) override;
-
- void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- const u8* data) override;
-
- void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- u8* data) override;
-
- void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
- std::size_t dst_offset, std::size_t size) override;
+ std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
private:
+ static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+ Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+
+ const Device& device;
+
std::size_t cbuf_cursor = 0;
- std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
- Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
- cbufs;
+ std::array<GLuint, NUM_CBUFS> cbufs{};
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index a14641b97..208fc6167 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -123,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
u32 base_images = 0;
- // Reserve more image bindings on fragment and vertex stages.
+ // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8.
+ // Because of that limit, reserve at least 4 image bindings on the fragment
+ // stage and at least 1 on each of the remaining stages. So far games have
+ // been observed to use 1 image binding on vertex and 4 on fragment stages.
+
+ // Reserve at least 4 image bindings on the fragment stage.
bindings[4].image =
- Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]);
- bindings[0].image =
- Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]);
+ Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]);
+
+ // This is guaranteed to be at least 1.
+ const u32 total_extracted_images = num_images / (NumStages - 1);
// Reserve the other image bindings.
- const u32 total_extracted_images = num_images / (NumStages - 2);
- for (std::size_t i = 2; i < NumStages; ++i) {
+ for (std::size_t i = 0; i < NumStages; ++i) {
const std::size_t stage = stage_swizzle[i];
+ if (stage == 4) {
+ continue;
+ }
bindings[stage].image =
Extract(base_images, num_images, total_extracted_images, LimitImages[stage]);
}
@@ -170,7 +178,7 @@ bool IsASTCSupported() {
for (const GLenum format : formats) {
for (const GLenum support : required_support) {
GLint value;
- glGetInternalformativ(GL_TEXTURE_2D, format, support, 1, &value);
+ glGetInternalformativ(target, format, support, 1, &value);
if (value != GL_FULL_SUPPORT) {
return false;
}
@@ -185,6 +193,7 @@ bool IsASTCSupported() {
Device::Device()
: max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+ const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
const std::vector extensions = GetExtensions();
@@ -208,13 +217,21 @@ Device::Device()
has_shader_ballot = GLAD_GL_ARB_shader_ballot;
has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted");
+ has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");
has_astc = IsASTCSupported();
has_variable_aoffi = TestVariableAoffi();
has_component_indexing_bug = is_amd;
has_precise_bug = TestPreciseBug();
+ has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+ has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory;
+
+ // As of writing, only Nvidia's driver optimizes BufferSubData on exclusive
+ // uniform buffers as "push constants".
has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+
use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
- GLAD_GL_NV_compute_program5;
+ GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
+ GLAD_GL_NV_transform_feedback2;
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
@@ -235,6 +252,7 @@ Device::Device(std::nullptr_t) {
has_shader_ballot = true;
has_vertex_viewport_layer = true;
has_image_load_formatted = true;
+ has_texture_shadow_lod = true;
has_variable_aoffi = true;
}
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index 98cca0254..e1d811966 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -68,6 +68,14 @@ public:
return has_image_load_formatted;
}
+ bool HasTextureShadowLod() const {
+ return has_texture_shadow_lod;
+ }
+
+ bool HasVertexBufferUnifiedMemory() const {
+ return has_vertex_buffer_unified_memory;
+ }
+
bool HasASTC() const {
return has_astc;
}
@@ -88,6 +96,10 @@ public:
return has_fast_buffer_sub_data;
}
+ bool HasNvViewportArray2() const {
+ return has_nv_viewport_array2;
+ }
+
bool UseAssemblyShaders() const {
return use_assembly_shaders;
}
@@ -106,11 +118,14 @@ private:
bool has_shader_ballot{};
bool has_vertex_viewport_layer{};
bool has_image_load_formatted{};
+ bool has_texture_shadow_lod{};
+ bool has_vertex_buffer_unified_memory{};
bool has_astc{};
bool has_variable_aoffi{};
bool has_component_indexing_bug{};
bool has_precise_bug{};
bool has_fast_buffer_sub_data{};
+ bool has_nv_viewport_array2{};
bool use_assembly_shaders{};
};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 55e79aaf6..e960a0ef1 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -30,6 +30,7 @@
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/maxwell_to_gl.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
+#include "video_core/shader_cache.h"
namespace OpenGL {
@@ -60,15 +61,28 @@ constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
-constexpr std::size_t NumSupportedVertexAttributes = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
template <typename Engine, typename Entry>
Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
ShaderType shader_type, std::size_t index = 0) {
+ if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+ if (entry.is_separated) {
+ const u32 buffer_1 = entry.buffer;
+ const u32 buffer_2 = entry.secondary_buffer;
+ const u32 offset_1 = entry.offset;
+ const u32 offset_2 = entry.secondary_offset;
+ const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
+ const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
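+ // Separated samplers split the handle across two const buffer words;
+ // OR them together to reconstruct the full texture handle.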
+ return engine.GetTextureInfo(handle_1 | handle_2);
+ }
+ }
if (entry.is_bindless) {
- const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
- return engine.GetTextureInfo(tex_handle);
+ const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
+ return engine.GetTextureInfo(handle);
}
+
const auto& gpu_profile = engine.AccessGuestDriverProfile();
const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
@@ -93,6 +107,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
return buffer.size;
}
+/// Translates hardware transform feedback indices
+/// @param location Hardware location
+/// @return Pair of ARB_transform_feedback3 token stream first and third arguments
+/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt
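+/// Example: location 32 maps to index 8 (the first generic attribute) and
+/// yields {GL_GENERIC_ATTRIB_NV, 0}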
+std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
+ const u8 index = location / 4;
+ if (index >= 8 && index <= 39) {
+ return {GL_GENERIC_ATTRIB_NV, index - 8};
+ }
+ if (index >= 48 && index <= 55) {
+ return {GL_TEXTURE_COORD_NV, index - 48};
+ }
+ switch (index) {
+ case 7:
+ return {GL_POSITION, 0};
+ case 40:
+ return {GL_PRIMARY_COLOR_NV, 0};
+ case 41:
+ return {GL_SECONDARY_COLOR_NV, 0};
+ case 42:
+ return {GL_BACK_PRIMARY_COLOR_NV, 0};
+ case 43:
+ return {GL_BACK_SECONDARY_COLOR_NV, 0};
+ }
+ UNIMPLEMENTED_MSG("index={}", static_cast<int>(index));
+ return {GL_POSITION, 0};
+}
+
void oglEnable(GLenum cap, bool state) {
(state ? glEnable : glDisable)(cap);
}
@@ -152,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
// avoid OpenGL errors.
// TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
// assume every shader uses them all.
- for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+ for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
if (!flags[Dirty::VertexFormat0 + index]) {
continue;
}
@@ -171,9 +213,10 @@ void RasterizerOpenGL::SetupVertexFormat() {
if (attrib.type == Maxwell::VertexAttribute::Type::SignedInt ||
attrib.type == Maxwell::VertexAttribute::Type::UnsignedInt) {
glVertexAttribIFormat(gl_index, attrib.ComponentCount(),
- MaxwellToGL::VertexType(attrib), attrib.offset);
+ MaxwellToGL::VertexFormat(attrib), attrib.offset);
} else {
- glVertexAttribFormat(gl_index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
+ glVertexAttribFormat(gl_index, attrib.ComponentCount(),
+ MaxwellToGL::VertexFormat(attrib),
attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
}
glVertexAttribBinding(gl_index, attrib.buffer);
@@ -190,9 +233,11 @@ void RasterizerOpenGL::SetupVertexBuffer() {
MICROPROFILE_SCOPE(OpenGL_VB);
+ const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
+
// Upload all guest vertex arrays sequentially to our buffer
const auto& regs = gpu.regs;
- for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+ for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
if (!flags[Dirty::VertexBuffer0 + index]) {
continue;
}
@@ -205,16 +250,25 @@ void RasterizerOpenGL::SetupVertexBuffer() {
const GPUVAddr start = vertex_array.StartAddress();
const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
ASSERT(end >= start);
+
+ const GLuint gl_index = static_cast<GLuint>(index);
const u64 size = end - start;
if (size == 0) {
- glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride);
+ glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+ if (use_unified_memory) {
+ glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
+ }
continue;
}
- const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
- glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset,
- vertex_array.stride);
+ const auto info = buffer_cache.UploadMemory(start, size);
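+ // With unified memory, bind a null buffer object and point vertex fetch at
+ // the buffer's raw GPU address instead.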
+ if (use_unified_memory) {
+ glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+ glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
+ info.address + info.offset, size);
+ } else {
+ glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
+ }
}
}
@@ -227,7 +281,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
flags[Dirty::VertexInstances] = false;
const auto& regs = gpu.regs;
- for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+ for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
if (!flags[Dirty::VertexInstance0 + index]) {
continue;
}
@@ -244,9 +298,9 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
MICROPROFILE_SCOPE(OpenGL_Index);
const auto& regs = system.GPU().Maxwell3D().regs;
const std::size_t size = CalculateIndexBufferSize();
- const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
- glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer);
- return offset;
+ const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
+ glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
+ return info.offset;
}
void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
@@ -282,7 +336,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
continue;
}
- Shader shader{shader_cache.GetStageProgram(program)};
+ Shader* const shader = shader_cache.GetStageProgram(program);
if (device.UseAssemblyShaders()) {
// Check for ARB limitation. We only have 16 SSBOs per context state. To workaround this
@@ -576,7 +630,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
// Prepare the vertex array.
- buffer_cache.Map(buffer_size);
+ const bool invalidated = buffer_cache.Map(buffer_size);
+
+ if (invalidated) {
+ // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty
+ auto& dirty = gpu.dirty.flags;
+ dirty[Dirty::VertexBuffers] = true;
+ for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
+ dirty[index] = true;
+ }
+ }
// Prepare vertex array format.
SetupVertexFormat();
@@ -593,9 +656,9 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
if (!device.UseAssemblyShaders()) {
MaxwellUniformData ubo;
ubo.SetFromRegs(gpu);
- const auto [buffer, offset] =
+ const auto info =
buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
- glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
+ glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
static_cast<GLsizeiptr>(sizeof(ubo)));
}
@@ -842,7 +905,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
return true;
}
-void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
static constexpr std::array PARAMETER_LUT = {
GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
@@ -872,7 +935,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad
}
}
-void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
const auto& entries = kernel->GetEntries();
@@ -906,8 +969,7 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
if (device.UseAssemblyShaders()) {
glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
} else {
- glBindBufferRange(GL_UNIFORM_BUFFER, binding,
- buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
+ glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
}
return;
}
@@ -920,28 +982,29 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
const GPUVAddr gpu_addr = buffer.address;
- auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+ auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
if (device.UseAssemblyShaders()) {
UNIMPLEMENTED_IF(use_unified);
- if (offset != 0) {
+ if (info.offset != 0) {
const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
- glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size);
- cbuf = staging_cbuf;
- offset = 0;
+ glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
+ info.handle = staging_cbuf;
+ info.offset = 0;
}
- glBindBufferRangeNV(stage, binding, cbuf, offset, size);
+ glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
return;
}
if (use_unified) {
- glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size);
+ glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
+ unified_offset, size);
} else {
- glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+ glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
}
}
-void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};
@@ -956,7 +1019,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
}
}
-void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
@@ -973,13 +1036,12 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
GPUVAddr gpu_addr, std::size_t size) {
const auto alignment{device.GetShaderStorageBufferAlignment()};
- const auto [ssbo, buffer_offset] =
- buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
- glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset,
+ const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
static_cast<GLsizeiptr>(size));
}
-void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {
MICROPROFILE_SCOPE(OpenGL_Texture);
const auto& maxwell3d = system.GPU().Maxwell3D();
u32 binding = device.GetBaseBindings(stage_index).sampler;
@@ -992,7 +1054,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&
}
}
-void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) {
MICROPROFILE_SCOPE(OpenGL_Texture);
const auto& compute = system.GPU().KeplerCompute();
u32 binding = 0;
@@ -1021,7 +1083,7 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
}
}
-void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) {
const auto& maxwell3d = system.GPU().Maxwell3D();
u32 binding = device.GetBaseBindings(stage_index).image;
for (const auto& entry : shader->GetEntries().images) {
@@ -1031,7 +1093,7 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh
}
}
-void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
+void RasterizerOpenGL::SetupComputeImages(Shader* shader) {
const auto& compute = system.GPU().KeplerCompute();
u32 binding = 0;
for (const auto& entry : shader->GetEntries().images) {
@@ -1547,12 +1609,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
}
+void RasterizerOpenGL::SyncTransformFeedback() {
+ // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required. The UNIMPLEMENTED_IF_MSG below
+ // will fire when this becomes necessary.
+ const auto& regs = system.GPU().Maxwell3D().regs;
+
+ static constexpr std::size_t STRIDE = 3;
+ std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs;
+ std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams;
+
+ GLint* cursor = attribs.data();
+ GLint* current_stream = streams.data();
+
+ for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) {
+ const auto& layout = regs.tfb_layouts[feedback];
+ UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding");
+ if (layout.varying_count == 0) {
+ continue;
+ }
+
+ *current_stream = static_cast<GLint>(feedback);
+ if (current_stream != streams.data()) {
+ // When advancing to a new stream, emit the GL_NEXT_BUFFER_NV separator token
+ cursor[0] = GL_NEXT_BUFFER_NV;
+ cursor[1] = 0;
+ cursor[2] = 0;
+ cursor += STRIDE;
+ }
+ ++current_stream;
+
+ const auto& locations = regs.tfb_varying_locs[feedback];
+ std::optional<u8> current_index;
+ for (u32 offset = 0; offset < layout.varying_count; ++offset) {
+ const u8 location = locations[offset];
+ const u8 index = location / 4;
+
+ if (current_index == index) {
+ // Increase number of components of the previous attachment
+ ++cursor[-2];
+ continue;
+ }
+ current_index = index;
+
+ std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location);
+ cursor[1] = 1;
+ cursor += STRIDE;
+ }
+ }
+
+ const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE);
+ const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data());
+ glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(),
+ GL_INTERLEAVED_ATTRIBS);
+}
+
void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
const auto& regs = system.GPU().Maxwell3D().regs;
if (regs.tfb_enabled == 0) {
return;
}
+ if (device.UseAssemblyShaders()) {
+ SyncTransformFeedback();
+ }
+
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
@@ -1579,6 +1699,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
static_cast<GLsizeiptr>(size));
}
+ // We may have to call BeginTransformFeedbackNV here, since it seems to resolve to a
+ // different implementation in Nvidia's driver (the function pointer differs). But we are
+ // using ARB_transform_feedback3 features with NV_transform_feedback interactions, and the
+ // ARB extension doesn't define interactions for BeginTransformFeedback (without NV). In
+ // practice, the plain call just works.
glBeginTransformFeedback(GL_POINTS);
}
@@ -1600,8 +1724,9 @@ void RasterizerOpenGL::EndTransformFeedback() {
const GLuint handle = transform_feedback_buffers[index].handle;
const GPUVAddr gpu_addr = binding.Address();
const std::size_t size = binding.buffer_size;
- const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
- glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+ const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+ glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
+ static_cast<GLsizeiptr>(size));
}
}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index f5dc56a0e..4f082592f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -19,7 +19,6 @@
#include "video_core/engines/const_buffer_info.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/rasterizer_accelerated.h"
-#include "video_core/rasterizer_cache.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_device.h"
@@ -100,10 +99,10 @@ private:
void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil);
/// Configures the current constbuffers to use for the draw command.
- void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader);
+ void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
/// Configures the current constbuffers to use for the kernel invocation.
- void SetupComputeConstBuffers(const Shader& kernel);
+ void SetupComputeConstBuffers(Shader* kernel);
/// Configures a constant buffer.
void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
@@ -111,30 +110,30 @@ private:
std::size_t unified_offset);
/// Configures the current global memory entries to use for the draw command.
- void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
+ void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);
/// Configures the current global memory entries to use for the kernel invocation.
- void SetupComputeGlobalMemory(const Shader& kernel);
+ void SetupComputeGlobalMemory(Shader* kernel);
/// Configures a global memory buffer.
void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
std::size_t size);
/// Configures the current textures to use for the draw command.
- void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
+ void SetupDrawTextures(std::size_t stage_index, Shader* shader);
/// Configures the textures used in a compute shader.
- void SetupComputeTextures(const Shader& kernel);
+ void SetupComputeTextures(Shader* kernel);
/// Configures a texture.
void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
const SamplerEntry& entry);
/// Configures images in a graphics shader.
- void SetupDrawImages(std::size_t stage_index, const Shader& shader);
+ void SetupDrawImages(std::size_t stage_index, Shader* shader);
/// Configures images in a compute shader.
- void SetupComputeImages(const Shader& shader);
+ void SetupComputeImages(Shader* shader);
/// Configures an image.
void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
@@ -202,6 +201,10 @@ private:
/// Syncs the framebuffer sRGB state to match the guest state
void SyncFramebufferSRGB();
+ /// Syncs transform feedback state to match guest state
+ /// @note Only valid on assembly shaders
+ void SyncTransformFeedback();
+
/// Begin a transform feedback
void BeginTransformFeedback(GLenum primitive_mode);
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index a991ca64a..c6a3bf3a1 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -20,6 +20,7 @@
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/shader_type.h"
#include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
@@ -29,6 +30,7 @@
#include "video_core/shader/memory_util.h"
#include "video_core/shader/registry.h"
#include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
namespace OpenGL {
@@ -147,7 +149,8 @@ ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 u
auto program = std::make_shared<ProgramHandle>();
if (device.UseAssemblyShaders()) {
- const std::string arb = "Not implemented";
+ const std::string arb =
+ DecompileAssemblyShader(device, ir, registry, shader_type, shader_id);
GLuint& arb_prog = program->assembly_program.handle;
@@ -194,12 +197,9 @@ std::unordered_set<GLenum> GetSupportedFormats() {
} // Anonymous namespace
-CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
- std::shared_ptr<VideoCommon::Shader::Registry> registry,
- ShaderEntries entries, ProgramSharedPtr program_)
- : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
- size_in_bytes{size_in_bytes}, program{std::move(program_)} {
- // Assign either the assembly program or source program. We can't have both.
+Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_,
+ ProgramSharedPtr program_)
+ : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} {
handle = program->assembly_program.handle;
if (handle == 0) {
handle = program->source_program.handle;
@@ -207,16 +207,16 @@ CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
ASSERT(handle != 0);
}
-CachedShader::~CachedShader() = default;
+Shader::~Shader() = default;
-GLuint CachedShader::GetHandle() const {
+GLuint Shader::GetHandle() const {
DEBUG_ASSERT(registry->IsConsistent());
return handle;
}
-Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
- Maxwell::ShaderProgram program_type, ProgramCode code,
- ProgramCode code_b) {
+std::unique_ptr<Shader> Shader::CreateStageFromMemory(const ShaderParameters& params,
+ Maxwell::ShaderProgram program_type,
+ ProgramCode code, ProgramCode code_b) {
const auto shader_type = GetShaderType(program_type);
const std::size_t size_in_bytes = code.size() * sizeof(u64);
@@ -241,12 +241,12 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
entry.bindless_samplers = registry->GetBindlessSamplers();
params.disk_cache.SaveEntry(std::move(entry));
- return std::shared_ptr<CachedShader>(
- new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
- MakeEntries(params.device, ir, shader_type), std::move(program)));
+ return std::unique_ptr<Shader>(new Shader(
+ std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program)));
}
-Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
+std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params,
+ ProgramCode code) {
const std::size_t size_in_bytes = code.size() * sizeof(u64);
auto& engine = params.system.GPU().KeplerCompute();
@@ -266,23 +266,23 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
entry.bindless_samplers = registry->GetBindlessSamplers();
params.disk_cache.SaveEntry(std::move(entry));
- return std::shared_ptr<CachedShader>(
- new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry),
- MakeEntries(params.device, ir, ShaderType::Compute), std::move(program)));
+ return std::unique_ptr<Shader>(new Shader(std::move(registry),
+ MakeEntries(params.device, ir, ShaderType::Compute),
+ std::move(program)));
}
-Shader CachedShader::CreateFromCache(const ShaderParameters& params,
- const PrecompiledShader& precompiled_shader,
- std::size_t size_in_bytes) {
- return std::shared_ptr<CachedShader>(
- new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry,
- precompiled_shader.entries, precompiled_shader.program));
+std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params,
+ const PrecompiledShader& precompiled_shader) {
+ return std::unique_ptr<Shader>(new Shader(
+ precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program));
}
ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
Core::Frontend::EmuWindow& emu_window, const Device& device)
- : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device},
- disk_cache{system} {}
+ : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system},
+ emu_window{emu_window}, device{device}, disk_cache{system} {}
+
+ShaderCacheOpenGL::~ShaderCacheOpenGL() = default;
void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) {
@@ -436,7 +436,7 @@ ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
return program;
}
-Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
+Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) {
return last_shaders[static_cast<std::size_t>(program)];
}
@@ -446,8 +446,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
// Look up shader in the cache based on address
const auto cpu_addr{memory_manager.GpuToCpuAddress(address)};
- Shader shader{cpu_addr ? TryGet(*cpu_addr) : null_shader};
- if (shader) {
+ if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) {
return last_shaders[static_cast<std::size_t>(program)] = shader;
}
@@ -461,62 +460,64 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
const u8* host_ptr_b = memory_manager.GetPointer(address_b);
code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false);
}
+ const std::size_t code_size = code.size() * sizeof(u64);
- const auto unique_identifier = GetUniqueIdentifier(
+ const u64 unique_identifier = GetUniqueIdentifier(
GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
const ShaderParameters params{system, disk_cache, device,
*cpu_addr, host_ptr, unique_identifier};
+ std::unique_ptr<Shader> shader;
const auto found = runtime_cache.find(unique_identifier);
if (found == runtime_cache.end()) {
- shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
- std::move(code_b));
+ shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b));
} else {
- const std::size_t size_in_bytes = code.size() * sizeof(u64);
- shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+ shader = Shader::CreateFromCache(params, found->second);
}
+ Shader* const result = shader.get();
if (cpu_addr) {
- Register(shader);
+ Register(std::move(shader), *cpu_addr, code_size);
} else {
- null_shader = shader;
+ null_shader = std::move(shader);
}
- return last_shaders[static_cast<std::size_t>(program)] = shader;
+ return last_shaders[static_cast<std::size_t>(program)] = result;
}
-Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
+Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
auto& memory_manager{system.GPU().MemoryManager()};
const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)};
- auto kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel;
- if (kernel) {
+ if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) {
return kernel;
}
const auto host_ptr{memory_manager.GetPointer(code_addr)};
// No kernel found, create a new one
- auto code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
- const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
+ ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
+ const std::size_t code_size{code.size() * sizeof(u64)};
+ const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
const ShaderParameters params{system, disk_cache, device,
*cpu_addr, host_ptr, unique_identifier};
+ std::unique_ptr<Shader> kernel;
const auto found = runtime_cache.find(unique_identifier);
if (found == runtime_cache.end()) {
- kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
+ kernel = Shader::CreateKernelFromMemory(params, std::move(code));
} else {
- const std::size_t size_in_bytes = code.size() * sizeof(u64);
- kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+ kernel = Shader::CreateFromCache(params, found->second);
}
+ Shader* const result = kernel.get();
if (cpu_addr) {
- Register(kernel);
+ Register(std::move(kernel), *cpu_addr, code_size);
} else {
- null_kernel = kernel;
+ null_kernel = std::move(kernel);
}
- return kernel;
+ return result;
}
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index b2ae8d7f9..994aaeaf2 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -18,12 +18,12 @@
#include "common/common_types.h"
#include "video_core/engines/shader_type.h"
-#include "video_core/rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_disk_cache.h"
#include "video_core/shader/registry.h"
#include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
namespace Core {
class System;
@@ -35,12 +35,9 @@ class EmuWindow;
namespace OpenGL {
-class CachedShader;
class Device;
class RasterizerOpenGL;
-struct UnspecializedShader;
-using Shader = std::shared_ptr<CachedShader>;
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
struct ProgramHandle {
@@ -64,62 +61,53 @@ struct ShaderParameters {
u64 unique_identifier;
};
-class CachedShader final : public RasterizerCacheObject {
+class Shader final {
public:
- ~CachedShader();
+ ~Shader();
/// Gets the GL program handle for the shader
GLuint GetHandle() const;
- /// Returns the size in bytes of the shader
- std::size_t GetSizeInBytes() const override {
- return size_in_bytes;
- }
-
/// Gets the shader entries for the shader
const ShaderEntries& GetEntries() const {
return entries;
}
- static Shader CreateStageFromMemory(const ShaderParameters& params,
- Maxwell::ShaderProgram program_type,
- ProgramCode program_code, ProgramCode program_code_b);
- static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);
+ static std::unique_ptr<Shader> CreateStageFromMemory(const ShaderParameters& params,
+ Maxwell::ShaderProgram program_type,
+ ProgramCode program_code,
+ ProgramCode program_code_b);
+ static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params,
+ ProgramCode code);
- static Shader CreateFromCache(const ShaderParameters& params,
- const PrecompiledShader& precompiled_shader,
- std::size_t size_in_bytes);
+ static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params,
+ const PrecompiledShader& precompiled_shader);
private:
- explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
- std::shared_ptr<VideoCommon::Shader::Registry> registry,
- ShaderEntries entries, ProgramSharedPtr program);
+ explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries,
+ ProgramSharedPtr program);
std::shared_ptr<VideoCommon::Shader::Registry> registry;
ShaderEntries entries;
- std::size_t size_in_bytes = 0;
ProgramSharedPtr program;
GLuint handle = 0;
};
-class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
+class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> {
public:
explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
Core::Frontend::EmuWindow& emu_window, const Device& device);
+ ~ShaderCacheOpenGL() override;
/// Loads disk cache for the current game
void LoadDiskCache(const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback);
/// Gets the current specified shader stage program
- Shader GetStageProgram(Maxwell::ShaderProgram program);
+ Shader* GetStageProgram(Maxwell::ShaderProgram program);
/// Gets a compute kernel in the passed address
- Shader GetComputeKernel(GPUVAddr code_addr);
-
-protected:
- // We do not have to flush this cache as things in it are never modified by us.
- void FlushObjectInner(const Shader& object) override {}
+ Shader* GetComputeKernel(GPUVAddr code_addr);
private:
ProgramSharedPtr GeneratePrecompiledProgram(
@@ -132,10 +120,10 @@ private:
ShaderDiskCacheOpenGL disk_cache;
std::unordered_map<u64, PrecompiledShader> runtime_cache;
- Shader null_shader{};
- Shader null_kernel{};
+ std::unique_ptr<Shader> null_shader;
+ std::unique_ptr<Shader> null_kernel;
- std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+ std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
};
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index d6e30b321..2c49aeaac 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode;
using Tegra::Shader::IpaSampleMode;
using Tegra::Shader::PixelImap;
using Tegra::Shader::Register;
+using Tegra::Shader::TextureType;
using VideoCommon::Shader::BuildTransformFeedback;
using VideoCommon::Shader::Registry;
@@ -526,6 +527,9 @@ private:
if (device.HasImageLoadFormatted()) {
code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
}
+ if (device.HasTextureShadowLod()) {
+ code.AddLine("#extension GL_EXT_texture_shadow_lod : require");
+ }
if (device.HasWarpIntrinsics()) {
code.AddLine("#extension GL_NV_gpu_shader5 : require");
code.AddLine("#extension GL_NV_shader_thread_group : require");
@@ -909,13 +913,13 @@ private:
return "samplerBuffer";
}
switch (sampler.type) {
- case Tegra::Shader::TextureType::Texture1D:
+ case TextureType::Texture1D:
return "sampler1D";
- case Tegra::Shader::TextureType::Texture2D:
+ case TextureType::Texture2D:
return "sampler2D";
- case Tegra::Shader::TextureType::Texture3D:
+ case TextureType::Texture3D:
return "sampler3D";
- case Tegra::Shader::TextureType::TextureCube:
+ case TextureType::TextureCube:
return "samplerCube";
default:
UNREACHABLE();
@@ -1380,8 +1384,19 @@ private:
const std::size_t count = operation.GetOperandsCount();
const bool has_array = meta->sampler.is_array;
const bool has_shadow = meta->sampler.is_shadow;
+ const bool workaround_lod_array_shadow_as_grad =
+ !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow &&
+ ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+ meta->sampler.type == TextureType::TextureCube);
+
+ std::string expr = "texture";
+
+ if (workaround_lod_array_shadow_as_grad) {
+ expr += "Grad";
+ } else {
+ expr += function_suffix;
+ }
- std::string expr = "texture" + function_suffix;
if (!meta->aoffi.empty()) {
expr += "Offset";
} else if (!meta->ptp.empty()) {
@@ -1415,6 +1430,16 @@ private:
expr += ')';
}
+ if (workaround_lod_array_shadow_as_grad) {
+ switch (meta->sampler.type) {
+ case TextureType::Texture2D:
+ return expr + ", vec2(0.0), vec2(0.0))";
+ case TextureType::TextureCube:
+ return expr + ", vec3(0.0), vec3(0.0))";
+ }
+ UNREACHABLE();
+ }
+
for (const auto& variant : extras) {
if (const auto argument = std::get_if<TextureArgument>(&variant)) {
expr += GenerateTextureArgument(*argument);
@@ -2041,8 +2066,19 @@ private:
const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
ASSERT(meta);
- std::string expr = GenerateTexture(
- operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+ std::string expr{};
+
+ if (!device.HasTextureShadowLod() && meta->sampler.is_shadow &&
+ ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+ meta->sampler.type == TextureType::TextureCube)) {
+ LOG_ERROR(Render_OpenGL,
+ "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround");
+ expr = GenerateTexture(operation, "Lod", {});
+ } else {
+ expr = GenerateTexture(operation, "Lod",
+ {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+ }
+
if (meta->sampler.is_shadow) {
expr = "vec4(" + expr + ')';
}
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 9e95a122b..653c3f2f9 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap;
namespace {
+using VideoCommon::Shader::SeparateSamplerKey;
+
using ShaderCacheVersionHash = std::array<u8, 64>;
struct ConstBufferKey {
@@ -37,18 +39,26 @@ struct ConstBufferKey {
u32 value = 0;
};
-struct BoundSamplerKey {
+struct BoundSamplerEntry {
u32 offset = 0;
Tegra::Engines::SamplerDescriptor sampler;
};
-struct BindlessSamplerKey {
+struct SeparateSamplerEntry {
+ u32 cbuf1 = 0;
+ u32 cbuf2 = 0;
+ u32 offset1 = 0;
+ u32 offset2 = 0;
+ Tegra::Engines::SamplerDescriptor sampler;
+};
+
+struct BindlessSamplerEntry {
u32 cbuf = 0;
u32 offset = 0;
Tegra::Engines::SamplerDescriptor sampler;
};
-constexpr u32 NativeVersion = 20;
+constexpr u32 NativeVersion = 21;
ShaderCacheVersionHash GetShaderCacheVersionHash() {
ShaderCacheVersionHash hash{};
@@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
u32 texture_handler_size_value;
u32 num_keys;
u32 num_bound_samplers;
+ u32 num_separate_samplers;
u32 num_bindless_samplers;
if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
file.ReadArray(&texture_handler_size_value, 1) != 1 ||
file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
+ file.ReadArray(&num_separate_samplers, 1) != 1 ||
file.ReadArray(&num_bindless_samplers, 1) != 1) {
return false;
}
@@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
}
std::vector<ConstBufferKey> flat_keys(num_keys);
- std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
- std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
+ std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers);
+ std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers);
+ std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers);
if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
flat_bound_samplers.size() ||
+ file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) !=
+ flat_separate_samplers.size() ||
file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
flat_bindless_samplers.size()) {
return false;
}
- for (const auto& key : flat_keys) {
- keys.insert({{key.cbuf, key.offset}, key.value});
+ for (const auto& entry : flat_keys) {
+ keys.insert({{entry.cbuf, entry.offset}, entry.value});
}
- for (const auto& key : flat_bound_samplers) {
- bound_samplers.emplace(key.offset, key.sampler);
+ for (const auto& entry : flat_bound_samplers) {
+ bound_samplers.emplace(entry.offset, entry.sampler);
}
- for (const auto& key : flat_bindless_samplers) {
- bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+ for (const auto& entry : flat_separate_samplers) {
+ SeparateSamplerKey key;
+ key.buffers = {entry.cbuf1, entry.cbuf2};
+ key.offsets = {entry.offset1, entry.offset2};
+ separate_samplers.emplace(key, entry.sampler);
+ }
+ for (const auto& entry : flat_bindless_samplers) {
+ bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler});
}
return true;
@@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
+ file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 ||
file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
return false;
}
@@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
}
- std::vector<BoundSamplerKey> flat_bound_samplers;
+ std::vector<BoundSamplerEntry> flat_bound_samplers;
flat_bound_samplers.reserve(bound_samplers.size());
for (const auto& [address, sampler] : bound_samplers) {
- flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
+ flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler});
+ }
+
+ std::vector<SeparateSamplerEntry> flat_separate_samplers;
+ flat_separate_samplers.reserve(separate_samplers.size());
+ for (const auto& [key, sampler] : separate_samplers) {
+ SeparateSamplerEntry entry;
+ std::tie(entry.cbuf1, entry.cbuf2) = key.buffers;
+ std::tie(entry.offset1, entry.offset2) = key.offsets;
+ entry.sampler = sampler;
+ flat_separate_samplers.push_back(entry);
}
- std::vector<BindlessSamplerKey> flat_bindless_samplers;
+ std::vector<BindlessSamplerEntry> flat_bindless_samplers;
flat_bindless_samplers.reserve(bindless_samplers.size());
for (const auto& [address, sampler] : bindless_samplers) {
flat_bindless_samplers.push_back(
- BindlessSamplerKey{address.first, address.second, sampler});
+ BindlessSamplerEntry{address.first, address.second, sampler});
}
return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
flat_bound_samplers.size() &&
+ file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) ==
+ flat_separate_samplers.size() &&
file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
flat_bindless_samplers.size();
}
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index d5be52e40..a79cef0e9 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry {
VideoCommon::Shader::ComputeInfo compute_info;
VideoCommon::Shader::KeyMap keys;
VideoCommon::Shader::BoundSamplerMap bound_samplers;
+ VideoCommon::Shader::SeparateSamplerMap separate_samplers;
VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
};
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
index 6ec328c53..3655ff629 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -2,11 +2,13 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
-#include <deque>
+#include <tuple>
#include <vector>
+
#include "common/alignment.h"
#include "common/assert.h"
#include "common/microprofile.h"
+#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
@@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
namespace OpenGL {
-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent,
- bool use_persistent)
+OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage)
: buffer_size(size) {
gl_buffer.Create();
@@ -29,34 +30,22 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
allocate_size *= 2;
}
- if (use_persistent) {
- persistent = true;
- coherent = prefer_coherent;
- const GLbitfield flags =
- GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
- glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
- mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
- gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
- } else {
- glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW);
+ static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
+ glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
+ mapped_ptr = static_cast<u8*>(
+ glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
+
+ if (device.HasVertexBufferUnifiedMemory()) {
+ glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
+ glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
}
}
OGLStreamBuffer::~OGLStreamBuffer() {
- if (persistent) {
- glUnmapNamedBuffer(gl_buffer.handle);
- }
+ glUnmapNamedBuffer(gl_buffer.handle);
gl_buffer.Release();
}
-GLuint OGLStreamBuffer::GetHandle() const {
- return gl_buffer.handle;
-}
-
-GLsizeiptr OGLStreamBuffer::GetSize() const {
- return buffer_size;
-}
-
std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
ASSERT(size <= buffer_size);
ASSERT(alignment <= buffer_size);
@@ -68,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a
bool invalidate = false;
if (buffer_pos + size > buffer_size) {
+ MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
+ glInvalidateBufferData(gl_buffer.handle);
+
buffer_pos = 0;
invalidate = true;
-
- if (persistent) {
- glUnmapNamedBuffer(gl_buffer.handle);
- }
}
- if (invalidate || !persistent) {
- MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
- GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
- (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
- (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
- mapped_ptr = static_cast<u8*>(
- glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
- mapped_offset = buffer_pos;
- }
-
- return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+ return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);
}
void OGLStreamBuffer::Unmap(GLsizeiptr size) {
ASSERT(size <= mapped_size);
- if (!coherent && size > 0) {
- glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
- }
-
- if (!persistent) {
- glUnmapNamedBuffer(gl_buffer.handle);
+ if (size > 0) {
+ glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
}
buffer_pos += size;
diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h
index f8383cbd4..307a67113 100644
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -11,15 +11,13 @@
namespace OpenGL {
+class Device;
+
class OGLStreamBuffer : private NonCopyable {
public:
- explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false,
- bool use_persistent = true);
+ explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage);
~OGLStreamBuffer();
- GLuint GetHandle() const;
- GLsizeiptr GetSize() const;
-
/*
* Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
* and the optional alignment requirement.
@@ -32,15 +30,24 @@ public:
void Unmap(GLsizeiptr size);
+ GLuint Handle() const {
+ return gl_buffer.handle;
+ }
+
+ u64 Address() const {
+ return gpu_address;
+ }
+
+ GLsizeiptr Size() const noexcept {
+ return buffer_size;
+ }
+
private:
OGLBuffer gl_buffer;
- bool coherent = false;
- bool persistent = false;
-
+ GLuint64EXT gpu_address = 0;
GLintptr buffer_pos = 0;
GLsizeiptr buffer_size = 0;
- GLintptr mapped_offset = 0;
GLsizeiptr mapped_size = 0;
u8* mapped_ptr = nullptr;
};
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 57db5a08b..61505879b 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -263,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param
target = GetTextureTarget(params.target);
texture = CreateTexture(params, target, internal_format, texture_buffer);
DecorateSurfaceName();
- main_view = CreateViewInner(
- ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels),
- true);
+
+ u32 num_layers = 1;
+ if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+ num_layers = params.depth;
+ }
+
+ main_view =
+ CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true);
}
CachedSurface::~CachedSurface() = default;
@@ -413,20 +418,23 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p
CachedSurfaceView::~CachedSurfaceView() = default;
-void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
+void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const {
ASSERT(params.num_levels == 1);
+ if (params.target == SurfaceTarget::Texture3D) {
+ if (params.num_layers > 1) {
+ ASSERT(params.base_layer == 0);
+ glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level);
+ } else {
+ glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle,
+ params.base_level, params.base_layer);
+ }
+ return;
+ }
+
if (params.num_layers > 1) {
- // Layered framebuffer attachments
UNIMPLEMENTED_IF(params.base_layer != 0);
-
- switch (params.target) {
- case SurfaceTarget::Texture2DArray:
- glFramebufferTexture(target, attachment, GetTexture(), 0);
- break;
- default:
- UNIMPLEMENTED();
- }
+ glFramebufferTexture(fb_target, attachment, GetTexture(), 0);
return;
}
@@ -434,16 +442,16 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
const GLuint texture = surface.GetTexture();
switch (surface.GetSurfaceParams().target) {
case SurfaceTarget::Texture1D:
- glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
+ glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level);
break;
case SurfaceTarget::Texture2D:
- glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level);
+ glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level);
break;
case SurfaceTarget::Texture1DArray:
case SurfaceTarget::Texture2DArray:
case SurfaceTarget::TextureCubemap:
case SurfaceTarget::TextureCubeArray:
- glFramebufferTextureLayer(target, attachment, texture, params.base_level,
+ glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level,
params.base_layer);
break;
default:
@@ -500,8 +508,13 @@ OGLTextureView CachedSurfaceView::CreateTextureView() const {
OGLTextureView texture_view;
texture_view.Create();
- glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level,
- params.num_levels, params.base_layer, params.num_layers);
+ if (target == GL_TEXTURE_3D) {
+ glTextureView(texture_view.handle, target, surface.texture.handle, format,
+ params.base_level, params.num_levels, 0, 1);
+ } else {
+ glTextureView(texture_view.handle, target, surface.texture.handle, format,
+ params.base_level, params.num_levels, params.base_layer, params.num_layers);
+ }
ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle);
return texture_view;
@@ -544,8 +557,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
const Tegra::Engines::Fermi2D::Config& copy_config) {
const auto& src_params{src_view->GetSurfaceParams()};
const auto& dst_params{dst_view->GetSurfaceParams()};
- UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D);
- UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D);
+ UNIMPLEMENTED_IF(src_params.depth != 1);
+ UNIMPLEMENTED_IF(dst_params.depth != 1);
state_tracker.NotifyScissor0();
state_tracker.NotifyFramebuffer();
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 8a2ac8603..bfc4ddf5d 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -80,8 +80,10 @@ public:
explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy);
~CachedSurfaceView();
- /// Attaches this texture view to the current bound GL_DRAW_FRAMEBUFFER
- void Attach(GLenum attachment, GLenum target) const;
+ /// @brief Attaches this texture view to the currently bound fb_target framebuffer
+ /// @param attachment Attachment to bind textures to
+ /// @param fb_target Framebuffer target to attach to (e.g. DRAW_FRAMEBUFFER)
+ void Attach(GLenum attachment, GLenum fb_target) const;
GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
Tegra::Texture::SwizzleSource y_source,
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index 994ae98eb..774e70a5b 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -24,10 +24,11 @@ namespace MaxwellToGL {
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
+inline GLenum VertexFormat(Maxwell::VertexAttribute attrib) {
switch (attrib.type) {
- case Maxwell::VertexAttribute::Type::UnsignedInt:
case Maxwell::VertexAttribute::Type::UnsignedNorm:
+ case Maxwell::VertexAttribute::Type::UnsignedScaled:
+ case Maxwell::VertexAttribute::Type::UnsignedInt:
switch (attrib.size) {
case Maxwell::VertexAttribute::Size::Size_8:
case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -46,12 +47,11 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
return GL_UNSIGNED_INT;
case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
return GL_UNSIGNED_INT_2_10_10_10_REV;
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- return {};
}
- case Maxwell::VertexAttribute::Type::SignedInt:
+ break;
case Maxwell::VertexAttribute::Type::SignedNorm:
+ case Maxwell::VertexAttribute::Type::SignedScaled:
+ case Maxwell::VertexAttribute::Type::SignedInt:
switch (attrib.size) {
case Maxwell::VertexAttribute::Size::Size_8:
case Maxwell::VertexAttribute::Size::Size_8_8:
@@ -70,10 +70,8 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
return GL_INT;
case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
return GL_INT_2_10_10_10_REV;
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- return {};
}
+ break;
case Maxwell::VertexAttribute::Type::Float:
switch (attrib.size) {
case Maxwell::VertexAttribute::Size::Size_16:
@@ -86,46 +84,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) {
case Maxwell::VertexAttribute::Size::Size_32_32_32:
case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
return GL_FLOAT;
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- return {};
- }
- case Maxwell::VertexAttribute::Type::UnsignedScaled:
- switch (attrib.size) {
- case Maxwell::VertexAttribute::Size::Size_8:
- case Maxwell::VertexAttribute::Size::Size_8_8:
- case Maxwell::VertexAttribute::Size::Size_8_8_8:
- case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
- return GL_UNSIGNED_BYTE;
- case Maxwell::VertexAttribute::Size::Size_16:
- case Maxwell::VertexAttribute::Size::Size_16_16:
- case Maxwell::VertexAttribute::Size::Size_16_16_16:
- case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
- return GL_UNSIGNED_SHORT;
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- return {};
}
- case Maxwell::VertexAttribute::Type::SignedScaled:
- switch (attrib.size) {
- case Maxwell::VertexAttribute::Size::Size_8:
- case Maxwell::VertexAttribute::Size::Size_8_8:
- case Maxwell::VertexAttribute::Size::Size_8_8_8:
- case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
- return GL_BYTE;
- case Maxwell::VertexAttribute::Size::Size_16:
- case Maxwell::VertexAttribute::Size::Size_16_16:
- case Maxwell::VertexAttribute::Size::Size_16_16_16:
- case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
- return GL_SHORT;
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString());
- return {};
- }
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString());
- return {};
+ break;
}
+ UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", attrib.TypeString(),
+ attrib.SizeString());
+ return {};
}
inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
@@ -137,8 +101,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
case Maxwell::IndexFormat::UnsignedInt:
return GL_UNSIGNED_INT;
}
- LOG_CRITICAL(Render_OpenGL, "Unimplemented index_format={}", static_cast<u32>(index_format));
- UNREACHABLE();
+ UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format));
return {};
}
@@ -180,33 +143,32 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
}
inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
- Tegra::Texture::TextureMipmapFilter mip_filter_mode) {
+ Tegra::Texture::TextureMipmapFilter mipmap_filter_mode) {
switch (filter_mode) {
- case Tegra::Texture::TextureFilter::Linear: {
- switch (mip_filter_mode) {
+ case Tegra::Texture::TextureFilter::Nearest:
+ switch (mipmap_filter_mode) {
case Tegra::Texture::TextureMipmapFilter::None:
- return GL_LINEAR;
+ return GL_NEAREST;
case Tegra::Texture::TextureMipmapFilter::Nearest:
- return GL_LINEAR_MIPMAP_NEAREST;
+ return GL_NEAREST_MIPMAP_NEAREST;
case Tegra::Texture::TextureMipmapFilter::Linear:
- return GL_LINEAR_MIPMAP_LINEAR;
+ return GL_NEAREST_MIPMAP_LINEAR;
}
break;
- }
- case Tegra::Texture::TextureFilter::Nearest: {
- switch (mip_filter_mode) {
+ case Tegra::Texture::TextureFilter::Linear:
+ switch (mipmap_filter_mode) {
case Tegra::Texture::TextureMipmapFilter::None:
- return GL_NEAREST;
+ return GL_LINEAR;
case Tegra::Texture::TextureMipmapFilter::Nearest:
- return GL_NEAREST_MIPMAP_NEAREST;
+ return GL_LINEAR_MIPMAP_NEAREST;
case Tegra::Texture::TextureMipmapFilter::Linear:
- return GL_NEAREST_MIPMAP_LINEAR;
+ return GL_LINEAR_MIPMAP_LINEAR;
}
break;
}
- }
- LOG_ERROR(Render_OpenGL, "Unimplemented texture filter mode={}", static_cast<u32>(filter_mode));
- return GL_LINEAR;
+ UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}",
+ static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode));
+ return GL_NEAREST;
}
inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
@@ -229,10 +191,9 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
} else {
return GL_MIRROR_CLAMP_TO_EDGE;
}
- default:
- LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
- return GL_REPEAT;
}
+ UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
+ return GL_REPEAT;
}
inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
@@ -254,8 +215,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
case Tegra::Texture::DepthCompareFunc::Always:
return GL_ALWAYS;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented texture depth compare function ={}",
- static_cast<u32>(func));
+ UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func));
return GL_GREATER;
}
@@ -277,7 +237,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
case Maxwell::Blend::Equation::MaxGL:
return GL_MAX;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented blend equation={}", static_cast<u32>(equation));
+ UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation));
return GL_FUNC_ADD;
}
@@ -341,7 +301,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
case Maxwell::Blend::Factor::OneMinusConstantAlphaGL:
return GL_ONE_MINUS_CONSTANT_ALPHA;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented blend factor={}", static_cast<u32>(factor));
+ UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor));
return GL_ZERO;
}
@@ -361,7 +321,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
case Tegra::Texture::SwizzleSource::OneFloat:
return GL_ONE;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented swizzle source={}", static_cast<u32>(source));
+ UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source));
return GL_ZERO;
}
@@ -392,7 +352,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) {
case Maxwell::ComparisonOp::AlwaysOld:
return GL_ALWAYS;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented comparison op={}", static_cast<u32>(comparison));
+ UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison));
return GL_ALWAYS;
}
@@ -423,7 +383,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) {
case Maxwell::StencilOp::DecrWrapOGL:
return GL_DECR_WRAP;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented stencil op={}", static_cast<u32>(stencil));
+ UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil));
return GL_KEEP;
}
@@ -434,7 +394,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) {
case Maxwell::FrontFace::CounterClockWise:
return GL_CCW;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented front face cull={}", static_cast<u32>(front_face));
+ UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face));
return GL_CCW;
}
@@ -447,7 +407,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) {
case Maxwell::CullFace::FrontAndBack:
return GL_FRONT_AND_BACK;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented cull face={}", static_cast<u32>(cull_face));
+ UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face));
return GL_BACK;
}
@@ -486,7 +446,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
case Maxwell::LogicOperation::Set:
return GL_SET;
}
- LOG_ERROR(Render_OpenGL, "Unimplemented logic operation={}", static_cast<u32>(operation));
+ UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation));
return GL_COPY;
}
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 6214fcbc3..c40adb6e7 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -488,6 +488,15 @@ void RendererOpenGL::InitOpenGLObjects() {
// Clear screen to black
LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);
+
+ // Enable unified vertex attributes and query vertex buffer address when the driver supports it
+ if (device.HasVertexBufferUnifiedMemory()) {
+ glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
+
+ glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
+ glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
+ &vertex_buffer_address);
+ }
}
void RendererOpenGL::AddTelemetryFields() {
@@ -656,7 +665,13 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
offsetof(ScreenRectVertex, tex_coord));
glVertexAttribBinding(PositionLocation, 0);
glVertexAttribBinding(TexCoordLocation, 0);
- glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+ if (device.HasVertexBufferUnifiedMemory()) {
+ glBindVertexBuffer(0, 0, 0, sizeof(ScreenRectVertex));
+ glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, vertex_buffer_address,
+ sizeof(vertices));
+ } else {
+ glBindVertexBuffer(0, vertex_buffer.handle, 0, sizeof(ScreenRectVertex));
+ }
glBindTextureUnit(0, screen_info.display_texture);
glBindSampler(0, 0);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 61bf507f4..8b18d32e6 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -107,6 +107,9 @@ private:
OGLPipeline pipeline;
OGLFramebuffer screenshot_framebuffer;
+ // GPU address of the vertex buffer
+ GLuint64EXT vertex_buffer_address = 0;
+
/// Display information for Switch screen
ScreenInfo screen_info;
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
index 62e950d31..d7f1ae89f 100644
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -21,29 +21,29 @@ namespace Sampler {
VkFilter Filter(Tegra::Texture::TextureFilter filter) {
switch (filter) {
- case Tegra::Texture::TextureFilter::Linear:
- return VK_FILTER_LINEAR;
case Tegra::Texture::TextureFilter::Nearest:
return VK_FILTER_NEAREST;
+ case Tegra::Texture::TextureFilter::Linear:
+ return VK_FILTER_LINEAR;
}
- UNIMPLEMENTED_MSG("Unimplemented sampler filter={}", static_cast<u32>(filter));
+ UNREACHABLE_MSG("Invalid sampler filter={}", static_cast<u32>(filter));
return {};
}
VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter) {
switch (mipmap_filter) {
case Tegra::Texture::TextureMipmapFilter::None:
- // TODO(Rodrigo): None seems to be mapped to OpenGL's mag and min filters without mipmapping
- // (e.g. GL_NEAREST and GL_LINEAR). Vulkan doesn't have such a thing, find out if we have to
- // use an image view with a single mipmap level to emulate this.
- return VK_SAMPLER_MIPMAP_MODE_LINEAR;
- ;
- case Tegra::Texture::TextureMipmapFilter::Linear:
- return VK_SAMPLER_MIPMAP_MODE_LINEAR;
+ // There are no Vulkan filter modes that directly correspond to the OpenGL minification
+ // filters GL_LINEAR or GL_NEAREST, but they can be emulated with
+ // VK_SAMPLER_MIPMAP_MODE_NEAREST, minLod = 0, maxLod = 0.25, and minFilter =
+ // VK_FILTER_LINEAR or VK_FILTER_NEAREST, respectively.
+ return VK_SAMPLER_MIPMAP_MODE_NEAREST;
case Tegra::Texture::TextureMipmapFilter::Nearest:
return VK_SAMPLER_MIPMAP_MODE_NEAREST;
+ case Tegra::Texture::TextureMipmapFilter::Linear:
+ return VK_SAMPLER_MIPMAP_MODE_LINEAR;
}
- UNIMPLEMENTED_MSG("Unimplemented sampler mipmap mode={}", static_cast<u32>(mipmap_filter));
+ UNREACHABLE_MSG("Invalid sampler mipmap mode={}", static_cast<u32>(mipmap_filter));
return {};
}
@@ -78,10 +78,9 @@ VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode w
case Tegra::Texture::WrapMode::MirrorOnceBorder:
UNIMPLEMENTED();
return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE;
- default:
- UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode));
- return {};
}
+ UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode));
+ return {};
}
VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) {
@@ -288,14 +287,35 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device,
return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
case Maxwell::PrimitiveTopology::Patches:
return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST;
- default:
- UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
- return {};
}
+ UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
+ return {};
}
VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) {
switch (type) {
+ case Maxwell::VertexAttribute::Type::UnsignedNorm:
+ switch (size) {
+ case Maxwell::VertexAttribute::Size::Size_8:
+ return VK_FORMAT_R8_UNORM;
+ case Maxwell::VertexAttribute::Size::Size_8_8:
+ return VK_FORMAT_R8G8_UNORM;
+ case Maxwell::VertexAttribute::Size::Size_8_8_8:
+ return VK_FORMAT_R8G8B8_UNORM;
+ case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
+ return VK_FORMAT_R8G8B8A8_UNORM;
+ case Maxwell::VertexAttribute::Size::Size_16:
+ return VK_FORMAT_R16_UNORM;
+ case Maxwell::VertexAttribute::Size::Size_16_16:
+ return VK_FORMAT_R16G16_UNORM;
+ case Maxwell::VertexAttribute::Size::Size_16_16_16:
+ return VK_FORMAT_R16G16B16_UNORM;
+ case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
+ return VK_FORMAT_R16G16B16A16_UNORM;
+ case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+ return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
+ }
+ break;
case Maxwell::VertexAttribute::Type::SignedNorm:
switch (size) {
case Maxwell::VertexAttribute::Size::Size_8:
@@ -316,62 +336,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
return VK_FORMAT_R16G16B16A16_SNORM;
case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
return VK_FORMAT_A2B10G10R10_SNORM_PACK32;
- default:
- break;
}
break;
- case Maxwell::VertexAttribute::Type::UnsignedNorm:
+ case Maxwell::VertexAttribute::Type::UnsignedScaled:
switch (size) {
case Maxwell::VertexAttribute::Size::Size_8:
- return VK_FORMAT_R8_UNORM;
+ return VK_FORMAT_R8_USCALED;
case Maxwell::VertexAttribute::Size::Size_8_8:
- return VK_FORMAT_R8G8_UNORM;
+ return VK_FORMAT_R8G8_USCALED;
case Maxwell::VertexAttribute::Size::Size_8_8_8:
- return VK_FORMAT_R8G8B8_UNORM;
+ return VK_FORMAT_R8G8B8_USCALED;
case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
- return VK_FORMAT_R8G8B8A8_UNORM;
+ return VK_FORMAT_R8G8B8A8_USCALED;
case Maxwell::VertexAttribute::Size::Size_16:
- return VK_FORMAT_R16_UNORM;
+ return VK_FORMAT_R16_USCALED;
case Maxwell::VertexAttribute::Size::Size_16_16:
- return VK_FORMAT_R16G16_UNORM;
+ return VK_FORMAT_R16G16_USCALED;
case Maxwell::VertexAttribute::Size::Size_16_16_16:
- return VK_FORMAT_R16G16B16_UNORM;
+ return VK_FORMAT_R16G16B16_USCALED;
case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
- return VK_FORMAT_R16G16B16A16_UNORM;
+ return VK_FORMAT_R16G16B16A16_USCALED;
case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
- return VK_FORMAT_A2B10G10R10_UNORM_PACK32;
- default:
- break;
+ return VK_FORMAT_A2B10G10R10_USCALED_PACK32;
}
break;
- case Maxwell::VertexAttribute::Type::SignedInt:
+ case Maxwell::VertexAttribute::Type::SignedScaled:
switch (size) {
case Maxwell::VertexAttribute::Size::Size_8:
- return VK_FORMAT_R8_SINT;
+ return VK_FORMAT_R8_SSCALED;
case Maxwell::VertexAttribute::Size::Size_8_8:
- return VK_FORMAT_R8G8_SINT;
+ return VK_FORMAT_R8G8_SSCALED;
case Maxwell::VertexAttribute::Size::Size_8_8_8:
- return VK_FORMAT_R8G8B8_SINT;
+ return VK_FORMAT_R8G8B8_SSCALED;
case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
- return VK_FORMAT_R8G8B8A8_SINT;
+ return VK_FORMAT_R8G8B8A8_SSCALED;
case Maxwell::VertexAttribute::Size::Size_16:
- return VK_FORMAT_R16_SINT;
+ return VK_FORMAT_R16_SSCALED;
case Maxwell::VertexAttribute::Size::Size_16_16:
- return VK_FORMAT_R16G16_SINT;
+ return VK_FORMAT_R16G16_SSCALED;
case Maxwell::VertexAttribute::Size::Size_16_16_16:
- return VK_FORMAT_R16G16B16_SINT;
+ return VK_FORMAT_R16G16B16_SSCALED;
case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
- return VK_FORMAT_R16G16B16A16_SINT;
- case Maxwell::VertexAttribute::Size::Size_32:
- return VK_FORMAT_R32_SINT;
- case Maxwell::VertexAttribute::Size::Size_32_32:
- return VK_FORMAT_R32G32_SINT;
- case Maxwell::VertexAttribute::Size::Size_32_32_32:
- return VK_FORMAT_R32G32B32_SINT;
- case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
- return VK_FORMAT_R32G32B32A32_SINT;
- default:
- break;
+ return VK_FORMAT_R16G16B16A16_SSCALED;
+ case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+ return VK_FORMAT_A2B10G10R10_SSCALED_PACK32;
}
break;
case Maxwell::VertexAttribute::Type::UnsignedInt:
@@ -400,56 +408,50 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
return VK_FORMAT_R32G32B32_UINT;
case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
return VK_FORMAT_R32G32B32A32_UINT;
- default:
- break;
+ case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+ return VK_FORMAT_A2B10G10R10_UINT_PACK32;
}
break;
- case Maxwell::VertexAttribute::Type::UnsignedScaled:
+ case Maxwell::VertexAttribute::Type::SignedInt:
switch (size) {
case Maxwell::VertexAttribute::Size::Size_8:
- return VK_FORMAT_R8_USCALED;
+ return VK_FORMAT_R8_SINT;
case Maxwell::VertexAttribute::Size::Size_8_8:
- return VK_FORMAT_R8G8_USCALED;
+ return VK_FORMAT_R8G8_SINT;
case Maxwell::VertexAttribute::Size::Size_8_8_8:
- return VK_FORMAT_R8G8B8_USCALED;
+ return VK_FORMAT_R8G8B8_SINT;
case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
- return VK_FORMAT_R8G8B8A8_USCALED;
+ return VK_FORMAT_R8G8B8A8_SINT;
case Maxwell::VertexAttribute::Size::Size_16:
- return VK_FORMAT_R16_USCALED;
+ return VK_FORMAT_R16_SINT;
case Maxwell::VertexAttribute::Size::Size_16_16:
- return VK_FORMAT_R16G16_USCALED;
+ return VK_FORMAT_R16G16_SINT;
case Maxwell::VertexAttribute::Size::Size_16_16_16:
- return VK_FORMAT_R16G16B16_USCALED;
+ return VK_FORMAT_R16G16B16_SINT;
case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
- return VK_FORMAT_R16G16B16A16_USCALED;
- default:
- break;
+ return VK_FORMAT_R16G16B16A16_SINT;
+ case Maxwell::VertexAttribute::Size::Size_32:
+ return VK_FORMAT_R32_SINT;
+ case Maxwell::VertexAttribute::Size::Size_32_32:
+ return VK_FORMAT_R32G32_SINT;
+ case Maxwell::VertexAttribute::Size::Size_32_32_32:
+ return VK_FORMAT_R32G32B32_SINT;
+ case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
+ return VK_FORMAT_R32G32B32A32_SINT;
+ case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+ return VK_FORMAT_A2B10G10R10_SINT_PACK32;
}
break;
- case Maxwell::VertexAttribute::Type::SignedScaled:
+ case Maxwell::VertexAttribute::Type::Float:
switch (size) {
- case Maxwell::VertexAttribute::Size::Size_8:
- return VK_FORMAT_R8_SSCALED;
- case Maxwell::VertexAttribute::Size::Size_8_8:
- return VK_FORMAT_R8G8_SSCALED;
- case Maxwell::VertexAttribute::Size::Size_8_8_8:
- return VK_FORMAT_R8G8B8_SSCALED;
- case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
- return VK_FORMAT_R8G8B8A8_SSCALED;
case Maxwell::VertexAttribute::Size::Size_16:
- return VK_FORMAT_R16_SSCALED;
+ return VK_FORMAT_R16_SFLOAT;
case Maxwell::VertexAttribute::Size::Size_16_16:
- return VK_FORMAT_R16G16_SSCALED;
+ return VK_FORMAT_R16G16_SFLOAT;
case Maxwell::VertexAttribute::Size::Size_16_16_16:
- return VK_FORMAT_R16G16B16_SSCALED;
+ return VK_FORMAT_R16G16B16_SFLOAT;
case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
- return VK_FORMAT_R16G16B16A16_SSCALED;
- default:
- break;
- }
- break;
- case Maxwell::VertexAttribute::Type::Float:
- switch (size) {
+ return VK_FORMAT_R16G16B16A16_SFLOAT;
case Maxwell::VertexAttribute::Size::Size_32:
return VK_FORMAT_R32_SFLOAT;
case Maxwell::VertexAttribute::Size::Size_32_32:
@@ -458,16 +460,6 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
return VK_FORMAT_R32G32B32_SFLOAT;
case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
return VK_FORMAT_R32G32B32A32_SFLOAT;
- case Maxwell::VertexAttribute::Size::Size_16:
- return VK_FORMAT_R16_SFLOAT;
- case Maxwell::VertexAttribute::Size::Size_16_16:
- return VK_FORMAT_R16G16_SFLOAT;
- case Maxwell::VertexAttribute::Size::Size_16_16_16:
- return VK_FORMAT_R16G16B16_SFLOAT;
- case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
- return VK_FORMAT_R16G16B16A16_SFLOAT;
- default:
- break;
}
break;
}
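
A pattern worth noting across the hunks above: the default labels are removed so each switch enumerates its enum exhaustively, letting compilers with -Wswitch warn when a new enumerator is added, while the UNIMPLEMENTED_MSG/UNREACHABLE_MSG moved after the switch still catches corrupted values at runtime. A minimal sketch of the idiom (Filter is a stand-in enum, not a name from this change; assumes the Vulkan headers and the project's UNREACHABLE_MSG macro):

    enum class Filter { Nearest, Linear };

    VkFilter ToVkFilter(Filter filter) {
        switch (filter) { // no default: -Wswitch flags unhandled enumerators
        case Filter::Nearest:
            return VK_FILTER_NEAREST;
        case Filter::Linear:
            return VK_FILTER_LINEAR;
        }
        UNREACHABLE_MSG("Invalid filter={}", static_cast<int>(filter)); // runtime guard
        return {};
    }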
diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
index 59b441943..2d9b18ed9 100644
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -13,6 +13,7 @@
#include <fmt/format.h>
#include "common/dynamic_library.h"
+#include "common/file_util.h"
#include "common/logging/log.h"
#include "common/telemetry.h"
#include "core/core.h"
@@ -76,7 +77,8 @@ Common::DynamicLibrary OpenVulkanLibrary() {
char* libvulkan_env = getenv("LIBVULKAN_PATH");
if (!libvulkan_env || !library.Open(libvulkan_env)) {
// Use the libvulkan.dylib from the application bundle.
- std::string filename = File::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib";
+ const std::string filename =
+ FileUtil::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib";
library.Open(filename.c_str());
}
#else
@@ -153,11 +155,31 @@ vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatc
}
}
- static constexpr std::array layers_data{"VK_LAYER_LUNARG_standard_validation"};
- vk::Span<const char*> layers = layers_data;
- if (!enable_layers) {
- layers = {};
+ std::vector<const char*> layers;
+ layers.reserve(1);
+ if (enable_layers) {
+ layers.push_back("VK_LAYER_KHRONOS_validation");
+ }
+
+ const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld);
+ if (!layer_properties) {
+ LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers");
+ layers.clear();
+ }
+
+ for (auto layer_it = layers.begin(); layer_it != layers.end();) {
+ const char* const layer = *layer_it;
+ const auto it = std::find_if(
+ layer_properties->begin(), layer_properties->end(),
+ [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); });
+ if (it == layer_properties->end()) {
+ LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer);
+ layer_it = layers.erase(layer_it);
+ } else {
+ ++layer_it;
+ }
}
+
vk::Instance instance = vk::Instance::Create(layers, extensions, dld);
if (!instance) {
LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance");
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index 5f33d9e40..f10f96cd8 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -37,9 +37,9 @@ std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKSch
} // Anonymous namespace
-CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
- VAddr cpu_addr, std::size_t size)
- : VideoCommon::BufferBlock{cpu_addr, size} {
+Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_,
+ VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size)
+ : VideoCommon::BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} {
VkBufferCreateInfo ci;
ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
ci.pNext = nullptr;
@@ -54,46 +54,17 @@ CachedBufferBlock::CachedBufferBlock(const VKDevice& device, VKMemoryManager& me
buffer.commit = memory_manager.Commit(buffer.handle, false);
}
-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;
-VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
- const VKDevice& device, VKMemoryManager& memory_manager,
- VKScheduler& scheduler, VKStagingBufferPool& staging_pool)
- : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system,
- CreateStreamBuffer(device,
- scheduler)},
- device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{
- staging_pool} {}
-
-VKBufferCache::~VKBufferCache() = default;
-
-Buffer VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
- return std::make_shared<CachedBufferBlock>(device, memory_manager, cpu_addr, size);
-}
-
-VkBuffer VKBufferCache::ToHandle(const Buffer& buffer) {
- return buffer->GetHandle();
-}
-
-VkBuffer VKBufferCache::GetEmptyBuffer(std::size_t size) {
- size = std::max(size, std::size_t(4));
- const auto& empty = staging_pool.GetUnusedBuffer(size, false);
- scheduler.RequestOutsideRenderPassOperationContext();
- scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
- cmdbuf.FillBuffer(buffer, 0, size, 0);
- });
- return *empty.handle;
-}
-
-void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- const u8* data) {
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
const auto& staging = staging_pool.GetUnusedBuffer(size, true);
std::memcpy(staging.commit->Map(size), data, size);
scheduler.RequestOutsideRenderPassOperationContext();
- scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
- size](vk::CommandBuffer cmdbuf) {
- cmdbuf.CopyBuffer(staging, buffer, VkBufferCopy{0, offset, size});
+
+ const VkBuffer handle = Handle();
+ scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
+ cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size});
VkBufferMemoryBarrier barrier;
barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -102,7 +73,7 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
barrier.dstAccessMask = UPLOAD_ACCESS_BARRIERS;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
- barrier.buffer = buffer;
+ barrier.buffer = handle;
barrier.offset = offset;
barrier.size = size;
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
@@ -110,12 +81,12 @@ void VKBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, st
});
}
-void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- u8* data) {
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
const auto& staging = staging_pool.GetUnusedBuffer(size, true);
scheduler.RequestOutsideRenderPassOperationContext();
- scheduler.Record([staging = *staging.handle, buffer = buffer->GetHandle(), offset,
- size](vk::CommandBuffer cmdbuf) {
+
+ const VkBuffer handle = Handle();
+ scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
VkBufferMemoryBarrier barrier;
barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
barrier.pNext = nullptr;
@@ -123,7 +94,7 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
- barrier.buffer = buffer;
+ barrier.buffer = handle;
barrier.offset = offset;
barrier.size = size;
@@ -131,18 +102,20 @@ void VKBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset,
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
- cmdbuf.CopyBuffer(buffer, staging, VkBufferCopy{offset, 0, size});
+ cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size});
});
scheduler.Finish();
std::memcpy(data, staging.commit->Map(size), size);
}
-void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
- std::size_t dst_offset, std::size_t size) {
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+ std::size_t size) const {
scheduler.RequestOutsideRenderPassOperationContext();
- scheduler.Record([src_buffer = src->GetHandle(), dst_buffer = dst->GetHandle(), src_offset,
- dst_offset, size](vk::CommandBuffer cmdbuf) {
+
+ const VkBuffer dst_buffer = Handle();
+ scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset,
+ size](vk::CommandBuffer cmdbuf) {
cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});
std::array<VkBufferMemoryBarrier, 2> barriers;
@@ -169,4 +142,30 @@ void VKBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t
});
}
+VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
+ const VKDevice& device, VKMemoryManager& memory_manager,
+ VKScheduler& scheduler, VKStagingBufferPool& staging_pool)
+ : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, system,
+ CreateStreamBuffer(device,
+ scheduler)},
+ device{device}, memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{
+ staging_pool} {}
+
+VKBufferCache::~VKBufferCache() = default;
+
+std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+ return std::make_shared<Buffer>(device, memory_manager, scheduler, staging_pool, cpu_addr,
+ size);
+}
+
+VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) {
+ size = std::max(size, std::size_t(4));
+ const auto& empty = staging_pool.GetUnusedBuffer(size, false);
+ scheduler.RequestOutsideRenderPassOperationContext();
+ scheduler.Record([size, buffer = *empty.handle](vk::CommandBuffer cmdbuf) {
+ cmdbuf.FillBuffer(buffer, 0, size, 0);
+ });
+ return {*empty.handle, 0, 0};
+}
+
} // namespace Vulkan
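
Upload and Download above are instances of the standard staging pattern: copy through a host-visible buffer and fence the copy with a buffer memory barrier so later shader stages observe the transfer. Stripped of the scheduler plumbing, the recorded commands amount to the following raw-Vulkan sketch (cmdbuf, staging, dest, offset, and size are hypothetical):

    const VkBufferCopy copy{0, offset, size};
    vkCmdCopyBuffer(cmdbuf, staging, dest, 1, &copy);

    VkBufferMemoryBarrier barrier{};
    barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
    barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT;
    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.buffer = dest;
    barrier.offset = offset;
    barrier.size = size;
    vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_TRANSFER_BIT,
                         VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
                             VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
                         0, 0, nullptr, 1, &barrier, 0, nullptr);

Moving these operations onto Buffer itself lets the generic buffer cache call buffer.Upload(...) directly instead of dispatching through cache-level virtuals.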
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index a54583e7d..3630aca77 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -8,7 +8,6 @@
#include "common/common_types.h"
#include "video_core/buffer_cache/buffer_cache.h"
-#include "video_core/rasterizer_cache.h"
#include "video_core/renderer_vulkan/vk_memory_manager.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
@@ -24,22 +23,34 @@ class VKDevice;
class VKMemoryManager;
class VKScheduler;
-class CachedBufferBlock final : public VideoCommon::BufferBlock {
+class Buffer final : public VideoCommon::BufferBlock {
public:
- explicit CachedBufferBlock(const VKDevice& device, VKMemoryManager& memory_manager,
- VAddr cpu_addr, std::size_t size);
- ~CachedBufferBlock();
+ explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler,
+ VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size);
+ ~Buffer();
- VkBuffer GetHandle() const {
+ void Upload(std::size_t offset, std::size_t size, const u8* data) const;
+
+ void Download(std::size_t offset, std::size_t size, u8* data) const;
+
+ void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+ std::size_t size) const;
+
+ VkBuffer Handle() const {
return *buffer.handle;
}
+ u64 Address() const {
+ return 0;
+ }
+
private:
+ VKScheduler& scheduler;
+ VKStagingBufferPool& staging_pool;
+
VKBuffer buffer;
};
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-
class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> {
public:
explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
@@ -47,21 +58,10 @@ public:
VKScheduler& scheduler, VKStagingBufferPool& staging_pool);
~VKBufferCache();
- VkBuffer GetEmptyBuffer(std::size_t size) override;
+ BufferInfo GetEmptyBuffer(std::size_t size) override;
protected:
- VkBuffer ToHandle(const Buffer& buffer) override;
-
- Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
- void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- const u8* data) override;
-
- void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
- u8* data) override;
-
- void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
- std::size_t dst_offset, std::size_t size) override;
+ std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
private:
const VKDevice& device;
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index b8ccf164f..ea66e621e 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -27,6 +27,7 @@
#include "video_core/renderer_vulkan/wrapper.h"
#include "video_core/shader/compiler_settings.h"
#include "video_core/shader/memory_util.h"
+#include "video_core/shader_cache.h"
namespace Vulkan {
@@ -132,19 +133,18 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con
return std::memcmp(&rhs, this, sizeof *this) == 0;
}
-CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stage,
- GPUVAddr gpu_addr, VAddr cpu_addr, ProgramCode program_code,
- u32 main_offset)
- : RasterizerCacheObject{cpu_addr}, gpu_addr{gpu_addr}, program_code{std::move(program_code)},
+Shader::Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
+ VideoCommon::Shader::ProgramCode program_code, u32 main_offset)
+ : gpu_addr{gpu_addr}, program_code{std::move(program_code)},
registry{stage, GetEngine(system, stage)}, shader_ir{this->program_code, main_offset,
compiler_settings, registry},
entries{GenerateShaderEntries(shader_ir)} {}
-CachedShader::~CachedShader() = default;
+Shader::~Shader() = default;
-Tegra::Engines::ConstBufferEngineInterface& CachedShader::GetEngine(
- Core::System& system, Tegra::Engines::ShaderType stage) {
- if (stage == Tegra::Engines::ShaderType::Compute) {
+Tegra::Engines::ConstBufferEngineInterface& Shader::GetEngine(Core::System& system,
+ Tegra::Engines::ShaderType stage) {
+ if (stage == ShaderType::Compute) {
return system.GPU().KeplerCompute();
} else {
return system.GPU().Maxwell3D();
@@ -156,16 +156,16 @@ VKPipelineCache::VKPipelineCache(Core::System& system, RasterizerVulkan& rasteri
VKDescriptorPool& descriptor_pool,
VKUpdateDescriptorQueue& update_descriptor_queue,
VKRenderPassCache& renderpass_cache)
- : RasterizerCache{rasterizer}, system{system}, device{device}, scheduler{scheduler},
- descriptor_pool{descriptor_pool}, update_descriptor_queue{update_descriptor_queue},
- renderpass_cache{renderpass_cache} {}
+ : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system}, device{device},
+ scheduler{scheduler}, descriptor_pool{descriptor_pool},
+ update_descriptor_queue{update_descriptor_queue}, renderpass_cache{renderpass_cache} {}
VKPipelineCache::~VKPipelineCache() = default;
-std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
+std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
const auto& gpu = system.GPU().Maxwell3D();
- std::array<Shader, Maxwell::MaxShaderProgram> shaders;
+ std::array<Shader*, Maxwell::MaxShaderProgram> shaders{};
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
const auto program{static_cast<Maxwell::ShaderProgram>(index)};
@@ -178,24 +178,28 @@ std::array<Shader, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() {
const GPUVAddr program_addr{GetShaderAddress(system, program)};
const std::optional cpu_addr = memory_manager.GpuToCpuAddress(program_addr);
ASSERT(cpu_addr);
- auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
- if (!shader) {
+
+ Shader* result = cpu_addr ? TryGet(*cpu_addr) : null_shader.get();
+ if (!result) {
const auto host_ptr{memory_manager.GetPointer(program_addr)};
// No shader found - create a new one
constexpr u32 stage_offset = STAGE_MAIN_OFFSET;
- const auto stage = static_cast<Tegra::Engines::ShaderType>(index == 0 ? 0 : index - 1);
+ const auto stage = static_cast<ShaderType>(index == 0 ? 0 : index - 1);
ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, false);
+ const std::size_t size_in_bytes = code.size() * sizeof(u64);
+
+ auto shader = std::make_unique<Shader>(system, stage, program_addr, std::move(code),
+ stage_offset);
+ result = shader.get();
- shader = std::make_shared<CachedShader>(system, stage, program_addr, *cpu_addr,
- std::move(code), stage_offset);
if (cpu_addr) {
- Register(shader);
+ Register(std::move(shader), *cpu_addr, size_in_bytes);
} else {
- null_shader = shader;
+ null_shader = std::move(shader);
}
}
- shaders[index] = std::move(shader);
+ shaders[index] = result;
}
return last_shaders = shaders;
}
@@ -236,19 +240,22 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
const auto cpu_addr = memory_manager.GpuToCpuAddress(program_addr);
ASSERT(cpu_addr);
- auto shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel;
+ Shader* shader = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get();
if (!shader) {
// No shader found - create a new one
const auto host_ptr = memory_manager.GetPointer(program_addr);
ProgramCode code = GetShaderCode(memory_manager, program_addr, host_ptr, true);
- shader = std::make_shared<CachedShader>(system, Tegra::Engines::ShaderType::Compute,
- program_addr, *cpu_addr, std::move(code),
- KERNEL_MAIN_OFFSET);
+ const std::size_t size_in_bytes = code.size() * sizeof(u64);
+
+ auto shader_info = std::make_unique<Shader>(system, ShaderType::Compute, program_addr,
+ std::move(code), KERNEL_MAIN_OFFSET);
+ shader = shader_info.get();
+
if (cpu_addr) {
- Register(shader);
+ Register(std::move(shader_info), *cpu_addr, size_in_bytes);
} else {
- null_kernel = shader;
+ null_kernel = std::move(shader_info);
}
}
@@ -264,7 +271,7 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
return *entry;
}
-void VKPipelineCache::Unregister(const Shader& shader) {
+void VKPipelineCache::OnShaderRemoval(Shader* shader) {
bool finished = false;
const auto Finish = [&] {
// TODO(Rodrigo): Instead of finishing here, wait for the fences that use this pipeline and
@@ -296,8 +303,6 @@ void VKPipelineCache::Unregister(const Shader& shader) {
Finish();
it = compute_cache.erase(it);
}
-
- RasterizerCache::Unregister(shader);
}
std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>>
@@ -332,12 +337,11 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
}
const GPUVAddr gpu_addr = GetShaderAddress(system, program_enum);
- const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
- const auto shader = cpu_addr ? TryGet(*cpu_addr) : null_shader;
- ASSERT(shader);
+ const std::optional<VAddr> cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
+ Shader* const shader = cpu_addr ? TryGet(*cpu_addr) : null_shader.get();
const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
- const auto program_type = GetShaderType(program_enum);
+ const ShaderType program_type = GetShaderType(program_enum);
const auto& entries = shader->GetEntries();
program[stage] = {
Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization),
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
index 0b5796fef..0a36e5112 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -17,7 +17,6 @@
#include "common/common_types.h"
#include "video_core/engines/const_buffer_engine_interface.h"
#include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
#include "video_core/renderer_vulkan/fixed_pipeline_state.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#include "video_core/renderer_vulkan/vk_renderpass_cache.h"
@@ -26,6 +25,7 @@
#include "video_core/shader/memory_util.h"
#include "video_core/shader/registry.h"
#include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"
namespace Core {
class System;
@@ -41,8 +41,6 @@ class VKFence;
class VKScheduler;
class VKUpdateDescriptorQueue;
-class CachedShader;
-using Shader = std::shared_ptr<CachedShader>;
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
struct GraphicsPipelineCacheKey {
@@ -102,21 +100,16 @@ struct hash<Vulkan::ComputePipelineCacheKey> {
namespace Vulkan {
-class CachedShader final : public RasterizerCacheObject {
+class Shader {
public:
- explicit CachedShader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
- VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code,
- u32 main_offset);
- ~CachedShader();
+ explicit Shader(Core::System& system, Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr,
+ VideoCommon::Shader::ProgramCode program_code, u32 main_offset);
+ ~Shader();
GPUVAddr GetGpuAddr() const {
return gpu_addr;
}
- std::size_t GetSizeInBytes() const override {
- return program_code.size() * sizeof(u64);
- }
-
VideoCommon::Shader::ShaderIR& GetIR() {
return shader_ir;
}
@@ -144,25 +137,23 @@ private:
ShaderEntries entries;
};
-class VKPipelineCache final : public RasterizerCache<Shader> {
+class VKPipelineCache final : public VideoCommon::ShaderCache<Shader> {
public:
explicit VKPipelineCache(Core::System& system, RasterizerVulkan& rasterizer,
const VKDevice& device, VKScheduler& scheduler,
VKDescriptorPool& descriptor_pool,
VKUpdateDescriptorQueue& update_descriptor_queue,
VKRenderPassCache& renderpass_cache);
- ~VKPipelineCache();
+ ~VKPipelineCache() override;
- std::array<Shader, Maxwell::MaxShaderProgram> GetShaders();
+ std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders();
VKGraphicsPipeline& GetGraphicsPipeline(const GraphicsPipelineCacheKey& key);
VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key);
protected:
- void Unregister(const Shader& shader) override;
-
- void FlushObjectInner(const Shader& object) override {}
+ void OnShaderRemoval(Shader* shader) final;
private:
std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> DecompileShaders(
@@ -175,10 +166,10 @@ private:
VKUpdateDescriptorQueue& update_descriptor_queue;
VKRenderPassCache& renderpass_cache;
- Shader null_shader{};
- Shader null_kernel{};
+ std::unique_ptr<Shader> null_shader;
+ std::unique_ptr<Shader> null_kernel;
- std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+ std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
GraphicsPipelineCacheKey last_graphics_key;
VKGraphicsPipeline* last_graphics_pipeline = nullptr;
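
The ownership change running through this file: shaders used to be std::shared_ptr<CachedShader> values copied around the cache, and are now owned exactly once, either by the VideoCommon::ShaderCache registry or by the null_shader/null_kernel fallbacks, with every other user holding a raw pointer. A reduced sketch of that contract (simplified; the real ShaderCache also tracks address ranges for invalidation):

    std::unordered_map<VAddr, std::unique_ptr<Shader>> storage;

    Shader* Register(VAddr addr, std::unique_ptr<Shader> shader) {
        Shader* const ptr = shader.get();
        storage.emplace(addr, std::move(shader)); // cache takes ownership
        return ptr; // non-owning, valid until the entry is invalidated
    }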
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index d86c46412..a8d94eac3 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -38,6 +38,7 @@
#include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/renderer_vulkan/vk_update_descriptor.h"
#include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/shader_cache.h"
namespace Vulkan {
@@ -98,7 +99,7 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) {
}
std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses(
- const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) {
+ const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses;
for (std::size_t i = 0; i < std::size(addresses); ++i) {
addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0;
@@ -117,6 +118,17 @@ template <typename Engine, typename Entry>
Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
std::size_t stage, std::size_t index = 0) {
const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage);
+ if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+ if (entry.is_separated) {
+ const u32 buffer_1 = entry.buffer;
+ const u32 buffer_2 = entry.secondary_buffer;
+ const u32 offset_1 = entry.offset;
+ const u32 offset_2 = entry.secondary_offset;
+ const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1);
+ const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2);
+ return engine.GetTextureInfo(handle_1 | handle_2);
+ }
+ }
if (entry.is_bindless) {
const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset);
return engine.GetTextureInfo(tex_handle);
@@ -131,6 +143,49 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry
}
}
+/// @brief Determine if an attachment to be updated has to preserve contents
+/// @param is_clear True when a clear is being executed
+/// @param regs 3D registers
+/// @return True when the contents have to be preserved
+bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) {
+ if (!is_clear) {
+ return true;
+ }
+ // First we have to make sure all clear masks are enabled.
+ if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B ||
+ !regs.clear_buffers.A) {
+ return true;
+ }
+ // If scissors are disabled, the whole screen is cleared
+ if (!regs.clear_flags.scissor) {
+ return false;
+ }
+ // Then we have to confirm scissor testing clears the whole image
+ const std::size_t index = regs.clear_buffers.RT;
+ const auto& scissor = regs.scissor_test[0];
+ return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width ||
+ scissor.max_y < regs.rt[index].height;
+}
+
+/// @brief Determine if an attachment to be updated has to preserve contents
+/// @param is_clear True when a clear is being executed
+/// @param regs 3D registers
+/// @return True when the contents have to be preserved
+bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) {
+ // If we are not clearing, the contents have to be preserved
+ if (!is_clear) {
+ return true;
+ }
+ // For depth stencil clears, we only have to confirm that the scissor test covers the whole image
+ if (!regs.clear_flags.scissor) {
+ return false;
+ }
+ // Make sure the clear covers the whole image
+ const auto& scissor = regs.scissor_test[0];
+ return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width ||
+ scissor.max_y < regs.zeta_height;
+}
+
} // Anonymous namespace
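
Both predicates reduce to the same question: does the pending clear touch every texel of the attachment? Only then can the cache skip loading the previous contents. The core check in isolation (a sketch; width and height stand in for the rt/zeta register fields):

    struct Scissor {
        u32 min_x, min_y, max_x, max_y;
    };

    // True when a scissored clear still covers the full attachment, so the
    // previous contents do not need to be preserved.
    bool ClearCoversTarget(const Scissor& s, u32 width, u32 height) {
        return s.min_x == 0 && s.min_y == 0 && s.max_x >= width && s.max_y >= height;
    }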
class BufferBindings final {
@@ -332,7 +387,7 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
buffer_cache.Unmap();
- const Texceptions texceptions = UpdateAttachments();
+ const Texceptions texceptions = UpdateAttachments(false);
SetupImageTransitions(texceptions, color_attachments, zeta_attachment);
key.renderpass_params = GetRenderPassParams(texceptions);
@@ -388,7 +443,7 @@ void RasterizerVulkan::Clear() {
return;
}
- [[maybe_unused]] const auto texceptions = UpdateAttachments();
+ [[maybe_unused]] const auto texceptions = UpdateAttachments(true);
DEBUG_ASSERT(texceptions.none());
SetupImageTransitions(0, color_attachments, zeta_attachment);
@@ -665,9 +720,12 @@ void RasterizerVulkan::FlushWork() {
draw_counter = 0;
}
-RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
+RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) {
MICROPROFILE_SCOPE(Vulkan_RenderTargets);
- auto& dirty = system.GPU().Maxwell3D().dirty.flags;
+ auto& maxwell3d = system.GPU().Maxwell3D();
+ auto& dirty = maxwell3d.dirty.flags;
+ auto& regs = maxwell3d.regs;
+
const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets];
dirty[VideoCommon::Dirty::RenderTargets] = false;
@@ -676,7 +734,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
Texceptions texceptions;
for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
if (update_rendertargets) {
- color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, true);
+ const bool preserve_contents = HasToPreserveColorContents(is_clear, regs);
+ color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents);
}
if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) {
texceptions[rt] = true;
@@ -684,7 +743,8 @@ RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments() {
}
if (update_rendertargets) {
- zeta_attachment = texture_cache.GetDepthBufferSurface(true);
+ const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs);
+ zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents);
}
if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) {
texceptions[ZETA_TEXCEPTION_INDEX] = true;
@@ -716,7 +776,7 @@ std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers(
if (!view) {
return false;
}
- key.views.push_back(view->GetHandle());
+ key.views.push_back(view->GetAttachment());
key.width = std::min(key.width, view->GetWidth());
key.height = std::min(key.height, view->GetHeight());
key.layers = std::min(key.layers, view->GetNumLayers());
@@ -776,12 +836,12 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt
}
void RasterizerVulkan::SetupShaderDescriptors(
- const std::array<Shader, Maxwell::MaxShaderProgram>& shaders) {
+ const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
texture_cache.GuardSamplers(true);
for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
// Skip VertexA stage
- const auto& shader = shaders[stage + 1];
+ Shader* const shader = shaders[stage + 1];
if (!shader) {
continue;
}
@@ -858,10 +918,10 @@ void RasterizerVulkan::BeginTransformFeedback() {
UNIMPLEMENTED_IF(binding.buffer_offset != 0);
const GPUVAddr gpu_addr = binding.Address();
- const std::size_t size = binding.buffer_size;
- const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+ const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size);
+ const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
- scheduler.Record([buffer = buffer, offset = offset, size](vk::CommandBuffer cmdbuf) {
+ scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) {
cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size);
cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr);
});
@@ -913,8 +973,8 @@ void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex
buffer_bindings.AddVertexBinding(DefaultBuffer(), 0);
continue;
}
- const auto [buffer, offset] = buffer_cache.UploadMemory(start, size);
- buffer_bindings.AddVertexBinding(buffer, offset);
+ const auto info = buffer_cache.UploadMemory(start, size);
+ buffer_bindings.AddVertexBinding(info.handle, info.offset);
}
}
@@ -936,7 +996,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar
break;
}
const GPUVAddr gpu_addr = regs.index_array.IndexStart();
- auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+ const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+ VkBuffer buffer = info.handle;
+ u64 offset = info.offset;
std::tie(buffer, offset) = quad_indexed_pass.Assemble(
regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset);
@@ -950,7 +1012,9 @@ void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawPar
break;
}
const GPUVAddr gpu_addr = regs.index_array.IndexStart();
- auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+ const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
+ VkBuffer buffer = info.handle;
+ u64 offset = info.offset;
auto format = regs.index_array.format;
const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte;
@@ -1097,10 +1161,9 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
ASSERT(size <= MaxConstbufferSize);
- const auto [buffer_handle, offset] =
+ const auto info =
buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
-
- update_descriptor_queue.AddBuffer(buffer_handle, offset, size);
+ update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
}
void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) {
@@ -1114,14 +1177,14 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
// Note: Do *not* use DefaultBuffer() here, storage buffers can be written, breaking the
// default buffer.
static constexpr std::size_t dummy_size = 4;
- const auto buffer = buffer_cache.GetEmptyBuffer(dummy_size);
- update_descriptor_queue.AddBuffer(buffer, 0, dummy_size);
+ const auto info = buffer_cache.GetEmptyBuffer(dummy_size);
+ update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size);
return;
}
- const auto [buffer, offset] = buffer_cache.UploadMemory(
+ const auto info = buffer_cache.UploadMemory(
actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten());
- update_descriptor_queue.AddBuffer(buffer, offset, size);
+ update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
}
void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
@@ -1137,12 +1200,12 @@ void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& textu
auto view = texture_cache.GetTextureSurface(texture.tic, entry);
ASSERT(!view->IsBufferView());
- const auto image_view = view->GetHandle(texture.tic.x_source, texture.tic.y_source,
- texture.tic.z_source, texture.tic.w_source);
+ const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source,
+ texture.tic.z_source, texture.tic.w_source);
const auto sampler = sampler_cache.GetSampler(texture.tsc);
update_descriptor_queue.AddSampledImage(sampler, image_view);
- const auto image_layout = update_descriptor_queue.GetLastImageLayout();
+ VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();
*image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
sampled_views.push_back(ImageView{std::move(view), image_layout});
}
@@ -1164,10 +1227,11 @@ void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const Ima
UNIMPLEMENTED_IF(tic.IsBuffer());
- const auto image_view = view->GetHandle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+ const VkImageView image_view =
+ view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
update_descriptor_queue.AddImage(image_view);
- const auto image_layout = update_descriptor_queue.GetLastImageLayout();
+ VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();
*image_layout = VK_IMAGE_LAYOUT_GENERAL;
image_views.push_back(ImageView{std::move(view), image_layout});
}
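
The is_separated branch added to GetTextureInfo handles samplers whose texture and sampler indices live in two different const buffer words; reading both and OR-ing them rebuilds the packed handle the engine expects (the OR only works because each word populates a disjoint part of the handle). In isolation:

    // Separated sampler: the TIC and TSC halves come from two const buffer slots.
    const u32 handle_1 = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset);
    const u32 handle_2 =
        engine.AccessConstBuffer32(stage_type, entry.secondary_buffer, entry.secondary_offset);
    return engine.GetTextureInfo(handle_1 | handle_2); // merge the disjoint bit ranges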
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 04be37a5e..83e00e7e9 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -159,7 +159,10 @@ private:
void FlushWork();
- Texceptions UpdateAttachments();
+ /// @brief Updates the currently bound attachments
+ /// @param is_clear True when the framebuffer is updated as a clear
+ /// @return Bitfield of attachments being used as sampled textures
+ Texceptions UpdateAttachments(bool is_clear);
std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass);
@@ -168,7 +171,7 @@ private:
bool is_indexed, bool is_instanced);
/// Setup descriptors in the graphics pipeline.
- void SetupShaderDescriptors(const std::array<Shader, Maxwell::MaxShaderProgram>& shaders);
+ void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders);
void SetupImageTransitions(Texceptions texceptions,
const std::array<View, Maxwell::NumRenderTargets>& color_attachments,
diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
index e6f2fa553..616eacc36 100644
--- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp
@@ -9,6 +9,8 @@
#include "video_core/renderer_vulkan/wrapper.h"
#include "video_core/textures/texture.h"
+using Tegra::Texture::TextureMipmapFilter;
+
namespace Vulkan {
namespace {
@@ -63,8 +65,8 @@ vk::Sampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) c
ci.maxAnisotropy = tsc.GetMaxAnisotropy();
ci.compareEnable = tsc.depth_compare_enabled;
ci.compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func);
- ci.minLod = tsc.GetMinLod();
- ci.maxLod = tsc.GetMaxLod();
+ ci.minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.GetMinLod();
+ ci.maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.GetMaxLod();
ci.borderColor = arbitrary_borders ? VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color);
ci.unnormalizedCoordinates = VK_FALSE;
return device.GetLogical().CreateSampler(ci);
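
This pairs with the maxwell_to_vk change mapping TextureMipmapFilter::None to VK_SAMPLER_MIPMAP_MODE_NEAREST: clamping the LOD range to [0.0, 0.25] restricts sampling to the base level, which is how the Vulkan specification suggests emulating non-mipmapped GL_NEAREST/GL_LINEAR filtering. The relevant create-info fields, reduced to a sketch:

    VkSamplerCreateInfo ci{};
    ci.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
    ci.magFilter = VK_FILTER_LINEAR;                // or VK_FILTER_NEAREST
    ci.minFilter = VK_FILTER_LINEAR;                // or VK_FILTER_NEAREST
    ci.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
    ci.minLod = 0.0f;
    ci.maxLod = 0.25f; // only mip level 0 can ever be selected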
diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp
index 82ec9180e..56524e6f3 100644
--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -9,6 +9,7 @@
#include <utility>
#include "common/microprofile.h"
+#include "common/thread.h"
#include "video_core/renderer_vulkan/vk_device.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/renderer_vulkan/vk_resource_manager.h"
@@ -133,6 +134,7 @@ void VKScheduler::BindGraphicsPipeline(VkPipeline pipeline) {
}
void VKScheduler::WorkerThread() {
+ Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
std::unique_lock lock{mutex};
do {
cv.wait(lock, [this] { return !chunk_queue.Empty() || quit; });
diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h
index dfddf7ad6..689f0d276 100644
--- a/src/video_core/renderer_vulkan/vk_stream_buffer.h
+++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h
@@ -35,10 +35,14 @@ public:
/// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy.
void Unmap(u64 size);
- VkBuffer GetHandle() const {
+ VkBuffer Handle() const noexcept {
return *buffer;
}
+ u64 Address() const noexcept {
+ return 0;
+ }
+
private:
struct Watch final {
VKFenceWatch fence;
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index ea487b770..430031665 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -167,6 +167,7 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP
ci.extent = {params.width, params.height, 1};
break;
case SurfaceTarget::Texture3D:
+ ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
ci.extent = {params.width, params.height, params.depth};
break;
case SurfaceTarget::TextureBuffer:
@@ -176,6 +177,12 @@ VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceP
return ci;
}
+u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source,
+ Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) {
+ return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+ (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+}
+
} // Anonymous namespace
CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
@@ -203,9 +210,11 @@ CachedSurface::CachedSurface(Core::System& system, const VKDevice& device,
}
// TODO(Rodrigo): Move this to a virtual function.
- main_view = CreateViewInner(
- ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels),
- true);
+ u32 num_layers = 1;
+ if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+ num_layers = params.depth;
+ }
+ main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels));
}
CachedSurface::~CachedSurface() = default;
@@ -253,12 +262,8 @@ void CachedSurface::DecorateSurfaceName() {
}
View CachedSurface::CreateView(const ViewParams& params) {
- return CreateViewInner(params, false);
-}
-
-View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) {
// TODO(Rodrigo): Add name decorations
- return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy);
+ return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params);
}
void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) {
@@ -342,18 +347,27 @@ VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const {
}
CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
- const ViewParams& params, bool is_proxy)
+ const ViewParams& params)
: VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()},
image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()},
aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface},
- base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level},
- num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target)
- : VK_IMAGE_VIEW_TYPE_1D} {}
+ base_level{params.base_level}, num_levels{params.num_levels},
+ image_view_type{image ? GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} {
+ if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+ base_layer = 0;
+ num_layers = 1;
+ base_slice = params.base_layer;
+ num_slices = params.num_layers;
+ } else {
+ base_layer = params.base_layer;
+ num_layers = params.num_layers;
+ }
+}
CachedSurfaceView::~CachedSurfaceView() = default;
-VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source,
- SwizzleSource z_source, SwizzleSource w_source) {
+VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source,
+ SwizzleSource z_source, SwizzleSource w_source) {
const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
if (last_image_view && last_swizzle == new_swizzle) {
return last_image_view;
@@ -399,6 +413,11 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
});
}
+ if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+ ASSERT(base_slice == 0);
+ ASSERT(num_slices == params.depth);
+ }
+
VkImageViewCreateInfo ci;
ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
ci.pNext = nullptr;
@@ -417,6 +436,35 @@ VkImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y
return last_image_view = *image_view;
}
+VkImageView CachedSurfaceView::GetAttachment() {
+ if (render_target) {
+ return *render_target;
+ }
+
+ VkImageViewCreateInfo ci;
+ ci.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
+ ci.pNext = nullptr;
+ ci.flags = 0;
+ ci.image = surface.GetImageHandle();
+ ci.format = surface.GetImage().GetFormat();
+ ci.components = {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY,
+ VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY};
+ ci.subresourceRange.aspectMask = aspect_mask;
+ ci.subresourceRange.baseMipLevel = base_level;
+ ci.subresourceRange.levelCount = num_levels;
+ if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) {
+ ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D;
+ ci.subresourceRange.baseArrayLayer = base_slice;
+ ci.subresourceRange.layerCount = num_slices;
+ } else {
+ ci.viewType = image_view_type;
+ ci.subresourceRange.baseArrayLayer = base_layer;
+ ci.subresourceRange.layerCount = num_layers;
+ }
+ render_target = device.GetLogical().CreateImageView(ci);
+ return *render_target;
+}
+
VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
const VKDevice& device, VKResourceManager& resource_manager,
VKMemoryManager& memory_manager, VKScheduler& scheduler,
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index f211ccb1e..807e26c8a 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -91,7 +91,6 @@ protected:
void DecorateSurfaceName();
View CreateView(const ViewParams& params) override;
- View CreateViewInner(const ViewParams& params, bool is_proxy);
private:
void UploadBuffer(const std::vector<u8>& staging_buffer);
@@ -120,23 +119,20 @@ private:
class CachedSurfaceView final : public VideoCommon::ViewBase {
public:
explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
- const ViewParams& params, bool is_proxy);
+ const ViewParams& params);
~CachedSurfaceView();
- VkImageView GetHandle(Tegra::Texture::SwizzleSource x_source,
- Tegra::Texture::SwizzleSource y_source,
- Tegra::Texture::SwizzleSource z_source,
- Tegra::Texture::SwizzleSource w_source);
+ VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source,
+ Tegra::Texture::SwizzleSource y_source,
+ Tegra::Texture::SwizzleSource z_source,
+ Tegra::Texture::SwizzleSource w_source);
+
+ VkImageView GetAttachment();
bool IsSameSurface(const CachedSurfaceView& rhs) const {
return &surface == &rhs.surface;
}
- VkImageView GetHandle() {
- return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G,
- Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A);
- }
-
u32 GetWidth() const {
return params.GetMipWidth(base_level);
}
@@ -180,14 +176,6 @@ public:
}
private:
- static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
- Tegra::Texture::SwizzleSource y_source,
- Tegra::Texture::SwizzleSource z_source,
- Tegra::Texture::SwizzleSource w_source) {
- return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
- (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
- }
-
// Store a copy of these values to avoid double dereference when reading them
const SurfaceParams params;
const VkImage image;
@@ -196,15 +184,18 @@ private:
const VKDevice& device;
CachedSurface& surface;
- const u32 base_layer;
- const u32 num_layers;
const u32 base_level;
const u32 num_levels;
const VkImageViewType image_view_type;
+ u32 base_layer = 0;
+ u32 num_layers = 0;
+ u32 base_slice = 0;
+ u32 num_slices = 0;
VkImageView last_image_view = nullptr;
u32 last_swizzle = 0;
+ vk::ImageView render_target;
std::unordered_map<u32, vk::ImageView> view_cache;
};
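
EncodeSwizzle, now shared in the anonymous namespace, packs the four swizzle sources into a single u32 that keys view_cache, so one integer compare decides whether a cached VkImageView can be reused. The packing is easy to sanity-check (simplified to plain u32 arguments):

    constexpr u32 Encode(u32 x, u32 y, u32 z, u32 w) {
        return (x << 24) | (y << 16) | (z << 8) | w;
    }
    static_assert(Encode(1, 2, 3, 4) == 0x01020304); // one byte per source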
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
index 681ecde98..351c048d2 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
@@ -24,35 +24,25 @@ void VKUpdateDescriptorQueue::TickFrame() {
}
void VKUpdateDescriptorQueue::Acquire() {
- entries.clear();
-}
+ // Minimum number of entries required.
+ // This is the maximum number of entries a single draw call might use.
+ static constexpr std::size_t MIN_ENTRIES = 0x400;
-void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template,
- VkDescriptorSet set) {
- if (payload.size() + entries.size() >= payload.max_size()) {
+ if (payload.size() + MIN_ENTRIES >= payload.max_size()) {
LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread");
scheduler.WaitWorker();
payload.clear();
}
+ upload_start = payload.data() + payload.size();
+}
- // TODO(Rodrigo): Rework to write the payload directly
- const auto payload_start = payload.data() + payload.size();
- for (const auto& entry : entries) {
- if (const auto image = std::get_if<VkDescriptorImageInfo>(&entry)) {
- payload.push_back(*image);
- } else if (const auto buffer = std::get_if<VkDescriptorBufferInfo>(&entry)) {
- payload.push_back(*buffer);
- } else if (const auto texel = std::get_if<VkBufferView>(&entry)) {
- payload.push_back(*texel);
- } else {
- UNREACHABLE();
- }
- }
-
- scheduler.Record(
- [payload_start, set, update_template, logical = &device.GetLogical()](vk::CommandBuffer) {
- logical->UpdateDescriptorSet(set, update_template, payload_start);
- });
+void VKUpdateDescriptorQueue::Send(VkDescriptorUpdateTemplateKHR update_template,
+ VkDescriptorSet set) {
+ const void* const data = upload_start;
+ const vk::Device* const logical = &device.GetLogical();
+ scheduler.Record([data, logical, set, update_template](vk::CommandBuffer) {
+ logical->UpdateDescriptorSet(set, update_template, data);
+ });
}
} // namespace Vulkan
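With entries gone, the queue's contract changes: Acquire() reserves headroom in the payload and records where this draw's data begins, the Add* helpers write descriptor data straight into the payload, and Send() only records the deferred template update. A hedged usage sketch of the per-draw flow (the sampler, view, buffer, template and set are placeholders):

    update_descriptor_queue.Acquire();                      // reserve MIN_ENTRIES, set upload_start
    update_descriptor_queue.AddSampledImage(sampler, view); // written directly into the payload
    update_descriptor_queue.AddBuffer(buffer, offset, size);
    update_descriptor_queue.Send(update_template, set);     // defers the template update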
diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h
index cc7e3dff4..945320c72 100644
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.h
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h
@@ -15,17 +15,13 @@ namespace Vulkan {
class VKDevice;
class VKScheduler;
-class DescriptorUpdateEntry {
-public:
- explicit DescriptorUpdateEntry() {}
-
- DescriptorUpdateEntry(VkDescriptorImageInfo image) : image{image} {}
+struct DescriptorUpdateEntry {
+ DescriptorUpdateEntry(VkDescriptorImageInfo image_) : image{image_} {}
- DescriptorUpdateEntry(VkDescriptorBufferInfo buffer) : buffer{buffer} {}
+ DescriptorUpdateEntry(VkDescriptorBufferInfo buffer_) : buffer{buffer_} {}
- DescriptorUpdateEntry(VkBufferView texel_buffer) : texel_buffer{texel_buffer} {}
+ DescriptorUpdateEntry(VkBufferView texel_buffer_) : texel_buffer{texel_buffer_} {}
-private:
union {
VkDescriptorImageInfo image;
VkDescriptorBufferInfo buffer;
@@ -45,32 +41,34 @@ public:
void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set);
void AddSampledImage(VkSampler sampler, VkImageView image_view) {
- entries.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}});
+ payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}});
}
void AddImage(VkImageView image_view) {
- entries.emplace_back(VkDescriptorImageInfo{{}, image_view, {}});
+ payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}});
}
void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) {
- entries.emplace_back(VkDescriptorBufferInfo{buffer, offset, size});
+ payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size});
}
void AddTexelBuffer(VkBufferView texel_buffer) {
- entries.emplace_back(texel_buffer);
+ payload.emplace_back(texel_buffer);
}
- VkImageLayout* GetLastImageLayout() {
- return &std::get<VkDescriptorImageInfo>(entries.back()).imageLayout;
+ VkImageLayout* LastImageLayout() {
+ return &payload.back().image.imageLayout;
}
-private:
- using Variant = std::variant<VkDescriptorImageInfo, VkDescriptorBufferInfo, VkBufferView>;
+ const VkImageLayout* LastImageLayout() const {
+ return &payload.back().image.imageLayout;
+ }
+private:
const VKDevice& device;
VKScheduler& scheduler;
- boost::container::static_vector<Variant, 0x400> entries;
+ const DescriptorUpdateEntry* upload_start = nullptr;
boost::container::static_vector<DescriptorUpdateEntry, 0x10000> payload;
};
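Turning DescriptorUpdateEntry into a plain union is what makes the direct-write scheme work: every element of payload has the same size, so a descriptor update template can walk the array with a single fixed stride. A sketch of a template entry built under that assumption (illustrative binding; designated initializers used for readability):

    // One combined image sampler at binding 0. The stride is uniform because
    // image, buffer and texel entries all share sizeof(DescriptorUpdateEntry).
    const VkDescriptorUpdateTemplateEntry entry{
        .dstBinding = 0,
        .dstArrayElement = 0,
        .descriptorCount = 1,
        .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        .offset = 0,
        .stride = sizeof(DescriptorUpdateEntry),
    };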
diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/renderer_vulkan/wrapper.cpp
index 2ce9b0626..0d485a662 100644
--- a/src/video_core/renderer_vulkan/wrapper.cpp
+++ b/src/video_core/renderer_vulkan/wrapper.cpp
@@ -153,7 +153,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
bool Load(InstanceDispatch& dld) noexcept {
#define X(name) Proc(dld.name, dld, #name)
- return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties);
+ return X(vkCreateInstance) && X(vkEnumerateInstanceExtensionProperties) &&
+ X(vkEnumerateInstanceLayerProperties);
#undef X
}
@@ -725,8 +726,7 @@ bool PhysicalDevice::GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR s
return supported == VK_TRUE;
}
-VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const
- noexcept {
+VkSurfaceCapabilitiesKHR PhysicalDevice::GetSurfaceCapabilitiesKHR(VkSurfaceKHR surface) const {
VkSurfaceCapabilitiesKHR capabilities;
Check(dld->vkGetPhysicalDeviceSurfaceCapabilitiesKHR(physical_device, surface, &capabilities));
return capabilities;
@@ -771,4 +771,17 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp
return properties;
}
+std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
+ const InstanceDispatch& dld) {
+ u32 num;
+ if (dld.vkEnumerateInstanceLayerProperties(&num, nullptr) != VK_SUCCESS) {
+ return std::nullopt;
+ }
+ std::vector<VkLayerProperties> properties(num);
+ if (dld.vkEnumerateInstanceLayerProperties(&num, properties.data()) != VK_SUCCESS) {
+ return std::nullopt;
+ }
+ return properties;
+}
+
} // namespace Vulkan::vk
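EnumerateInstanceLayerProperties follows the usual Vulkan two-call idiom: query the count, then fill a sized vector, returning std::nullopt on either failure. The same shape generalizes to any vkEnumerate*-style entry point; a minimal sketch, assuming a callable matching the count/fill signature:

    template <typename T, typename F>
    std::optional<std::vector<T>> Enumerate(F&& enumerate) {
        u32 num = 0;
        if (enumerate(&num, nullptr) != VK_SUCCESS) { // first call: count only
            return std::nullopt;
        }
        std::vector<T> properties(num);
        if (enumerate(&num, properties.data()) != VK_SUCCESS) { // second call: fill
            return std::nullopt;
        }
        return properties;
    }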
diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/renderer_vulkan/wrapper.h
index 98937a77a..d56fdb3f9 100644
--- a/src/video_core/renderer_vulkan/wrapper.h
+++ b/src/video_core/renderer_vulkan/wrapper.h
@@ -141,6 +141,7 @@ struct InstanceDispatch {
PFN_vkCreateInstance vkCreateInstance;
PFN_vkDestroyInstance vkDestroyInstance;
PFN_vkEnumerateInstanceExtensionProperties vkEnumerateInstanceExtensionProperties;
+ PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties;
PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT;
PFN_vkCreateDevice vkCreateDevice;
@@ -779,7 +780,7 @@ public:
bool GetSurfaceSupportKHR(u32 queue_family_index, VkSurfaceKHR) const;
- VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const noexcept;
+ VkSurfaceCapabilitiesKHR GetSurfaceCapabilitiesKHR(VkSurfaceKHR) const;
std::vector<VkSurfaceFormatKHR> GetSurfaceFormatsKHR(VkSurfaceKHR) const;
@@ -996,4 +997,7 @@ private:
std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties(
const InstanceDispatch& dld);
+std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties(
+ const InstanceDispatch& dld);
+
} // namespace Vulkan::vk
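A likely consumer of the new entry point is layer discovery at instance creation, for example only requesting the validation layer when it is actually installed. A hedged sketch (the layer name is the standard Khronos one; the surrounding code is invented and needs <algorithm> and <cstring>):

    const std::optional layers = vk::EnumerateInstanceLayerProperties(dld);
    const bool has_validation =
        layers && std::any_of(layers->begin(), layers->end(), [](const VkLayerProperties& layer) {
            return std::strcmp(layer.layerName, "VK_LAYER_KHRONOS_validation") == 0;
        });
    // Only push "VK_LAYER_KHRONOS_validation" into ppEnabledLayerNames when has_validation holds.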
diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp
index 848e46874..b2e88fa20 100644
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -13,55 +13,101 @@
namespace VideoCommon::Shader {
+using std::move;
using Tegra::Shader::Instruction;
using Tegra::Shader::OpCode;
+using Tegra::Shader::PredCondition;
u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
const Instruction instr = {program_code[pc]};
const auto opcode = OpCode::Decode(instr);
- if (instr.hset2.ftz == 0) {
- LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+ PredCondition cond;
+ bool bf;
+ bool ftz;
+ bool neg_a;
+ bool abs_a;
+ bool neg_b;
+ bool abs_b;
+ switch (opcode->get().GetId()) {
+ case OpCode::Id::HSET2_C:
+ case OpCode::Id::HSET2_IMM:
+ cond = instr.hsetp2.cbuf_and_imm.cond;
+ bf = instr.Bit(53);
+ ftz = instr.Bit(54);
+ neg_a = instr.Bit(43);
+ abs_a = instr.Bit(44);
+ neg_b = instr.Bit(56);
+ abs_b = instr.Bit(54);
+ break;
+ case OpCode::Id::HSET2_R:
+ cond = instr.hsetp2.reg.cond;
+ bf = instr.Bit(49);
+ ftz = instr.Bit(50);
+ neg_a = instr.Bit(43);
+ abs_a = instr.Bit(44);
+ neg_b = instr.Bit(31);
+ abs_b = instr.Bit(30);
+ break;
+ default:
+ UNREACHABLE();
}
- Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
- op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
-
- Node op_b = [&]() {
+ Node op_b = [this, instr, opcode] {
switch (opcode->get().GetId()) {
+ case OpCode::Id::HSET2_C:
+ // Inform as unimplemented as this is not tested.
+ UNIMPLEMENTED_MSG("HSET2_C is not implemented");
+ return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
case OpCode::Id::HSET2_R:
return GetRegister(instr.gpr20);
+ case OpCode::Id::HSET2_IMM:
+ return UnpackHalfImmediate(instr, true);
default:
UNREACHABLE();
- return Immediate(0);
+ return Node{};
}
}();
- op_b = UnpackHalfFloat(op_b, instr.hset2.type_b);
- op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b);
- const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
+ if (!ftz) {
+ LOG_DEBUG(HW_GPU, "{} without FTZ is not implemented", opcode->get().GetName());
+ }
+
+ Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
+ op_a = GetOperandAbsNegHalf(op_a, abs_a, neg_a);
+
+ switch (opcode->get().GetId()) {
+ case OpCode::Id::HSET2_R:
+ op_b = GetOperandAbsNegHalf(move(op_b), abs_b, neg_b);
+ [[fallthrough]];
+ case OpCode::Id::HSET2_C:
+ op_b = UnpackHalfFloat(move(op_b), instr.hset2.type_b);
+ break;
+ default:
+ break;
+ }
- const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b);
+ Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
+
+ Node comparison_pair = GetPredicateComparisonHalf(cond, op_a, op_b);
const OperationCode combiner = GetPredicateCombiner(instr.hset2.op);
// HSET2 operates on each half float in the pack.
std::array<Node, 2> values;
for (u32 i = 0; i < 2; ++i) {
- const u32 raw_value = instr.hset2.bf ? 0x3c00 : 0xffff;
- const Node true_value = Immediate(raw_value << (i * 16));
- const Node false_value = Immediate(0);
-
- const Node comparison =
- Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i));
- const Node predicate = Operation(combiner, comparison, second_pred);
+ const u32 raw_value = bf ? 0x3c00 : 0xffff;
+ Node true_value = Immediate(raw_value << (i * 16));
+ Node false_value = Immediate(0);
+ Node comparison = Operation(OperationCode::LogicalPick2, comparison_pair, Immediate(i));
+ Node predicate = Operation(combiner, comparison, second_pred);
values[i] =
- Operation(OperationCode::Select, NO_PRECISE, predicate, true_value, false_value);
+ Operation(OperationCode::Select, predicate, move(true_value), move(false_value));
}
- const Node value = Operation(OperationCode::UBitwiseOr, NO_PRECISE, values[0], values[1]);
- SetRegister(bb, instr.gpr0, value);
+ Node value = Operation(OperationCode::UBitwiseOr, values[0], values[1]);
+ SetRegister(bb, instr.gpr0, move(value));
return pc;
}
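The per-half packing at the end is worth spelling out: with BF set the true value is half-precision 1.0 (0x3c00), otherwise an all-ones 16-bit mask, shifted into the half being written and OR-ed into one 32-bit register. A worked sketch of the same arithmetic on plain values:

    // pred0/pred1 are the comparison results for the low and high half.
    u32 PackHset2Result(bool bf, bool pred0, bool pred1) {
        const u32 raw = bf ? 0x3c00 : 0xffff; // 1.0h or a full mask
        const u32 lo = pred0 ? raw : 0;       // bits 15:0
        const u32 hi = pred1 ? raw << 16 : 0; // bits 31:16
        return lo | hi;
    }
    // PackHset2Result(true, true, false) == 0x00003c00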
diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp
index 60b6ad72a..07778dc3e 100644
--- a/src/video_core/shader/decode/image.cpp
+++ b/src/video_core/shader/decode/image.cpp
@@ -97,6 +97,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,
break;
case TextureFormat::B5G6R5:
case TextureFormat::B6G5R5:
+ case TextureFormat::BF10GF11RF11:
if (component == 0) {
return descriptor.b_type;
}
@@ -119,7 +120,7 @@ ComponentType GetComponentType(Tegra::Engines::SamplerDescriptor descriptor,
}
break;
}
- UNIMPLEMENTED_MSG("texture format not implement={}", format);
+ UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
return ComponentType::FLOAT;
}
@@ -191,6 +192,14 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {
return 6;
}
return 0;
+ case TextureFormat::BF10GF11RF11:
+ if (component == 1 || component == 2) {
+ return 11;
+ }
+ if (component == 0) {
+ return 10;
+ }
+ return 0;
case TextureFormat::G8R24:
if (component == 0) {
return 8;
@@ -211,10 +220,9 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) {
return (component == 0 || component == 1) ? 8 : 0;
case TextureFormat::G4R4:
return (component == 0 || component == 1) ? 4 : 0;
- default:
- UNIMPLEMENTED_MSG("texture format not implement={}", format);
- return 0;
}
+ UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
+ return 0;
}
std::size_t GetImageComponentMask(TextureFormat format) {
@@ -235,6 +243,7 @@ std::size_t GetImageComponentMask(TextureFormat format) {
case TextureFormat::R32_B24G8:
case TextureFormat::B5G6R5:
case TextureFormat::B6G5R5:
+ case TextureFormat::BF10GF11RF11:
return std::size_t{R | G | B};
case TextureFormat::R32_G32:
case TextureFormat::R16_G16:
@@ -248,10 +257,9 @@ std::size_t GetImageComponentMask(TextureFormat format) {
case TextureFormat::R8:
case TextureFormat::R1:
return std::size_t{R};
- default:
- UNIMPLEMENTED_MSG("texture format not implement={}", format);
- return std::size_t{R | G | B | A};
}
+ UNIMPLEMENTED_MSG("Texture format not implemented={}", format);
+ return std::size_t{R | G | B | A};
}
std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) {
@@ -299,7 +307,7 @@ std::pair<Node, bool> ShaderIR::GetComponentValue(ComponentType component_type,
return {std::move(original_value), true};
}
default:
- UNIMPLEMENTED_MSG("Unimplement component type={}", component_type);
+ UNIMPLEMENTED_MSG("Unimplemented component type={}", component_type);
return {std::move(original_value), true};
}
}
@@ -459,7 +467,7 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) {
default:
break;
}
- UNIMPLEMENTED_MSG("Unimplemented operation={} type={}",
+ UNIMPLEMENTED_MSG("Unimplemented operation={}, type={}",
static_cast<u64>(instr.suatom_d.operation.Value()),
static_cast<u64>(instr.suatom_d.operation_type.Value()));
return OperationCode::AtomicImageAdd;
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 8f0bb996e..29ebf65ba 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -357,13 +357,11 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
return pc;
}
-ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset,
- std::optional<u32> buffer) {
+ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(
+ SamplerInfo info, std::optional<Tegra::Engines::SamplerDescriptor> sampler) {
if (info.IsComplete()) {
return info;
}
- const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset)
- : registry.ObtainBoundSampler(offset);
if (!sampler) {
LOG_WARNING(HW_GPU, "Unknown sampler info");
info.type = info.type.value_or(Tegra::Shader::TextureType::Texture2D);
@@ -381,8 +379,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(SamplerInfo info, u32 offset,
std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler,
SamplerInfo sampler_info) {
- const auto offset = static_cast<u32>(sampler.index.Value());
- const auto info = GetSamplerInfo(sampler_info, offset);
+ const u32 offset = static_cast<u32>(sampler.index.Value());
+ const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset));
// If this sampler has already been used, return the existing mapping.
const auto it = std::find_if(used_samplers.begin(), used_samplers.end(),
@@ -404,20 +402,19 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
const Node sampler_register = GetRegister(reg);
const auto [base_node, tracked_sampler_info] =
TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size()));
- ASSERT(base_node != nullptr);
- if (base_node == nullptr) {
+ if (!base_node) {
+ UNREACHABLE();
return std::nullopt;
}
- if (const auto bindless_sampler_info =
- std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) {
- const u32 buffer = bindless_sampler_info->GetIndex();
- const u32 offset = bindless_sampler_info->GetOffset();
- info = GetSamplerInfo(info, offset, buffer);
+ if (const auto sampler_info = std::get_if<BindlessSamplerNode>(&*tracked_sampler_info)) {
+ const u32 buffer = sampler_info->index;
+ const u32 offset = sampler_info->offset;
+ info = GetSamplerInfo(info, registry.ObtainBindlessSampler(buffer, offset));
// If this sampler has already been used, return the existing mapping.
const auto it = std::find_if(used_samplers.begin(), used_samplers.end(),
- [buffer = buffer, offset = offset](const Sampler& entry) {
+ [buffer, offset](const Sampler& entry) {
return entry.buffer == buffer && entry.offset == offset;
});
if (it != used_samplers.end()) {
@@ -431,10 +428,32 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
return used_samplers.emplace_back(next_index, offset, buffer, *info.type, *info.is_array,
*info.is_shadow, *info.is_buffer, false);
}
- if (const auto array_sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) {
- const u32 base_offset = array_sampler_info->GetBaseOffset() / 4;
- index_var = GetCustomVariable(array_sampler_info->GetIndexVar());
- info = GetSamplerInfo(info, base_offset);
+ if (const auto sampler_info = std::get_if<SeparateSamplerNode>(&*tracked_sampler_info)) {
+ const std::pair indices = sampler_info->indices;
+ const std::pair offsets = sampler_info->offsets;
+ info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets));
+
+ // Try to use an already created sampler if it exists
+ const auto it = std::find_if(
+ used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) {
+ return offsets == std::pair{entry.offset, entry.secondary_offset} &&
+ indices == std::pair{entry.buffer, entry.secondary_buffer};
+ });
+ if (it != used_samplers.end()) {
+ ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array &&
+ it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer);
+ return *it;
+ }
+
+ // Otherwise create a new mapping for this sampler
+ const u32 next_index = static_cast<u32>(used_samplers.size());
+ return used_samplers.emplace_back(next_index, offsets, indices, *info.type, *info.is_array,
+ *info.is_shadow, *info.is_buffer);
+ }
+ if (const auto sampler_info = std::get_if<ArraySamplerNode>(&*tracked_sampler_info)) {
+ const u32 base_offset = sampler_info->base_offset / 4;
+ index_var = GetCustomVariable(sampler_info->bindless_var);
+ info = GetSamplerInfo(info, registry.ObtainBoundSampler(base_offset));
// If this sampler has already been used, return the existing mapping.
const auto it = std::find_if(
diff --git a/src/video_core/shader/memory_util.cpp b/src/video_core/shader/memory_util.cpp
index 074f21691..5071c83ca 100644
--- a/src/video_core/shader/memory_util.cpp
+++ b/src/video_core/shader/memory_util.cpp
@@ -66,12 +66,12 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, GPUVAddr gpu_add
u64 GetUniqueIdentifier(Tegra::Engines::ShaderType shader_type, bool is_a, const ProgramCode& code,
const ProgramCode& code_b) {
- u64 unique_identifier = boost::hash_value(code);
+ size_t unique_identifier = boost::hash_value(code);
if (is_a) {
// VertexA programs include two programs
boost::hash_combine(unique_identifier, boost::hash_value(code_b));
}
- return unique_identifier;
+ return static_cast<u64>(unique_identifier);
}
} // namespace VideoCommon::Shader
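The cast makes the size_t/u64 mismatch explicit instead of relying on an implicit conversion. For reference, the combining step has the classic boost::hash_combine shape; a self-contained sketch of the word-by-word hashing and the VertexA mix (the golden-ratio constant is the well-known boost pattern, shown as an assumption about the version in use):

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <vector>

    using ProgramCode = std::vector<std::uint64_t>;

    std::size_t HashCombine(std::size_t seed, std::size_t value) {
        return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2));
    }

    std::size_t HashWords(const ProgramCode& code) {
        std::size_t seed = 0;
        for (const std::uint64_t word : code) {
            seed = HashCombine(seed, std::hash<std::uint64_t>{}(word));
        }
        return seed;
    }
    // VertexA pairs: HashCombine(HashWords(code), HashWords(code_b))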
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index c5e5165ff..8f230d57a 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -275,10 +275,11 @@ using Node = std::shared_ptr<NodeData>;
using Node4 = std::array<Node, 4>;
using NodeBlock = std::vector<Node>;
-class BindlessSamplerNode;
-class ArraySamplerNode;
+struct ArraySamplerNode;
+struct BindlessSamplerNode;
+struct SeparateSamplerNode;
-using TrackSamplerData = std::variant<BindlessSamplerNode, ArraySamplerNode>;
+using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>;
using TrackSampler = std::shared_ptr<TrackSamplerData>;
struct Sampler {
@@ -288,63 +289,51 @@ struct Sampler {
: index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow},
is_buffer{is_buffer}, is_indexed{is_indexed} {}
+ /// Separate sampler constructor
+ constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers,
+ Tegra::Shader::TextureType type, bool is_array, bool is_shadow,
+ bool is_buffer)
+ : index{index}, offset{offsets.first}, secondary_offset{offsets.second},
+ buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array},
+ is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {}
+
/// Bindless samplers constructor
constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type,
bool is_array, bool is_shadow, bool is_buffer, bool is_indexed)
: index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array},
is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {}
- u32 index = 0; ///< Emulated index given for the this sampler.
- u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read.
- u32 buffer = 0; ///< Buffer where the bindless sampler is being read (unused on bound samplers).
- u32 size = 1; ///< Size of the sampler.
+ u32 index = 0; ///< Emulated index given for this sampler.
+ u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read.
+ u32 secondary_offset = 0; ///< Secondary offset in the const buffer.
+ u32 buffer = 0; ///< Buffer where the bindless sampler is read.
+ u32 secondary_buffer = 0; ///< Secondary buffer where the bindless sampler is read.
+ u32 size = 1; ///< Size of the sampler.
Tegra::Shader::TextureType type{}; ///< The type used to sample this texture (Texture2D, etc)
- bool is_array = false; ///< Whether the texture is being sampled as an array texture or not.
- bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not.
- bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler.
- bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not.
- bool is_indexed = false; ///< Whether this sampler is an indexed array of textures.
+ bool is_array = false; ///< Whether the texture is being sampled as an array texture or not.
+ bool is_shadow = false; ///< Whether the texture is being sampled as a depth texture or not.
+ bool is_buffer = false; ///< Whether the texture is a texture buffer without sampler.
+ bool is_bindless = false; ///< Whether this sampler belongs to a bindless texture or not.
+ bool is_indexed = false; ///< Whether this sampler is an indexed array of textures.
+ bool is_separated = false; ///< Whether the image and sampler is separated or not.
};
/// Represents a tracked bindless sampler into a direct const buffer
-class ArraySamplerNode final {
-public:
- explicit ArraySamplerNode(u32 index, u32 base_offset, u32 bindless_var)
- : index{index}, base_offset{base_offset}, bindless_var{bindless_var} {}
-
- constexpr u32 GetIndex() const {
- return index;
- }
-
- constexpr u32 GetBaseOffset() const {
- return base_offset;
- }
-
- constexpr u32 GetIndexVar() const {
- return bindless_var;
- }
-
-private:
+struct ArraySamplerNode {
u32 index;
u32 base_offset;
u32 bindless_var;
};
-/// Represents a tracked bindless sampler into a direct const buffer
-class BindlessSamplerNode final {
-public:
- explicit BindlessSamplerNode(u32 index, u32 offset) : index{index}, offset{offset} {}
-
- constexpr u32 GetIndex() const {
- return index;
- }
-
- constexpr u32 GetOffset() const {
- return offset;
- }
+/// Represents a tracked separate sampler image pair that was folded statically
+struct SeparateSamplerNode {
+ std::pair<u32, u32> indices;
+ std::pair<u32, u32> offsets;
+};
-private:
+/// Represents a tracked bindless sampler into a direct const buffer
+struct BindlessSamplerNode {
u32 index;
u32 offset;
};
diff --git a/src/video_core/shader/node_helper.h b/src/video_core/shader/node_helper.h
index 11231bbea..1e0886185 100644
--- a/src/video_core/shader/node_helper.h
+++ b/src/video_core/shader/node_helper.h
@@ -48,7 +48,7 @@ Node MakeNode(Args&&... args) {
template <typename T, typename... Args>
TrackSampler MakeTrackSampler(Args&&... args) {
static_assert(std::is_convertible_v<T, TrackSamplerData>);
- return std::make_shared<TrackSamplerData>(T(std::forward<Args>(args)...));
+ return std::make_shared<TrackSamplerData>(T{std::forward<Args>(args)...});
}
template <typename... Args>
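The brace change above is not cosmetic: the sampler nodes in node.h lose their constructors and become aggregates, and aggregates cannot be built with parentheses before C++20. A two-line illustration:

    struct BindlessSamplerNode { // aggregate: no user-declared constructors
        u32 index;
        u32 offset;
    };

    // BindlessSamplerNode node(1, 2); // error before C++20: no matching constructor
    BindlessSamplerNode node{1, 2};    // aggregate initialization works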
diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp
index af70b3f35..cdf274e54 100644
--- a/src/video_core/shader/registry.cpp
+++ b/src/video_core/shader/registry.cpp
@@ -93,6 +93,26 @@ std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) {
return value;
}
+std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler(
+ std::pair<u32, u32> buffers, std::pair<u32, u32> offsets) {
+ SeparateSamplerKey key;
+ key.buffers = buffers;
+ key.offsets = offsets;
+ const auto iter = separate_samplers.find(key);
+ if (iter != separate_samplers.end()) {
+ return iter->second;
+ }
+ if (!engine) {
+ return std::nullopt;
+ }
+
+ const u32 handle_1 = engine->AccessConstBuffer32(stage, key.buffers.first, key.offsets.first);
+ const u32 handle_2 = engine->AccessConstBuffer32(stage, key.buffers.second, key.offsets.second);
+ const SamplerDescriptor value = engine->AccessSampler(handle_1 | handle_2);
+ separate_samplers.emplace(key, value);
+ return value;
+}
+
std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer,
u32 offset) {
const std::pair key = {buffer, offset};
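ObtainSeparateSampler recombines the two statically tracked const-buffer words with a plain OR before calling AccessSampler. That is sound if the handle keeps the texture index and the sampler index in disjoint bit ranges, which is how Maxwell handles are commonly described; a sketch of that assumed layout (the 20-bit field width is an assumption):

    // Assumed handle layout: TIC (texture) index low, TSC (sampler) index high.
    constexpr u32 MakeCombinedHandle(u32 tic_index, u32 tsc_index) {
        return tic_index | (tsc_index << 20);
    }
    // If handle_1 carries only TIC bits and handle_2 only TSC bits, then
    // MakeCombinedHandle(tic, tsc) == handle_1 | handle_2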
diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h
index 0c80d35fd..231206765 100644
--- a/src/video_core/shader/registry.h
+++ b/src/video_core/shader/registry.h
@@ -19,8 +19,39 @@
namespace VideoCommon::Shader {
+struct SeparateSamplerKey {
+ std::pair<u32, u32> buffers;
+ std::pair<u32, u32> offsets;
+};
+
+} // namespace VideoCommon::Shader
+
+namespace std {
+
+template <>
+struct hash<VideoCommon::Shader::SeparateSamplerKey> {
+ std::size_t operator()(const VideoCommon::Shader::SeparateSamplerKey& key) const noexcept {
+ return std::hash<u32>{}(key.buffers.first ^ key.buffers.second ^ key.offsets.first ^
+ key.offsets.second);
+ }
+};
+
+template <>
+struct equal_to<VideoCommon::Shader::SeparateSamplerKey> {
+ bool operator()(const VideoCommon::Shader::SeparateSamplerKey& lhs,
+ const VideoCommon::Shader::SeparateSamplerKey& rhs) const noexcept {
+ return lhs.buffers == rhs.buffers && lhs.offsets == rhs.offsets;
+ }
+};
+
+} // namespace std
+
+namespace VideoCommon::Shader {
+
using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
+using SeparateSamplerMap =
+ std::unordered_map<SeparateSamplerKey, Tegra::Engines::SamplerDescriptor>;
using BindlessSamplerMap =
std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
@@ -73,6 +104,9 @@ public:
std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
+ std::optional<Tegra::Engines::SamplerDescriptor> ObtainSeparateSampler(
+ std::pair<u32, u32> buffers, std::pair<u32, u32> offsets);
+
std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
/// Inserts a key.
@@ -128,6 +162,7 @@ private:
Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
KeyMap keys;
BoundSamplerMap bound_samplers;
+ SeparateSamplerMap separate_samplers;
BindlessSamplerMap bindless_samplers;
u32 bound_buffer;
GraphicsInfo graphics_info;
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 15ae152f2..3a98b2104 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -330,8 +330,8 @@ private:
OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation);
/// Queries the missing sampler info from the execution context.
- SamplerInfo GetSamplerInfo(SamplerInfo info, u32 offset,
- std::optional<u32> buffer = std::nullopt);
+ SamplerInfo GetSamplerInfo(SamplerInfo info,
+ std::optional<Tegra::Engines::SamplerDescriptor> sampler);
/// Accesses a texture sampler.
std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info);
@@ -409,8 +409,14 @@ private:
std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
- std::tuple<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code,
- s64 cursor);
+ std::pair<Node, TrackSampler> TrackBindlessSampler(Node tracked, const NodeBlock& code,
+ s64 cursor);
+
+ std::pair<Node, TrackSampler> HandleBindlessIndirectRead(const CbufNode& cbuf,
+ const OperationNode& operation,
+ Node gpr, Node base_offset,
+ Node tracked, const NodeBlock& code,
+ s64 cursor);
std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const;
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
index eb97bfd41..d5ed81442 100644
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -14,6 +14,7 @@
namespace VideoCommon::Shader {
namespace {
+
std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
OperationCode operation_code) {
for (; cursor >= 0; --cursor) {
@@ -63,7 +64,8 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {
if (const auto operation = std::get_if<OperationNode>(&*node)) {
operation->SetAmendIndex(amend_index);
return true;
- } else if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
+ }
+ if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
conditional->SetAmendIndex(amend_index);
return true;
}
@@ -72,40 +74,27 @@ bool AmendNodeCv(std::size_t amend_index, Node node) {
} // Anonymous namespace
-std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code,
- s64 cursor) {
+std::pair<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, const NodeBlock& code,
+ s64 cursor) {
if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
+ const u32 cbuf_index = cbuf->GetIndex();
+
// Constant buffer found, test if it's an immediate
const auto& offset = cbuf->GetOffset();
if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
- auto track =
- MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue());
+ auto track = MakeTrackSampler<BindlessSamplerNode>(cbuf_index, immediate->GetValue());
return {tracked, track};
}
if (const auto operation = std::get_if<OperationNode>(&*offset)) {
const u32 bound_buffer = registry.GetBoundBuffer();
- if (bound_buffer != cbuf->GetIndex()) {
+ if (bound_buffer != cbuf_index) {
return {};
}
- const auto pair = DecoupleIndirectRead(*operation);
- if (!pair) {
- return {};
+ if (const std::optional pair = DecoupleIndirectRead(*operation)) {
+ auto [gpr, base_offset] = *pair;
+ return HandleBindlessIndirectRead(*cbuf, *operation, gpr, base_offset, tracked,
+ code, cursor);
}
- auto [gpr, base_offset] = *pair;
- const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset);
- const auto& gpu_driver = registry.AccessGuestDriverProfile();
- const u32 bindless_cv = NewCustomVariable();
- Node op =
- Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize()));
-
- const Node cv_node = GetCustomVariable(bindless_cv);
- Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op));
- const std::size_t amend_index = DeclareAmend(std::move(amend_op));
- AmendNodeCv(amend_index, code[cursor]);
- // TODO Implement Bindless Index custom variable
- auto track = MakeTrackSampler<ArraySamplerNode>(cbuf->GetIndex(),
- offset_inm->GetValue(), bindless_cv);
- return {tracked, track};
}
return {};
}
@@ -122,10 +111,23 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
return TrackBindlessSampler(source, code, new_cursor);
}
if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
- for (std::size_t i = operation->GetOperandsCount(); i > 0; --i) {
- if (auto found = TrackBindlessSampler((*operation)[i - 1], code, cursor);
- std::get<0>(found)) {
- // Cbuf found in operand.
+ const OperationNode& op = *operation;
+
+ const OperationCode opcode = operation->GetCode();
+ if (opcode == OperationCode::IBitwiseOr || opcode == OperationCode::UBitwiseOr) {
+ ASSERT(op.GetOperandsCount() == 2);
+ auto [node_a, index_a, offset_a] = TrackCbuf(op[0], code, cursor);
+ auto [node_b, index_b, offset_b] = TrackCbuf(op[1], code, cursor);
+ if (node_a && node_b) {
+ auto track = MakeTrackSampler<SeparateSamplerNode>(std::pair{index_a, index_b},
+ std::pair{offset_a, offset_b});
+ return {tracked, std::move(track)};
+ }
+ }
+ std::size_t i = op.GetOperandsCount();
+ while (i--) {
+ if (auto found = TrackBindlessSampler(op[i], code, cursor); std::get<0>(found)) {
+ // Constant buffer found in operand.
return found;
}
}
@@ -139,6 +141,26 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
return {};
}
+std::pair<Node, TrackSampler> ShaderIR::HandleBindlessIndirectRead(
+ const CbufNode& cbuf, const OperationNode& operation, Node gpr, Node base_offset, Node tracked,
+ const NodeBlock& code, s64 cursor) {
+ const auto offset_imm = std::get<ImmediateNode>(*base_offset);
+ const auto& gpu_driver = registry.AccessGuestDriverProfile();
+ const u32 bindless_cv = NewCustomVariable();
+ const u32 texture_handler_size = gpu_driver.GetTextureHandlerSize();
+ Node op = Operation(OperationCode::UDiv, gpr, Immediate(texture_handler_size));
+
+ Node cv_node = GetCustomVariable(bindless_cv);
+ Node amend_op = Operation(OperationCode::Assign, std::move(cv_node), std::move(op));
+ const std::size_t amend_index = DeclareAmend(std::move(amend_op));
+ AmendNodeCv(amend_index, code[cursor]);
+
+ // TODO: Implement bindless index custom variable
+ auto track =
+ MakeTrackSampler<ArraySamplerNode>(cbuf.GetIndex(), offset_imm.GetValue(), bindless_cv);
+ return {tracked, track};
+}
+
std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,
s64 cursor) const {
if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
diff --git a/src/video_core/shader_cache.h b/src/video_core/shader_cache.h
new file mode 100644
index 000000000..2dd270e99
--- /dev/null
+++ b/src/video_core/shader_cache.h
@@ -0,0 +1,228 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+template <class T>
+class ShaderCache {
+ static constexpr u64 PAGE_BITS = 14;
+
+ struct Entry {
+ VAddr addr_start;
+ VAddr addr_end;
+ T* data;
+
+ bool is_memory_marked = true;
+
+ constexpr bool Overlaps(VAddr start, VAddr end) const noexcept {
+ return start < addr_end && addr_start < end;
+ }
+ };
+
+public:
+ virtual ~ShaderCache() = default;
+
+ /// @brief Removes shaders inside a given region
+ /// @note Checks for ranges
+ /// @param addr Start address of the invalidation
+ /// @param size Number of bytes of the invalidation
+ void InvalidateRegion(VAddr addr, std::size_t size) {
+ std::scoped_lock lock{invalidation_mutex};
+ InvalidatePagesInRegion(addr, size);
+ RemovePendingShaders();
+ }
+
+ /// @brief Unmarks a memory region as cached and marks it for removal
+ /// @param addr Start address of the CPU write operation
+ /// @param size Number of bytes of the CPU write operation
+ void OnCPUWrite(VAddr addr, std::size_t size) {
+ std::lock_guard lock{invalidation_mutex};
+ InvalidatePagesInRegion(addr, size);
+ }
+
+ /// @brief Flushes delayed removal operations
+ void SyncGuestHost() {
+ std::scoped_lock lock{invalidation_mutex};
+ RemovePendingShaders();
+ }
+
+ /// @brief Tries to obtain a cached shader starting in a given address
+ /// @note Doesn't check for ranges; the given address has to be the exact start of the shader
+ /// @param addr Start address of the shader; lookups are exact, not by region
+ /// @return Pointer to a valid shader, nullptr when nothing is found
+ T* TryGet(VAddr addr) const {
+ std::scoped_lock lock{lookup_mutex};
+
+ const auto it = lookup_cache.find(addr);
+ if (it == lookup_cache.end()) {
+ return nullptr;
+ }
+ return it->second->data;
+ }
+
+protected:
+ explicit ShaderCache(VideoCore::RasterizerInterface& rasterizer_) : rasterizer{rasterizer_} {}
+
+ /// @brief Register in the cache a given entry
+ /// @param data Shader to store in the cache
+ /// @param addr Start address of the shader that will be registered
+ /// @param size Size in bytes of the shader
+ void Register(std::unique_ptr<T> data, VAddr addr, std::size_t size) {
+ std::scoped_lock lock{invalidation_mutex, lookup_mutex};
+
+ const VAddr addr_end = addr + size;
+ Entry* const entry = NewEntry(addr, addr_end, data.get());
+
+ const u64 page_end = addr_end >> PAGE_BITS;
+ for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) {
+ invalidation_cache[page].push_back(entry);
+ }
+
+ storage.push_back(std::move(data));
+
+ rasterizer.UpdatePagesCachedCount(addr, size, 1);
+ }
+
+ /// @brief Called when a shader is going to be removed
+ /// @param shader Shader that will be removed
+ /// @pre invalidation_cache is locked
+ /// @pre lookup_mutex is locked
+ virtual void OnShaderRemoval([[maybe_unused]] T* shader) {}
+
+private:
+ /// @brief Invalidate pages in a given region
+ /// @pre invalidation_mutex is locked
+ void InvalidatePagesInRegion(VAddr addr, std::size_t size) {
+ const VAddr addr_end = addr + size;
+ const u64 page_end = addr_end >> PAGE_BITS;
+ for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) {
+ const auto it = invalidation_cache.find(page);
+ if (it == invalidation_cache.end()) {
+ continue;
+ }
+
+ std::vector<Entry*>& entries = it->second;
+ InvalidatePageEntries(entries, addr, addr_end);
+
+ // If there's nothing else in this page, remove it to avoid overpopulating the hash map.
+ if (entries.empty()) {
+ invalidation_cache.erase(it);
+ }
+ }
+ }
+
+ /// @brief Remove shaders marked for deletion
+ /// @pre invalidation_mutex is locked
+ void RemovePendingShaders() {
+ if (marked_for_removal.empty()) {
+ return;
+ }
+ std::scoped_lock lock{lookup_mutex};
+
+ std::vector<T*> removed_shaders;
+ removed_shaders.reserve(marked_for_removal.size());
+
+ for (Entry* const entry : marked_for_removal) {
+ if (lookup_cache.erase(entry->addr_start) > 0) {
+ removed_shaders.push_back(entry->data);
+ }
+ }
+ marked_for_removal.clear();
+
+ if (!removed_shaders.empty()) {
+ RemoveShadersFromStorage(std::move(removed_shaders));
+ }
+ }
+
+ /// @brief Invalidates entries in a given range for the passed page
+ /// @param entries Vector of entries in the page, it will be modified on overlaps
+ /// @param addr Start address of the invalidation
+ /// @param addr_end Non-inclusive end address of the invalidation
+ /// @pre invalidation_mutex is locked
+ void InvalidatePageEntries(std::vector<Entry*>& entries, VAddr addr, VAddr addr_end) {
+ auto it = entries.begin();
+ while (it != entries.end()) {
+ Entry* const entry = *it;
+ if (!entry->Overlaps(addr, addr_end)) {
+ ++it;
+ continue;
+ }
+ UnmarkMemory(entry);
+ marked_for_removal.push_back(entry);
+
+ it = entries.erase(it);
+ }
+ }
+
+ /// @brief Unmarks an entry from the rasterizer cache
+ /// @param entry Entry to unmark from memory
+ void UnmarkMemory(Entry* entry) {
+ if (!entry->is_memory_marked) {
+ return;
+ }
+ entry->is_memory_marked = false;
+
+ const VAddr addr = entry->addr_start;
+ const std::size_t size = entry->addr_end - addr;
+ rasterizer.UpdatePagesCachedCount(addr, size, -1);
+ }
+
+ /// @brief Removes a vector of shaders from a list
+ /// @param removed_shaders Shaders to be removed from the storage, it can contain duplicates
+ /// @pre invalidation_mutex is locked
+ /// @pre lookup_mutex is locked
+ void RemoveShadersFromStorage(std::vector<T*> removed_shaders) {
+ // Remove duplicates
+ std::sort(removed_shaders.begin(), removed_shaders.end());
+ removed_shaders.erase(std::unique(removed_shaders.begin(), removed_shaders.end()),
+ removed_shaders.end());
+
+ // Now that there are no duplicates, we can notify removals
+ for (T* const shader : removed_shaders) {
+ OnShaderRemoval(shader);
+ }
+
+ // Remove them from the cache
+ const auto is_removed = [&removed_shaders](std::unique_ptr<T>& shader) {
+ return std::find(removed_shaders.begin(), removed_shaders.end(), shader.get()) !=
+ removed_shaders.end();
+ };
+ storage.erase(std::remove_if(storage.begin(), storage.end(), is_removed), storage.end());
+ }
+
+ /// @brief Creates a new entry in the lookup cache and returns its pointer
+ /// @pre lookup_mutex is locked
+ Entry* NewEntry(VAddr addr, VAddr addr_end, T* data) {
+ auto entry = std::make_unique<Entry>(Entry{addr, addr_end, data});
+ Entry* const entry_pointer = entry.get();
+
+ lookup_cache.emplace(addr, std::move(entry));
+ return entry_pointer;
+ }
+
+ VideoCore::RasterizerInterface& rasterizer;
+
+ mutable std::mutex lookup_mutex;
+ std::mutex invalidation_mutex;
+
+ std::unordered_map<u64, std::unique_ptr<Entry>> lookup_cache;
+ std::unordered_map<u64, std::vector<Entry*>> invalidation_cache;
+ std::vector<std::unique_ptr<T>> storage;
+ std::vector<Entry*> marked_for_removal;
+};
+
+} // namespace VideoCommon
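A backend is expected to derive from this template, register shaders once built, and let InvalidateRegion/OnCPUWrite/SyncGuestHost drive eviction. A hedged sketch of a minimal derived cache (the Shader type and its build step are placeholders):

    class Shader; // backend-specific payload

    class MyShaderCache final : public VideoCommon::ShaderCache<Shader> {
    public:
        explicit MyShaderCache(VideoCore::RasterizerInterface& rasterizer)
            : ShaderCache{rasterizer} {}

        Shader* Obtain(VAddr addr, std::size_t size) {
            if (Shader* const shader = TryGet(addr)) {
                return shader; // hit: addr must be the exact shader start
            }
            auto shader = std::make_unique<Shader>(/* decode + compile */);
            Shader* const pointer = shader.get();
            Register(std::move(shader), addr, size); // marks pages as cached
            return pointer;
        }

    protected:
        void OnShaderRemoval(Shader* shader) override {
            // Release backend objects before the storage entry is destroyed.
        }
    };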
diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp
index 715f39d0d..0caf3b4f0 100644
--- a/src/video_core/texture_cache/surface_base.cpp
+++ b/src/video_core/texture_cache/surface_base.cpp
@@ -120,6 +120,9 @@ std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap(
}
const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)};
const auto layer{static_cast<u32>(relative_address / layer_size)};
+ if (layer >= params.depth) {
+ return {};
+ }
const GPUVAddr mipmap_address = relative_address - layer_size * layer;
const auto mipmap_it =
Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address);
@@ -248,12 +251,11 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
// Use an extra temporal buffer
auto& tmp_buffer = staging_cache.GetBuffer(1);
- // Special case for 3D Texture Segments
- const bool must_read_current_data =
- params.block_depth > 0 && params.target == VideoCore::Surface::SurfaceTarget::Texture2D;
tmp_buffer.resize(guest_memory_size);
host_ptr = tmp_buffer.data();
- if (must_read_current_data) {
+
+ if (params.target == SurfaceTarget::Texture3D) {
+ // Special case for 3D texture segments
memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size);
}
diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h
index 79e10ffbb..173f2edba 100644
--- a/src/video_core/texture_cache/surface_base.h
+++ b/src/video_core/texture_cache/surface_base.h
@@ -217,8 +217,8 @@ public:
}
bool IsProtected() const {
- // Only 3D Slices are to be protected
- return is_target && params.block_depth > 0;
+ // Only 3D slices are to be protected
+ return is_target && params.target == SurfaceTarget::Texture3D;
}
bool IsRenderTarget() const {
@@ -250,6 +250,11 @@ public:
return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels));
}
+ TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) {
+ return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth,
+ base_level, num_levels));
+ }
+
std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params,
const GPUVAddr view_addr,
const std::size_t candidate_size, const u32 mipmap,
@@ -272,8 +277,8 @@ public:
std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr,
const std::size_t candidate_size) {
if (params.target == SurfaceTarget::Texture3D ||
- (params.num_levels == 1 && !params.is_layered) ||
- view_params.target == SurfaceTarget::Texture3D) {
+ view_params.target == SurfaceTarget::Texture3D ||
+ (params.num_levels == 1 && !params.is_layered)) {
return {};
}
const auto layer_mipmap{GetLayerMipmap(view_addr)};
diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp
index 884fabffe..0b2b2b8c4 100644
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -215,10 +215,19 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz
params.num_levels = 1;
params.emulated_levels = 1;
- const bool is_layered = config.layers > 1 && params.block_depth == 0;
- params.is_layered = is_layered;
- params.depth = is_layered ? config.layers.Value() : 1;
- params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D;
+ if (config.memory_layout.is_3d != 0) {
+ params.depth = config.layers.Value();
+ params.is_layered = false;
+ params.target = SurfaceTarget::Texture3D;
+ } else if (config.layers > 1) {
+ params.depth = config.layers.Value();
+ params.is_layered = true;
+ params.target = SurfaceTarget::Texture2DArray;
+ } else {
+ params.depth = 1;
+ params.is_layered = false;
+ params.target = SurfaceTarget::Texture2D;
+ }
return params;
}
@@ -237,7 +246,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface(
params.width = config.width;
params.height = config.height;
params.pitch = config.pitch;
- // TODO(Rodrigo): Try to guess the surface target from depth and layer parameters
+ // TODO(Rodrigo): Try to guess texture arrays from parameters
params.target = SurfaceTarget::Texture2D;
params.depth = 1;
params.num_levels = 1;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 6f63217a2..6207d8dfe 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -24,6 +24,7 @@
#include "core/core.h"
#include "core/memory.h"
#include "core/settings.h"
+#include "video_core/compatible_formats.h"
#include "video_core/dirty_flags.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/engines/maxwell_3d.h"
@@ -47,8 +48,8 @@ class RasterizerInterface;
namespace VideoCommon {
+using VideoCore::Surface::FormatCompatibility;
using VideoCore::Surface::PixelFormat;
-
using VideoCore::Surface::SurfaceTarget;
using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
@@ -298,15 +299,13 @@ public:
const GPUVAddr src_gpu_addr = src_config.Address();
const GPUVAddr dst_gpu_addr = dst_config.Address();
DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr);
- const std::optional<VAddr> dst_cpu_addr =
- system.GPU().MemoryManager().GpuToCpuAddress(dst_gpu_addr);
- const std::optional<VAddr> src_cpu_addr =
- system.GPU().MemoryManager().GpuToCpuAddress(src_gpu_addr);
- std::pair<TSurface, TView> dst_surface =
- GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
- std::pair<TSurface, TView> src_surface =
- GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false);
- ImageBlit(src_surface.second, dst_surface.second, copy_config);
+
+ const auto& memory_manager = system.GPU().MemoryManager();
+ const std::optional<VAddr> dst_cpu_addr = memory_manager.GpuToCpuAddress(dst_gpu_addr);
+ const std::optional<VAddr> src_cpu_addr = memory_manager.GpuToCpuAddress(src_gpu_addr);
+ std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false);
+ TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true, false).second;
+ ImageBlit(src_surface, dst_surface.second, copy_config);
dst_surface.first->MarkAsModified(true, Tick());
}
@@ -508,12 +507,12 @@ private:
return RecycleStrategy::Flush;
}
// 3D Textures decision
- if (params.block_depth > 1 || params.target == SurfaceTarget::Texture3D) {
+ if (params.target == SurfaceTarget::Texture3D) {
return RecycleStrategy::Flush;
}
for (const auto& s : overlaps) {
const auto& s_params = s->GetSurfaceParams();
- if (s_params.block_depth > 1 || s_params.target == SurfaceTarget::Texture3D) {
+ if (s_params.target == SurfaceTarget::Texture3D) {
return RecycleStrategy::Flush;
}
}
@@ -597,7 +596,7 @@ private:
} else {
new_surface = GetUncachedSurface(gpu_addr, params);
}
- const auto& final_params = new_surface->GetSurfaceParams();
+ const SurfaceParams& final_params = new_surface->GetSurfaceParams();
if (cr_params.type != final_params.type) {
if (Settings::IsGPULevelExtreme()) {
BufferCopy(current_surface, new_surface);
@@ -605,7 +604,7 @@ private:
} else {
std::vector<CopyParams> bricks = current_surface->BreakDown(final_params);
for (auto& brick : bricks) {
- ImageCopy(current_surface, new_surface, brick);
+ TryCopyImage(current_surface, new_surface, brick);
}
}
Unregister(current_surface);
@@ -696,7 +695,7 @@ private:
}
const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height,
src_params.depth);
- ImageCopy(surface, new_surface, copy_params);
+ TryCopyImage(surface, new_surface, copy_params);
}
}
if (passed_tests == 0) {
@@ -731,51 +730,9 @@ private:
*/
std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps,
const SurfaceParams& params,
- const GPUVAddr gpu_addr,
- const VAddr cpu_addr,
+ GPUVAddr gpu_addr, VAddr cpu_addr,
bool preserve_contents) {
- if (params.target == SurfaceTarget::Texture3D) {
- bool failed = false;
- if (params.num_levels > 1) {
- // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach
- return std::nullopt;
- }
- TSurface new_surface = GetUncachedSurface(gpu_addr, params);
- bool modified = false;
- for (auto& surface : overlaps) {
- const SurfaceParams& src_params = surface->GetSurfaceParams();
- if (src_params.target != SurfaceTarget::Texture2D) {
- failed = true;
- break;
- }
- if (src_params.height != params.height) {
- failed = true;
- break;
- }
- if (src_params.block_depth != params.block_depth ||
- src_params.block_height != params.block_height) {
- failed = true;
- break;
- }
- const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr);
- const auto offsets = params.GetBlockOffsetXYZ(offset);
- const auto z = std::get<2>(offsets);
- modified |= surface->IsModified();
- const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height,
- 1);
- ImageCopy(surface, new_surface, copy_params);
- }
- if (failed) {
- return std::nullopt;
- }
- for (const auto& surface : overlaps) {
- Unregister(surface);
- }
- new_surface->MarkAsModified(modified, Tick());
- Register(new_surface);
- auto view = new_surface->GetMainView();
- return {{std::move(new_surface), view}};
- } else {
+ if (params.target != SurfaceTarget::Texture3D) {
for (const auto& surface : overlaps) {
if (!surface->MatchTarget(params.target)) {
if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) {
@@ -791,11 +748,60 @@ private:
continue;
}
if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) {
- return {{surface, surface->GetMainView()}};
+ return std::make_pair(surface, surface->GetMainView());
}
}
return InitializeSurface(gpu_addr, params, preserve_contents);
}
+
+ if (params.num_levels > 1) {
+ // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach
+ return std::nullopt;
+ }
+
+ if (overlaps.size() == 1) {
+ const auto& surface = overlaps[0];
+ const SurfaceParams& overlap_params = surface->GetSurfaceParams();
+ // Don't attempt to render to textures with more than one level for now
+ // The texture has to be at or to the right of the overlapped surface's address to render to it
+ if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) {
+ const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr());
+ const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset));
+ if (slice < overlap_params.depth) {
+ auto view = surface->Emplace3DView(slice, params.depth, 0, 1);
+ return std::make_pair(std::move(surface), std::move(view));
+ }
+ }
+ }
+
+ TSurface new_surface = GetUncachedSurface(gpu_addr, params);
+ bool modified = false;
+
+ for (auto& surface : overlaps) {
+ const SurfaceParams& src_params = surface->GetSurfaceParams();
+ if (src_params.target != SurfaceTarget::Texture2D ||
+ src_params.height != params.height ||
+ src_params.block_depth != params.block_depth ||
+ src_params.block_height != params.block_height) {
+ return std::nullopt;
+ }
+ modified |= surface->IsModified();
+
+ const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr);
+ const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset));
+ const u32 width = params.width;
+ const u32 height = params.height;
+ const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1);
+ TryCopyImage(surface, new_surface, copy_params);
+ }
+ for (const auto& surface : overlaps) {
+ Unregister(surface);
+ }
+ new_surface->MarkAsModified(modified, Tick());
+ Register(new_surface);
+
+ TView view = new_surface->GetMainView();
+ return std::make_pair(std::move(new_surface), std::move(view));
}
/**
@@ -873,7 +879,7 @@ private:
}
}
- // Check if it's a 3D texture
+ // Manage 3D textures
if (params.block_depth > 0) {
auto surface =
Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents);
@@ -1048,7 +1054,7 @@ private:
void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params,
const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) {
auto deduced_src = DeduceSurface(src_gpu_addr, src_params);
- auto deduced_dst = DeduceSurface(src_gpu_addr, src_params);
+ auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params);
if (deduced_src.Failed() || deduced_dst.Failed()) {
return;
}
@@ -1187,6 +1193,19 @@ private:
return {};
}
+ /// Try to do an image copy, logging an error when the formats are incompatible.
+ void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) {
+ const SurfaceParams& src_params = src->GetSurfaceParams();
+ const SurfaceParams& dst_params = dst->GetSurfaceParams();
+ if (!format_compatibility.TestCopy(src_params.pixel_format, dst_params.pixel_format)) {
+ LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}",
+ static_cast<int>(dst_params.pixel_format),
+ static_cast<int>(src_params.pixel_format));
+ return;
+ }
+ ImageCopy(src, dst, copy);
+ }
+
constexpr PixelFormat GetSiblingFormat(PixelFormat format) const {
return siblings_table[static_cast<std::size_t>(format)];
}
@@ -1236,6 +1255,7 @@ private:
VideoCore::RasterizerInterface& rasterizer;
FormatLookupTable format_lookup_table;
+ FormatCompatibility format_compatibility;
u64 ticks{};
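TryCopyImage turns what was an unconditional ImageCopy into a guarded one, backed by the new FormatCompatibility member. One plausible organization for such a table is a bitset of legal copy destinations per source format; a sketch under that assumption (the real compatible_formats.cpp may be structured differently, and the enum is a placeholder):

    #include <array>
    #include <bitset>
    #include <cstddef>

    enum class PixelFormat : std::size_t {}; // placeholder for the real enum

    class FormatCompatibility {
    public:
        bool TestCopy(PixelFormat src, PixelFormat dst) const {
            return copy_table[static_cast<std::size_t>(src)]
                .test(static_cast<std::size_t>(dst));
        }

    private:
        // Filled at construction from the API's format compatibility classes.
        std::array<std::bitset<256>, 256> copy_table{};
    };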