Diffstat (limited to 'src/video_core/shader')
-rw-r--r--  src/video_core/shader/decode/image.cpp   | 104
-rw-r--r--  src/video_core/shader/decode/memory.cpp  |  48
-rw-r--r--  src/video_core/shader/decode/shift.cpp   |  19
-rw-r--r--  src/video_core/shader/decode/warp.cpp    |  47
-rw-r--r--  src/video_core/shader/node.h             | 109
-rw-r--r--  src/video_core/shader/shader_ir.cpp      |   9
-rw-r--r--  src/video_core/shader/shader_ir.h        |  20
7 files changed, 279 insertions, 77 deletions
diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp
index 77151a24b..d54fb88c9 100644
--- a/src/video_core/shader/decode/image.cpp
+++ b/src/video_core/shader/decode/image.cpp
@@ -44,7 +44,6 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) {
switch (opcode->get().GetId()) {
case OpCode::Id::SUST: {
UNIMPLEMENTED_IF(instr.sust.mode != Tegra::Shader::SurfaceDataMode::P);
- UNIMPLEMENTED_IF(instr.sust.image_type == Tegra::Shader::ImageType::TextureBuffer);
UNIMPLEMENTED_IF(instr.sust.out_of_bounds_store != Tegra::Shader::OutOfBoundsStore::Ignore);
UNIMPLEMENTED_IF(instr.sust.component_mask_selector != 0xf); // Ensure we have an RGBA store
@@ -61,56 +60,105 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) {
}
const auto type{instr.sust.image_type};
- const auto& image{instr.sust.is_immediate ? GetImage(instr.image, type)
- : GetBindlessImage(instr.gpr39, type)};
+ auto& image{instr.sust.is_immediate ? GetImage(instr.image, type)
+ : GetBindlessImage(instr.gpr39, type)};
+ image.MarkWrite();
+
MetaImage meta{image, values};
- const Node store{Operation(OperationCode::ImageStore, meta, std::move(coords))};
- bb.push_back(store);
+ bb.push_back(Operation(OperationCode::ImageStore, meta, std::move(coords)));
+ break;
+ }
+ case OpCode::Id::SUATOM: {
+ UNIMPLEMENTED_IF(instr.suatom_d.is_ba != 0);
+
+ Node value = GetRegister(instr.gpr0);
+
+ std::vector<Node> coords;
+ const std::size_t num_coords{GetImageTypeNumCoordinates(instr.sust.image_type)};
+ for (std::size_t i = 0; i < num_coords; ++i) {
+ coords.push_back(GetRegister(instr.gpr8.Value() + i));
+ }
+
+ const OperationCode operation_code = [instr] {
+ switch (instr.suatom_d.operation) {
+ case Tegra::Shader::ImageAtomicOperation::Add:
+ return OperationCode::AtomicImageAdd;
+ case Tegra::Shader::ImageAtomicOperation::Min:
+ return OperationCode::AtomicImageMin;
+ case Tegra::Shader::ImageAtomicOperation::Max:
+ return OperationCode::AtomicImageMax;
+ case Tegra::Shader::ImageAtomicOperation::And:
+ return OperationCode::AtomicImageAnd;
+ case Tegra::Shader::ImageAtomicOperation::Or:
+ return OperationCode::AtomicImageOr;
+ case Tegra::Shader::ImageAtomicOperation::Xor:
+ return OperationCode::AtomicImageXor;
+ case Tegra::Shader::ImageAtomicOperation::Exch:
+ return OperationCode::AtomicImageExchange;
+ default:
+ UNIMPLEMENTED_MSG("Unimplemented operation={}",
+ static_cast<u32>(instr.suatom_d.operation.Value()));
+ return OperationCode::AtomicImageAdd;
+ }
+ }();
+
+ const auto& image{GetImage(instr.image, instr.suatom_d.image_type, instr.suatom_d.size)};
+ MetaImage meta{image, {std::move(value)}};
+ SetRegister(bb, instr.gpr0, Operation(operation_code, meta, std::move(coords)));
break;
}
default:
- UNIMPLEMENTED_MSG("Unhandled conversion instruction: {}", opcode->get().GetName());
+ UNIMPLEMENTED_MSG("Unhandled image instruction: {}", opcode->get().GetName());
}
return pc;
}
-const Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type) {
+Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type,
+ std::optional<Tegra::Shader::ImageAtomicSize> size) {
const auto offset{static_cast<std::size_t>(image.index.Value())};
-
- // If this image has already been used, return the existing mapping.
- const auto itr{std::find_if(used_images.begin(), used_images.end(),
- [=](const Image& entry) { return entry.GetOffset() == offset; })};
- if (itr != used_images.end()) {
- ASSERT(itr->GetType() == type);
- return *itr;
+ if (const auto image = TryUseExistingImage(offset, type, size)) {
+ return *image;
}
- // Otherwise create a new mapping for this image.
const std::size_t next_index{used_images.size()};
- const Image entry{offset, next_index, type};
- return *used_images.emplace(entry).first;
+ return used_images.emplace(offset, Image{offset, next_index, type, size}).first->second;
}
-const Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg,
- Tegra::Shader::ImageType type) {
+Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type,
+ std::optional<Tegra::Shader::ImageAtomicSize> size) {
const Node image_register{GetRegister(reg)};
const auto [base_image, cbuf_index, cbuf_offset]{
TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size()))};
const auto cbuf_key{(static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset)};
- // If this image has already been used, return the existing mapping.
- const auto itr{std::find_if(used_images.begin(), used_images.end(),
- [=](const Image& entry) { return entry.GetOffset() == cbuf_key; })};
- if (itr != used_images.end()) {
- ASSERT(itr->GetType() == type);
- return *itr;
+ if (const auto image = TryUseExistingImage(cbuf_key, type, size)) {
+ return *image;
}
- // Otherwise create a new mapping for this image.
const std::size_t next_index{used_images.size()};
- const Image entry{cbuf_index, cbuf_offset, next_index, type};
- return *used_images.emplace(entry).first;
+ return used_images.emplace(cbuf_key, Image{cbuf_index, cbuf_offset, next_index, type, size})
+ .first->second;
+}
+
+Image* ShaderIR::TryUseExistingImage(u64 offset, Tegra::Shader::ImageType type,
+ std::optional<Tegra::Shader::ImageAtomicSize> size) {
+ auto it = used_images.find(offset);
+ if (it == used_images.end()) {
+ return nullptr;
+ }
+ auto& image = it->second;
+ ASSERT(image.GetType() == type);
+
+ if (size) {
+ // A size is provided here: if the image already has a known size it must match, otherwise record it.
+ if (image.IsSizeKnown()) {
+ ASSERT(image.GetSize() == size);
+ } else {
+ image.SetSize(*size);
+ }
+ }
+ return &image;
}
} // namespace VideoCommon::Shader
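
With this change, used_images is keyed by a 64-bit value (the immediate image offset, or a packed cbuf_index/cbuf_offset for bindless images), and an entry created without an atomic size can pick one up on a later use. Below is a minimal standalone sketch of that find-or-insert-with-reconciliation pattern; ImageEntry and UseImage are hypothetical stand-ins, not the real ShaderIR members.

    // Sketch only: hypothetical stand-in types for the used_images bookkeeping above.
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <optional>

    enum class ImageType { Texture1D, Texture2D, Texture3D };
    enum class ImageAtomicSize { U32, S32 };

    struct ImageEntry {
        std::size_t index{};
        ImageType type{};
        std::optional<ImageAtomicSize> size{};
    };

    // Key: immediate offset, or (u64{cbuf_index} << 32) | cbuf_offset for bindless images.
    std::map<std::uint64_t, ImageEntry> used_images;

    ImageEntry& UseImage(std::uint64_t key, ImageType type, std::optional<ImageAtomicSize> size) {
        if (const auto it = used_images.find(key); it != used_images.end()) {
            ImageEntry& image = it->second;
            assert(image.type == type);
            if (size) {
                // Reconcile the atomic size: it must match a previously recorded one,
                // otherwise record it now.
                if (image.size) {
                    assert(*image.size == *size);
                } else {
                    image.size = size;
                }
            }
            return image;
        }
        const std::size_t next_index = used_images.size();
        return used_images.emplace(key, ImageEntry{next_index, type, size}).first->second;
    }

    int main() {
        UseImage(0, ImageType::Texture2D, std::nullopt);                        // first use, size unknown
        auto& image = UseImage(0, ImageType::Texture2D, ImageAtomicSize::U32);  // atomic use
        assert(image.size == ImageAtomicSize::U32);                             // size reconciled
    }
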
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index ed108bea8..7923d4d69 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -35,7 +35,7 @@ u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) {
return 1;
}
}
-} // namespace
+} // Anonymous namespace
u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
const Instruction instr = {program_code[pc]};
@@ -106,16 +106,17 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
}
break;
}
- case OpCode::Id::LD_L: {
- LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}",
- static_cast<u64>(instr.ld_l.unknown.Value()));
-
- const auto GetLmem = [&](s32 offset) {
+ case OpCode::Id::LD_L:
+ LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", static_cast<u64>(instr.ld_l.unknown));
+ [[fallthrough]];
+ case OpCode::Id::LD_S: {
+ const auto GetMemory = [&](s32 offset) {
ASSERT(offset % 4 == 0);
const Node immediate_offset = Immediate(static_cast<s32>(instr.smem_imm) + offset);
const Node address = Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8),
immediate_offset);
- return GetLocalMemory(address);
+ return opcode->get().GetId() == OpCode::Id::LD_S ? GetSharedMemory(address)
+ : GetLocalMemory(address);
};
switch (instr.ldst_sl.type.Value()) {
@@ -135,14 +136,16 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
return 0;
}
}();
- for (u32 i = 0; i < count; ++i)
- SetTemporary(bb, i, GetLmem(i * 4));
- for (u32 i = 0; i < count; ++i)
+ for (u32 i = 0; i < count; ++i) {
+ SetTemporary(bb, i, GetMemory(i * 4));
+ }
+ for (u32 i = 0; i < count; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
+ }
break;
}
default:
- UNIMPLEMENTED_MSG("LD_L Unhandled type: {}",
+ UNIMPLEMENTED_MSG("{} Unhandled type: {}", opcode->get().GetName(),
static_cast<u32>(instr.ldst_sl.type.Value()));
}
break;
@@ -209,27 +212,34 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
break;
}
- case OpCode::Id::ST_L: {
+ case OpCode::Id::ST_L:
LOG_DEBUG(HW_GPU, "ST_L cache management mode: {}",
static_cast<u64>(instr.st_l.cache_management.Value()));
-
- const auto GetLmemAddr = [&](s32 offset) {
+ [[fallthrough]];
+ case OpCode::Id::ST_S: {
+ const auto GetAddress = [&](s32 offset) {
ASSERT(offset % 4 == 0);
const Node immediate = Immediate(static_cast<s32>(instr.smem_imm) + offset);
return Operation(OperationCode::IAdd, NO_PRECISE, GetRegister(instr.gpr8), immediate);
};
+ const auto set_memory = opcode->get().GetId() == OpCode::Id::ST_L
+ ? &ShaderIR::SetLocalMemory
+ : &ShaderIR::SetSharedMemory;
+
switch (instr.ldst_sl.type.Value()) {
case Tegra::Shader::StoreType::Bits128:
- SetLocalMemory(bb, GetLmemAddr(12), GetRegister(instr.gpr0.Value() + 3));
- SetLocalMemory(bb, GetLmemAddr(8), GetRegister(instr.gpr0.Value() + 2));
+ (this->*set_memory)(bb, GetAddress(12), GetRegister(instr.gpr0.Value() + 3));
+ (this->*set_memory)(bb, GetAddress(8), GetRegister(instr.gpr0.Value() + 2));
+ [[fallthrough]];
case Tegra::Shader::StoreType::Bits64:
- SetLocalMemory(bb, GetLmemAddr(4), GetRegister(instr.gpr0.Value() + 1));
+ (this->*set_memory)(bb, GetAddress(4), GetRegister(instr.gpr0.Value() + 1));
+ [[fallthrough]];
case Tegra::Shader::StoreType::Bits32:
- SetLocalMemory(bb, GetLmemAddr(0), GetRegister(instr.gpr0));
+ (this->*set_memory)(bb, GetAddress(0), GetRegister(instr.gpr0));
break;
default:
- UNIMPLEMENTED_MSG("ST_L Unhandled type: {}",
+ UNIMPLEMENTED_MSG("{} unhandled type: {}", opcode->get().GetName(),
static_cast<u32>(instr.ldst_sl.type.Value()));
}
break;
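
The merged ST_L/ST_S path above shares one store sequence by picking a pointer-to-member function and calling it through this->*, with deliberate [[fallthrough]]s so a 128-bit store also emits the 64- and 32-bit parts. A small self-contained sketch of that dispatch mechanism follows; Writer and its members are hypothetical, not the ShaderIR API.

    #include <iostream>

    class Writer {
    public:
        void Run(bool shared) {
            // Select the member function once, then call it through this->*.
            void (Writer::*set_memory)(int, int) =
                shared ? &Writer::SetSharedMemory : &Writer::SetLocalMemory;
            (this->*set_memory)(0, 123);
        }

    private:
        void SetLocalMemory(int address, int value) {
            std::cout << "local[" << address << "] = " << value << '\n';
        }
        void SetSharedMemory(int address, int value) {
            std::cout << "shared[" << address << "] = " << value << '\n';
        }
    };

    int main() {
        Writer{}.Run(true);  // prints: shared[0] = 123
        Writer{}.Run(false); // prints: local[0] = 123
    }
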
diff --git a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp
index 2ac16eeb0..f6ee68a54 100644
--- a/src/video_core/shader/decode/shift.cpp
+++ b/src/video_core/shader/decode/shift.cpp
@@ -17,8 +17,8 @@ u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) {
const Instruction instr = {program_code[pc]};
const auto opcode = OpCode::Decode(instr);
- const Node op_a = GetRegister(instr.gpr8);
- const Node op_b = [&]() {
+ Node op_a = GetRegister(instr.gpr8);
+ Node op_b = [&]() {
if (instr.is_b_imm) {
return Immediate(instr.alu.GetSignedImm20_20());
} else if (instr.is_b_gpr) {
@@ -32,16 +32,23 @@ u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) {
case OpCode::Id::SHR_C:
case OpCode::Id::SHR_R:
case OpCode::Id::SHR_IMM: {
- const Node value = SignedOperation(OperationCode::IArithmeticShiftRight,
- instr.shift.is_signed, PRECISE, op_a, op_b);
+ if (instr.shr.wrap) {
+ op_b = Operation(OperationCode::UBitwiseAnd, std::move(op_b), Immediate(0x1f));
+ } else {
+ op_b = Operation(OperationCode::IMax, std::move(op_b), Immediate(0));
+ op_b = Operation(OperationCode::IMin, std::move(op_b), Immediate(31));
+ }
+
+ Node value = SignedOperation(OperationCode::IArithmeticShiftRight, instr.shift.is_signed,
+ std::move(op_a), std::move(op_b));
SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
- SetRegister(bb, instr.gpr0, value);
+ SetRegister(bb, instr.gpr0, std::move(value));
break;
}
case OpCode::Id::SHL_C:
case OpCode::Id::SHL_R:
case OpCode::Id::SHL_IMM: {
- const Node value = Operation(OperationCode::ILogicalShiftLeft, PRECISE, op_a, op_b);
+ const Node value = Operation(OperationCode::ILogicalShiftLeft, op_a, op_b);
SetInternalFlagsFromInteger(bb, value, instr.generates_cc);
SetRegister(bb, instr.gpr0, value);
break;
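
The SHR change models hardware shift-amount handling: with the wrap bit set the amount is masked to its low five bits (UBitwiseAnd with 0x1f), otherwise it is clamped to [0, 31] via the IMax/IMin pair. The scalar sketch below shows what those emitted nodes compute on plain integers; it is an illustration, not IR code.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    std::uint32_t ShiftAmount(std::int32_t b, bool wrap) {
        return wrap ? static_cast<std::uint32_t>(b) & 0x1f
                    : static_cast<std::uint32_t>(std::clamp(b, 0, 31));
    }

    int main() {
        assert(ShiftAmount(33, true) == 1);   // 33 & 0x1f
        assert(ShiftAmount(33, false) == 31); // clamped to the upper bound
        assert(ShiftAmount(-1, true) == 31);  // low five bits of 0xFFFFFFFF
        assert(ShiftAmount(-1, false) == 0);  // clamped to the lower bound
    }
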
diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp
index 04ca74f46..a8e481b3c 100644
--- a/src/video_core/shader/decode/warp.cpp
+++ b/src/video_core/shader/decode/warp.cpp
@@ -13,6 +13,7 @@ namespace VideoCommon::Shader {
using Tegra::Shader::Instruction;
using Tegra::Shader::OpCode;
using Tegra::Shader::Pred;
+using Tegra::Shader::ShuffleOperation;
using Tegra::Shader::VoteOperation;
namespace {
@@ -44,6 +45,52 @@ u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
SetPredicate(bb, instr.vote.dest_pred, vote);
break;
}
+ case OpCode::Id::SHFL: {
+ Node mask = instr.shfl.is_mask_imm ? Immediate(static_cast<u32>(instr.shfl.mask_imm))
+ : GetRegister(instr.gpr39);
+ Node width = [&] {
+ // Convert the obscure SHFL mask back into GL_NV_shader_thread_shuffle's width. This was
+ // derived by reversing Nvidia's math. It won't work in all cases because SHFL takes
+ // parameters that don't map cleanly to GLSL's interface, but it should cover the code
+ // emitted by Nvidia's compiler.
+ if (instr.shfl.operation == ShuffleOperation::Up) {
+ return Operation(
+ OperationCode::ILogicalShiftRight,
+ Operation(OperationCode::IAdd, std::move(mask), Immediate(-0x2000)),
+ Immediate(8));
+ } else {
+ return Operation(OperationCode::ILogicalShiftRight,
+ Operation(OperationCode::IAdd, Immediate(0x201F),
+ Operation(OperationCode::INegate, std::move(mask))),
+ Immediate(8));
+ }
+ }();
+
+ const auto [operation, in_range] = [instr]() -> std::pair<OperationCode, OperationCode> {
+ switch (instr.shfl.operation) {
+ case ShuffleOperation::Idx:
+ return {OperationCode::ShuffleIndexed, OperationCode::InRangeShuffleIndexed};
+ case ShuffleOperation::Up:
+ return {OperationCode::ShuffleUp, OperationCode::InRangeShuffleUp};
+ case ShuffleOperation::Down:
+ return {OperationCode::ShuffleDown, OperationCode::InRangeShuffleDown};
+ case ShuffleOperation::Bfly:
+ return {OperationCode::ShuffleButterfly, OperationCode::InRangeShuffleButterfly};
+ }
+ UNREACHABLE_MSG("Invalid SHFL operation: {}",
+ static_cast<u64>(instr.shfl.operation.Value()));
+ return {};
+ }();
+
+ // Setting the predicate before the register is intentional to avoid overwriting.
+ Node index = instr.shfl.is_index_imm ? Immediate(static_cast<u32>(instr.shfl.index_imm))
+ : GetRegister(instr.gpr20);
+ SetPredicate(bb, instr.shfl.pred48, Operation(in_range, index, width));
+ SetRegister(
+ bb, instr.gpr0,
+ Operation(operation, GetRegister(instr.gpr8), std::move(index), std::move(width)));
+ break;
+ }
default:
UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
break;
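
The width recovery above computes (mask - 0x2000) >> 8 for ShuffleUp and (0x201F - mask) >> 8 for the other modes. The worked example below exercises that arithmetic; the sample masks are illustrative inputs consistent with the formulas, not values guaranteed to be emitted by Nvidia's compiler.

    #include <cassert>
    #include <cstdint>

    std::uint32_t WidthFromMask(std::uint32_t mask, bool is_up) {
        return is_up ? (mask + static_cast<std::uint32_t>(-0x2000)) >> 8
                     : (0x201fu - mask) >> 8;
    }

    int main() {
        assert(WidthFromMask(0x001f, false) == 32); // (0x201f - 0x001f) >> 8
        assert(WidthFromMask(0x101f, false) == 16); // (0x201f - 0x101f) >> 8
        assert(WidthFromMask(0x4000, true) == 32);  // (0x4000 - 0x2000) >> 8
        assert(WidthFromMask(0x3000, true) == 16);  // (0x3000 - 0x2000) >> 8
    }
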
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 5db9313c4..abf2cb1ab 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -7,6 +7,7 @@
#include <array>
#include <cstddef>
#include <memory>
+#include <optional>
#include <string>
#include <tuple>
#include <utility>
@@ -148,7 +149,14 @@ enum class OperationCode {
TextureQueryLod, /// (MetaTexture, float[N] coords) -> float4
TexelFetch, /// (MetaTexture, int[N], int) -> float4
- ImageStore, /// (MetaImage, float[N] coords) -> void
+ ImageStore, /// (MetaImage, int[N] values) -> void
+ AtomicImageAdd, /// (MetaImage, int[N] coords) -> void
+ AtomicImageMin, /// (MetaImage, int[N] coords) -> void
+ AtomicImageMax, /// (MetaImage, int[N] coords) -> void
+ AtomicImageAnd, /// (MetaImage, int[N] coords) -> void
+ AtomicImageOr, /// (MetaImage, int[N] coords) -> void
+ AtomicImageXor, /// (MetaImage, int[N] coords) -> void
+ AtomicImageExchange, /// (MetaImage, int[N] coords) -> void
Branch, /// (uint branch_target) -> void
BranchIndirect, /// (uint branch_target) -> void
@@ -173,6 +181,16 @@ enum class OperationCode {
VoteAny, /// (bool) -> bool
VoteEqual, /// (bool) -> bool
+ ShuffleIndexed, /// (uint value, uint index, uint width) -> uint
+ ShuffleUp, /// (uint value, uint index, uint width) -> uint
+ ShuffleDown, /// (uint value, uint index, uint width) -> uint
+ ShuffleButterfly, /// (uint value, uint index, uint width) -> uint
+
+ InRangeShuffleIndexed, /// (uint index, uint width) -> bool
+ InRangeShuffleUp, /// (uint index, uint width) -> bool
+ InRangeShuffleDown, /// (uint index, uint width) -> bool
+ InRangeShuffleButterfly, /// (uint index, uint width) -> bool
+
Amount,
};
@@ -198,12 +216,13 @@ class PredicateNode;
class AbufNode;
class CbufNode;
class LmemNode;
+class SmemNode;
class GmemNode;
class CommentNode;
using NodeData =
std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode, InternalFlagNode,
- PredicateNode, AbufNode, CbufNode, LmemNode, GmemNode, CommentNode>;
+ PredicateNode, AbufNode, CbufNode, LmemNode, SmemNode, GmemNode, CommentNode>;
using Node = std::shared_ptr<NodeData>;
using Node4 = std::array<Node, 4>;
using NodeBlock = std::vector<Node>;
@@ -273,46 +292,85 @@ private:
bool is_bindless{}; ///< Whether this sampler belongs to a bindless texture or not.
};
-class Image {
+class Image final {
public:
- explicit Image(std::size_t offset, std::size_t index, Tegra::Shader::ImageType type)
- : offset{offset}, index{index}, type{type}, is_bindless{false} {}
+ constexpr explicit Image(std::size_t offset, std::size_t index, Tegra::Shader::ImageType type,
+ std::optional<Tegra::Shader::ImageAtomicSize> size)
+ : offset{offset}, index{index}, type{type}, is_bindless{false}, size{size} {}
- explicit Image(u32 cbuf_index, u32 cbuf_offset, std::size_t index,
- Tegra::Shader::ImageType type)
+ constexpr explicit Image(u32 cbuf_index, u32 cbuf_offset, std::size_t index,
+ Tegra::Shader::ImageType type,
+ std::optional<Tegra::Shader::ImageAtomicSize> size)
: offset{(static_cast<u64>(cbuf_index) << 32) | cbuf_offset}, index{index}, type{type},
- is_bindless{true} {}
+ is_bindless{true}, size{size} {}
- explicit Image(std::size_t offset, std::size_t index, Tegra::Shader::ImageType type,
- bool is_bindless)
- : offset{offset}, index{index}, type{type}, is_bindless{is_bindless} {}
+ constexpr explicit Image(std::size_t offset, std::size_t index, Tegra::Shader::ImageType type,
+ bool is_bindless, bool is_written, bool is_read,
+ std::optional<Tegra::Shader::ImageAtomicSize> size)
+ : offset{offset}, index{index}, type{type}, is_bindless{is_bindless},
+ is_written{is_written}, is_read{is_read}, size{size} {}
- std::size_t GetOffset() const {
+ void MarkWrite() {
+ is_written = true;
+ }
+
+ void MarkRead() {
+ is_read = true;
+ }
+
+ void SetSize(Tegra::Shader::ImageAtomicSize size_) {
+ size = size_;
+ }
+
+ constexpr std::size_t GetOffset() const {
return offset;
}
- std::size_t GetIndex() const {
+ constexpr std::size_t GetIndex() const {
return index;
}
- Tegra::Shader::ImageType GetType() const {
+ constexpr Tegra::Shader::ImageType GetType() const {
return type;
}
- bool IsBindless() const {
+ constexpr bool IsBindless() const {
return is_bindless;
}
- bool operator<(const Image& rhs) const {
- return std::tie(offset, index, type, is_bindless) <
- std::tie(rhs.offset, rhs.index, rhs.type, rhs.is_bindless);
+ constexpr bool IsWritten() const {
+ return is_written;
+ }
+
+ constexpr bool IsRead() const {
+ return is_read;
+ }
+
+ constexpr std::pair<u32, u32> GetBindlessCBuf() const {
+ return {static_cast<u32>(offset >> 32), static_cast<u32>(offset)};
+ }
+
+ constexpr bool IsSizeKnown() const {
+ return size.has_value();
+ }
+
+ constexpr Tegra::Shader::ImageAtomicSize GetSize() const {
+ return size.value();
+ }
+
+ constexpr bool operator<(const Image& rhs) const {
+ return std::tie(offset, index, type, size, is_bindless) <
+ std::tie(rhs.offset, rhs.index, rhs.type, rhs.size, rhs.is_bindless);
}
private:
- std::size_t offset{};
+ u64 offset{};
std::size_t index{};
Tegra::Shader::ImageType type{};
bool is_bindless{};
+ bool is_written{};
+ bool is_read{};
+ std::optional<Tegra::Shader::ImageAtomicSize> size{};
};
struct GlobalMemoryBase {
@@ -536,6 +594,19 @@ private:
Node address;
};
+/// Shared memory node
+class SmemNode final {
+public:
+ explicit SmemNode(Node address) : address{std::move(address)} {}
+
+ const Node& GetAddress() const {
+ return address;
+ }
+
+private:
+ Node address;
+};
+
/// Global memory node
class GmemNode final {
public:
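
Image now stores the bindless constant-buffer index and offset packed into its single 64-bit offset field and exposes them again through GetBindlessCBuf(). A sketch of that packing and unpacking, with hypothetical free functions standing in for the constructor and accessor:

    #include <cassert>
    #include <cstdint>
    #include <utility>

    constexpr std::uint64_t PackBindless(std::uint32_t cbuf_index, std::uint32_t cbuf_offset) {
        return (static_cast<std::uint64_t>(cbuf_index) << 32) | cbuf_offset;
    }

    constexpr std::pair<std::uint32_t, std::uint32_t> UnpackBindless(std::uint64_t key) {
        return {static_cast<std::uint32_t>(key >> 32), static_cast<std::uint32_t>(key)};
    }

    int main() {
        static_assert(PackBindless(3, 0x140) == 0x3'0000'0140);
        const auto [index, offset] = UnpackBindless(PackBindless(3, 0x140));
        assert(index == 3 && offset == 0x140);
    }
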
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 1e5c7f660..bbbab0bca 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -137,6 +137,10 @@ Node ShaderIR::GetLocalMemory(Node address) {
return MakeNode<LmemNode>(std::move(address));
}
+Node ShaderIR::GetSharedMemory(Node address) {
+ return MakeNode<SmemNode>(std::move(address));
+}
+
Node ShaderIR::GetTemporary(u32 id) {
return GetRegister(Register::ZeroIndex + 1 + id);
}
@@ -378,6 +382,11 @@ void ShaderIR::SetLocalMemory(NodeBlock& bb, Node address, Node value) {
Operation(OperationCode::Assign, GetLocalMemory(std::move(address)), std::move(value)));
}
+void ShaderIR::SetSharedMemory(NodeBlock& bb, Node address, Node value) {
+ bb.push_back(
+ Operation(OperationCode::Assign, GetSharedMemory(std::move(address)), std::move(value)));
+}
+
void ShaderIR::SetTemporary(NodeBlock& bb, u32 id, Node value) {
SetRegister(bb, Register::ZeroIndex + 1 + id, std::move(value));
}
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index bcc9b79b6..6aed9bb84 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -95,7 +95,7 @@ public:
return used_samplers;
}
- const std::set<Image>& GetImages() const {
+ const std::map<u64, Image>& GetImages() const {
return used_images;
}
@@ -208,6 +208,8 @@ private:
Node GetInternalFlag(InternalFlag flag, bool negated = false);
/// Generates a node representing a local memory address
Node GetLocalMemory(Node address);
+ /// Generates a node representing a shared memory address
+ Node GetSharedMemory(Node address);
/// Generates a temporary, internally it uses a post-RZ register
Node GetTemporary(u32 id);
@@ -217,8 +219,10 @@ private:
void SetPredicate(NodeBlock& bb, u64 dest, Node src);
/// Sets an internal flag. src value must be a bool-evaluated node
void SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value);
- /// Sets a local memory address. address and value must be a number-evaluated node
+ /// Sets a local memory address with a value.
void SetLocalMemory(NodeBlock& bb, Node address, Node value);
+ /// Sets a shared memory address with a value.
+ void SetSharedMemory(NodeBlock& bb, Node address, Node value);
/// Sets a temporary. Internally it uses a post-RZ register
void SetTemporary(NodeBlock& bb, u32 id, Node value);
@@ -272,10 +276,16 @@ private:
bool is_shadow);
/// Accesses an image.
- const Image& GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type);
+ Image& GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type,
+ std::optional<Tegra::Shader::ImageAtomicSize> size = {});
/// Access a bindless image sampler.
- const Image& GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type);
+ Image& GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type,
+ std::optional<Tegra::Shader::ImageAtomicSize> size = {});
+
+ /// Tries to access an existing image, updating its state as needed
+ Image* TryUseExistingImage(u64 offset, Tegra::Shader::ImageType type,
+ std::optional<Tegra::Shader::ImageAtomicSize> size);
/// Extracts a sequence of bits from a node
Node BitfieldExtract(Node value, u32 offset, u32 bits);
@@ -356,7 +366,7 @@ private:
std::set<Tegra::Shader::Attribute::Index> used_output_attributes;
std::map<u32, ConstBuffer> used_cbufs;
std::set<Sampler> used_samplers;
- std::set<Image> used_images;
+ std::map<u64, Image> used_images;
std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory;
bool uses_layer{};
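
Since GetImages() now returns std::map<u64, Image> rather than std::set<Image>, consumers iterate key/value pairs. A self-contained sketch of that consumer-side change, using a stand-in Image type rather than the real node.h class:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <map>

    struct Image {
        std::size_t index{};
        bool is_written{};
    };

    int main() {
        const std::map<std::uint64_t, Image> used_images{
            {0, Image{0, true}},
            {(std::uint64_t{2} << 32) | 0x40, Image{1, false}},
        };
        // Structured bindings replace the old set iteration.
        for (const auto& [offset, image] : used_images) {
            std::cout << "image #" << image.index << (image.is_written ? " (written)\n" : "\n");
        }
    }
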