From 69093cf2d69490862aff974f170cee63a0016fd0 Mon Sep 17 00:00:00 2001 From: FICTURE7 Date: Sat, 9 Oct 2021 01:15:44 +0400 Subject: Optimize LSRA (#2563) * Optimize `TryAllocateRegWithoutSpill` a bit * Add a fast path for when all registers are live. * Do not query `GetOverlapPosition` if the register is already in use (i.e., free position is 0). * Do not allocate the child split list if the interval is not a parent * Turn `LiveRange` into a reference struct `LiveRange` is now a reference-wrapping struct like `Operand` and `Operation`. It has also been changed into a singly linked list. In micro-benchmarks, traversing the linked list was faster than binary search on `List`, surprisingly even for quite large input sizes (e.g., 1,000,000). This could be because the code gen for traversing the linked list is much, much cleaner and there is no virtual dispatch happening when checking whether intervals overlap. * Turn `LiveInterval` into an iterator The LSRA allocates in forward order and never inspects previous `LiveInterval`s once they have expired. Something similar can be done for the `LiveRange`s within the `LiveInterval`s themselves. The `LiveInterval` is turned into an iterator which expires the `LiveRange`s within it. The iterator is moved forward along with the interval walking code, i.e., `AllocateInterval(context, interval, cIndex)`. * Remove `LinearScanAllocator.Sources` Local functions are less likely to cause allocations than lambdas. * Optimize `GetOverlapPosition(interval)` a bit Time complexity should be O(n+m) instead of O(nm) now. * Optimize `NumberLocals` a bit Use the same idea as in `HybridAllocator` to store the visited state in the MSB of the Operand's value instead of using a `HashSet`. * Optimize `InsertSplitCopies` a bit Avoid allocating a redundant `CopyResolver`. * Optimize `InsertSplitCopiesAtEdges` a bit Avoid redundant allocations of `CopyResolver`. * Use stack allocation for `freePositions` Avoid redundant computations. 
* Add `UseList` Replace `SortedIntegerList` with an even more specialized data structure. It allocates memory from the arena allocators and does not require copying use positions when splitting it. * Turn `LiveInterval` into a reference struct `LiveInterval` is now a reference-wrapping struct like `Operand` and `Operation`. The rationale for turning this into a reference-wrapping struct is that a `LiveInterval` is associated with each local variable, and these intervals may themselves be split further. I've seen translations having up to 8000 local variables. To make the `LiveInterval` unmanaged, a new data structure called `LiveIntervalList` was added to store child splits. This differs from `SortedList<,>` because it can contain intervals with the same start position. Really wish we had something more like C++ templates in C#. :^( * Optimize `GetChildSplit` a bit No need to inspect the remaining ranges if we've reached a range which starts after the position, since the split list is ordered. * Optimize `CopyResolver` a bit Lazily allocate the fill, spill and parallel copy structures, since most of the time only one of them is needed. * Optimize `BitMap.Enumerator` a bit Marking `MoveNext` as `AggressiveInlining` allows RyuJIT to promote the `Enumerator` struct into registers completely, reducing load/store code a lot since it does not have to store the struct on the stack for ABI purposes. * Use stack allocation for `use/blockedPositions` * Optimize `AllocateWithSpill` a bit * Address feedback * Make `LiveInterval.AddRange(,)` more conservative Produces no diff against master, but just for good measure. 
--- .../CodeGen/RegisterAllocators/CopyResolver.cs | 51 +++++++++++++--------- 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'ARMeilleure/CodeGen/RegisterAllocators/CopyResolver.cs') diff --git a/ARMeilleure/CodeGen/RegisterAllocators/CopyResolver.cs b/ARMeilleure/CodeGen/RegisterAllocators/CopyResolver.cs index cc731b74..df4b6db1 100644 --- a/ARMeilleure/CodeGen/RegisterAllocators/CopyResolver.cs +++ b/ARMeilleure/CodeGen/RegisterAllocators/CopyResolver.cs @@ -1,6 +1,7 @@ using ARMeilleure.IntermediateRepresentation; using System; using System.Collections.Generic; + using static ARMeilleure.IntermediateRepresentation.Operand.Factory; using static ARMeilleure.IntermediateRepresentation.Operation.Factory; @@ -25,7 +26,7 @@ namespace ARMeilleure.CodeGen.RegisterAllocators } } - private List _copies; + private readonly List _copies; public int Count => _copies.Count; @@ -146,21 +147,12 @@ namespace ARMeilleure.CodeGen.RegisterAllocators } } - private Queue _fillQueue = new Queue(); - private Queue _spillQueue = new Queue(); - - private ParallelCopy _parallelCopy; + private Queue _fillQueue = null; + private Queue _spillQueue = null; + private ParallelCopy _parallelCopy = null; public bool HasCopy { get; private set; } - public CopyResolver() - { - _fillQueue = new Queue(); - _spillQueue = new Queue(); - - _parallelCopy = new ParallelCopy(); - } - public void AddSplit(LiveInterval left, LiveInterval right) { if (left.Local != right.Local) @@ -194,8 +186,12 @@ namespace ARMeilleure.CodeGen.RegisterAllocators private void AddSplitFill(LiveInterval left, LiveInterval right, OperandType type) { - Operand register = GetRegister(right.Register, type); + if (_fillQueue == null) + { + _fillQueue = new Queue(); + } + Operand register = GetRegister(right.Register, type); Operand offset = Const(left.SpillOffset); _fillQueue.Enqueue(Operation(Instruction.Fill, register, offset)); @@ -205,8 +201,12 @@ namespace ARMeilleure.CodeGen.RegisterAllocators private void 
AddSplitSpill(LiveInterval left, LiveInterval right, OperandType type) { - Operand offset = Const(right.SpillOffset); + if (_spillQueue == null) + { + _spillQueue = new Queue(); + } + Operand offset = Const(right.SpillOffset); Operand register = GetRegister(left.Register, type); _spillQueue.Enqueue(Operation(Instruction.Spill, default, offset, register)); @@ -216,6 +216,11 @@ namespace ARMeilleure.CodeGen.RegisterAllocators private void AddSplitCopy(LiveInterval left, LiveInterval right, OperandType type) { + if (_parallelCopy == null) + { + _parallelCopy = new ParallelCopy(); + } + _parallelCopy.AddCopy(right.Register, left.Register, type); HasCopy = true; @@ -225,16 +230,22 @@ namespace ARMeilleure.CodeGen.RegisterAllocators { List sequence = new List(); - while (_spillQueue.TryDequeue(out Operation spillOp)) + if (_spillQueue != null) { - sequence.Add(spillOp); + while (_spillQueue.TryDequeue(out Operation spillOp)) + { + sequence.Add(spillOp); + } } - _parallelCopy.Sequence(sequence); + _parallelCopy?.Sequence(sequence); - while (_fillQueue.TryDequeue(out Operation fillOp)) + if (_fillQueue != null) { - sequence.Add(fillOp); + while (_fillQueue.TryDequeue(out Operation fillOp)) + { + sequence.Add(fillOp); + } } return sequence.ToArray(); -- cgit v1.2.3