//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// This transformation implements the well known scalar replacement of
/// aggregates transformation. It tries to identify promotable elements of an
/// aggregate alloca, and promote them to registers. It will also try to
/// convert uses of an element (or set of elements) of an alloca into a vector
/// or bitfield-style integer scalar if appropriate.
///
/// It works to do this with minimal slicing of the alloca so that regions
/// which are merely transferred in and out of external memory remain unchanged
/// and are not decomposed to scalar code.
///
/// Because this also performs alloca promotion, it can be thought of as also
/// serving the purpose of SSA formation. The algorithm iterates on the
/// function until all opportunities for promotion have been realized.
///
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TimeValue.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#if __cplusplus >= 201103L && !defined(NDEBUG)
// We only use this for a debug check in C++11
#include <random>
#endif

using namespace llvm;

#define DEBUG_TYPE "sroa"
STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
STATISTIC(NumDeleted, "Number of instructions deleted");
STATISTIC(NumVectorized, "Number of vectorized aggregates");
/// Hidden option to force the pass to not use DomTree and mem2reg, instead
/// forming SSA values through the SSAUpdater infrastructure.
static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false),
                                     cl::Hidden);

/// Hidden option to enable randomly shuffling the slices to help uncover
/// instability in their order.
static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
                                             cl::init(false), cl::Hidden);

/// Hidden option to experiment with completely strict handling of inbounds
/// GEPs.
static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
                                        cl::Hidden);
namespace {
/// \brief A custom IRBuilder inserter which prefixes all names if they are
/// preserved.
template <bool preserveNames = true>
class IRBuilderPrefixedInserter
    : public IRBuilderDefaultInserter<preserveNames> {
  std::string Prefix;

public:
  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }

protected:
  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
                    BasicBlock::iterator InsertPt) const {
    IRBuilderDefaultInserter<preserveNames>::InsertHelper(
        I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt);
  }
};

// Specialization for not preserving the name is trivial.
template <>
class IRBuilderPrefixedInserter<false>
    : public IRBuilderDefaultInserter<false> {
public:
  void SetNamePrefix(const Twine &P) {}
};

/// \brief Provide a typedef for IRBuilder that drops names in release builds.
#ifndef NDEBUG
typedef llvm::IRBuilder<true, ConstantFolder, IRBuilderPrefixedInserter<true>>
    IRBuilderTy;
#else
typedef llvm::IRBuilder<false, ConstantFolder, IRBuilderPrefixedInserter<false>>
    IRBuilderTy;
#endif
}
namespace {
/// \brief A used slice of an alloca.
///
/// This structure represents a slice of an alloca used by some instruction. It
/// stores both the begin and end offsets of this use, a pointer to the use
/// itself, and a flag indicating whether we can classify the use as splittable
/// or not when forming partitions of the alloca.
class Slice {
  /// \brief The beginning offset of the range.
  uint64_t BeginOffset;

  /// \brief The ending offset, not included in the range.
  uint64_t EndOffset;

  /// \brief Storage for both the use of this slice and whether it can be
  /// split.
  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;

public:
  Slice() : BeginOffset(), EndOffset() {}
  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
      : BeginOffset(BeginOffset), EndOffset(EndOffset),
        UseAndIsSplittable(U, IsSplittable) {}

  uint64_t beginOffset() const { return BeginOffset; }
  uint64_t endOffset() const { return EndOffset; }

  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }

  Use *getUse() const { return UseAndIsSplittable.getPointer(); }

  bool isDead() const { return getUse() == nullptr; }
  void kill() { UseAndIsSplittable.setPointer(nullptr); }
  /// \brief Support for ordering ranges.
  ///
  /// This provides an ordering over ranges such that start offsets are
  /// always increasing, and within equal start offsets, the end offsets are
  /// decreasing. Thus the spanning range comes first in a cluster with the
  /// same start position.
  bool operator<(const Slice &RHS) const {
    if (beginOffset() < RHS.beginOffset())
      return true;
    if (beginOffset() > RHS.beginOffset())
      return false;
    if (isSplittable() != RHS.isSplittable())
      return !isSplittable();
    if (endOffset() > RHS.endOffset())
      return true;
    return false;
  }

  /// \brief Support comparison with a single offset to allow binary searches.
  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
                                              uint64_t RHSOffset) {
    return LHS.beginOffset() < RHSOffset;
  }
  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
                                              const Slice &RHS) {
    return LHSOffset < RHS.beginOffset();
  }

  bool operator==(const Slice &RHS) const {
    return isSplittable() == RHS.isSplittable() &&
           beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
  }
  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
};
} // end anonymous namespace

namespace llvm {
template <typename T> struct isPodLike;
template <> struct isPodLike<Slice> { static const bool value = true; };
}
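// Illustrative example of the slice ordering above (hypothetical offsets, not
// taken from any particular input): given slices [0,8) unsplittable,
// [0,16) splittable, and [4,12) splittable, std::sort orders them
// [0,8), [0,16), [4,12) -- with equal begin offsets the unsplittable slice
// sorts first, and otherwise slices sort by increasing begin offset, so the
// slice spanning a cluster always leads it.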
/// \brief Representation of the alloca slices.
///
/// This class represents the slices of an alloca which are formed by its
/// various uses. If a pointer escapes, we can't fully build a representation
/// for the slices used and we reflect that in this structure. The uses are
/// stored, sorted by increasing beginning offset and with unsplittable slices
/// starting at a particular offset before splittable slices.
class AllocaSlices {
public:
  /// \brief Construct the slices of a particular alloca.
  AllocaSlices(const DataLayout &DL, AllocaInst &AI);

  /// \brief Test whether a pointer to the allocation escapes our analysis.
  ///
  /// If this is true, the slices are never fully built and should be
  /// ignored.
  bool isEscaped() const { return PointerEscapingInstr; }

  /// \brief Support for iterating over the slices.
  /// @{
  typedef SmallVectorImpl<Slice>::iterator iterator;
  typedef iterator_range<iterator> range;
  iterator begin() { return Slices.begin(); }
  iterator end() { return Slices.end(); }

  typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
  typedef iterator_range<const_iterator> const_range;
  const_iterator begin() const { return Slices.begin(); }
  const_iterator end() const { return Slices.end(); }
  /// @}

  /// \brief Erase a range of slices.
  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }

  /// \brief Insert new slices for this alloca.
  ///
  /// This moves the slices into the alloca's slices collection, and re-sorts
  /// everything so that the usual ordering properties of the alloca's slices
  /// hold.
  void insert(ArrayRef<Slice> NewSlices) {
    int OldSize = Slices.size();
    std::move(NewSlices.begin(), NewSlices.end(), std::back_inserter(Slices));
    auto SliceI = Slices.begin() + OldSize;
    std::sort(SliceI, Slices.end());
    std::inplace_merge(Slices.begin(), SliceI, Slices.end());
  }
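  // Illustrative example (hypothetical offsets): if Slices already holds the
  // sorted slices [0,4) and [8,12) and NewSlices holds [2,6), the appended
  // tail is sorted in isolation and std::inplace_merge then re-establishes the
  // global order, leaving [0,4), [2,6), [8,12) without re-sorting the existing
  // prefix.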
  // Forward declare an iterator to befriend it.
  class partition_iterator;

public:
  /// \brief A partition of the slices.
  ///
  /// An ephemeral representation for a range of slices which can be viewed as
  /// a partition of the alloca. This range represents a span of the alloca's
  /// memory which cannot be split, and provides access to all of the slices
  /// overlapping some part of the partition.
  ///
  /// Objects of this type are produced by traversing the alloca's slices, but
  /// are only ephemeral and not persistent.
  class Partition {
  private:
    friend class AllocaSlices;
    friend class AllocaSlices::partition_iterator;

    /// \brief The beginning and ending offsets of the alloca for this
    /// partition.
    uint64_t BeginOffset, EndOffset;

    /// \brief The start and end iterators of this partition.
    iterator SI, SJ;

    /// \brief A collection of split slice tails overlapping the partition.
    SmallVector<Slice *, 4> SplitTails;

    /// \brief Raw constructor builds an empty partition starting and ending at
    /// the given iterator.
    Partition(iterator SI) : SI(SI), SJ(SI) {}

  public:
    /// \brief The start offset of this partition.
    ///
    /// All of the contained slices start at or after this offset.
    uint64_t beginOffset() const { return BeginOffset; }

    /// \brief The end offset of this partition.
    ///
    /// All of the contained slices end at or before this offset.
    uint64_t endOffset() const { return EndOffset; }

    /// \brief The size of the partition.
    ///
    /// Note that this can never be zero.
    uint64_t size() const {
      assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
      return EndOffset - BeginOffset;
    }

    /// \brief Test whether this partition contains no slices, and merely spans
    /// a region occupied by split slices.
    bool empty() const { return SI == SJ; }

    /// \name Iterate slices that start within the partition.
    /// These may be splittable or unsplittable. They have a begin offset >= the
    /// partition begin offset.
    /// @{
    // FIXME: We should probably define a "concat_iterator" helper and use that
    // to stitch together pointee_iterators over the split tails and the
    // contiguous iterators of the partition. That would give a much nicer
    // interface here. We could then additionally expose filtered iterators for
    // split, unsplit, and unsplittable slices based on the usage patterns.
    iterator begin() const { return SI; }
    iterator end() const { return SJ; }
    /// @}

    /// \brief Get the sequence of split slice tails.
    ///
    /// These tails are of slices which start before this partition but are
    /// split and overlap into the partition. We accumulate these while forming
    /// partitions.
    ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
  };
  /// \brief An iterator over partitions of the alloca's slices.
  ///
  /// This iterator implements the core algorithm for partitioning the alloca's
  /// slices. It is a forward iterator as we don't support backtracking for
  /// efficiency reasons, and re-use a single storage area to maintain the
  /// current set of split slices.
  ///
  /// It is templated on the slice iterator type to use so that it can operate
  /// with either const or non-const slice iterators.
  class partition_iterator
      : public iterator_facade_base<partition_iterator,
                                    std::forward_iterator_tag, Partition> {
    friend class AllocaSlices;

    /// \brief Most of the state for walking the partitions is held in a class
    /// with a nice interface for examining them.
    Partition P;

    /// \brief We need to keep the end of the slices to know when to stop.
    AllocaSlices::iterator SE;

    /// \brief We also need to keep track of the maximum split end offset seen.
    /// FIXME: Do we really?
    uint64_t MaxSplitSliceEndOffset;

    /// \brief Sets the partition to be empty at given iterator, and sets the
    /// end iterator.
    partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
        : P(SI), SE(SE), MaxSplitSliceEndOffset(0) {
      // If not already at the end, advance our state to form the initial
      // partition.
      if (SI != SE)
        advance();
    }
    /// \brief Advance the iterator to the next partition.
    ///
    /// Requires that the iterator not be at the end of the slices.
    void advance() {
      assert((P.SI != SE || !P.SplitTails.empty()) &&
             "Cannot advance past the end of the slices!");

      // Clear out any split uses which have ended.
      if (!P.SplitTails.empty()) {
        if (P.EndOffset >= MaxSplitSliceEndOffset) {
          // If we've finished all splits, this is easy.
          P.SplitTails.clear();
          MaxSplitSliceEndOffset = 0;
        } else {
          // Remove the uses which have ended in the prior partition. This
          // cannot change the max split slice end because we just checked that
          // the prior partition ended prior to that max.
          P.SplitTails.erase(
              std::remove_if(
                  P.SplitTails.begin(), P.SplitTails.end(),
                  [&](Slice *S) { return S->endOffset() <= P.EndOffset; }),
              P.SplitTails.end());
          assert(std::any_of(P.SplitTails.begin(), P.SplitTails.end(),
                             [&](Slice *S) {
                               return S->endOffset() == MaxSplitSliceEndOffset;
                             }) &&
                 "Could not find the current max split slice offset!");
          assert(std::all_of(P.SplitTails.begin(), P.SplitTails.end(),
                             [&](Slice *S) {
                               return S->endOffset() <= MaxSplitSliceEndOffset;
                             }) &&
                 "Max split slice end offset is not actually the max!");
        }
      }

      // If P.SI is already at the end, then we've cleared the split tail and
      // now have an end iterator.
      if (P.SI == SE) {
        assert(P.SplitTails.empty() && "Failed to clear the split slices!");
        return;
      }

      // If we had a non-empty partition previously, set up the state for
      // subsequent partitions.
      if (P.SI != P.SJ) {
        // Accumulate all the splittable slices which started in the old
        // partition into the split list.
        for (Slice &S : P)
          if (S.isSplittable() && S.endOffset() > P.EndOffset) {
            P.SplitTails.push_back(&S);
            MaxSplitSliceEndOffset =
                std::max(S.endOffset(), MaxSplitSliceEndOffset);
          }

        // Start from the end of the previous partition.
        P.SI = P.SJ;

        // If P.SI is now at the end, we at most have a tail of split slices.
        if (P.SI == SE) {
          P.BeginOffset = P.EndOffset;
          P.EndOffset = MaxSplitSliceEndOffset;
          return;
        }

        // If we have split slices and the next slice is after a gap and is
        // not splittable immediately form an empty partition for the split
        // slices up until the next slice begins.
        if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
            !P.SI->isSplittable()) {
          P.BeginOffset = P.EndOffset;
          P.EndOffset = P.SI->beginOffset();
          return;
        }
      }

      // OK, we need to consume new slices. Set the end offset based on the
      // current slice, and step SJ past it. The beginning offset of the
      // partition is the beginning offset of the next slice unless we have
      // pre-existing split slices that are continuing, in which case we begin
      // at the prior end offset.
      P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
      P.EndOffset = P.SI->endOffset();
      ++P.SJ;

      // There are two strategies to form a partition based on whether the
      // partition starts with an unsplittable slice or a splittable slice.
      if (!P.SI->isSplittable()) {
        // When we're forming an unsplittable region, it must always start at
        // the first slice and will extend through its end.
        assert(P.BeginOffset == P.SI->beginOffset());

        // Form a partition including all of the overlapping slices with this
        // unsplittable slice.
        while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
          if (!P.SJ->isSplittable())
            P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
          ++P.SJ;
        }

        // We have a partition across a set of overlapping unsplittable
        // slices.
        return;
      }

      // If we're starting with a splittable slice, then we need to form
      // a synthetic partition spanning it and any other overlapping splittable
      // slices.
      assert(P.SI->isSplittable() && "Forming a splittable partition!");

      // Collect all of the overlapping splittable slices.
      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
             P.SJ->isSplittable()) {
        P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
        ++P.SJ;
      }

      // Back up P.EndOffset if we ended the span early when encountering an
      // unsplittable slice. This synthesizes the early end offset of
      // a partition spanning only splittable slices.
      if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
        assert(!P.SJ->isSplittable());
        P.EndOffset = P.SJ->beginOffset();
      }
    }
  public:
    bool operator==(const partition_iterator &RHS) const {
      assert(SE == RHS.SE &&
             "End iterators don't match between compared partition iterators!");

      // The observed positions of partitions are marked by the P.SI iterator
      // and the emptiness of the split slices. The latter is only relevant
      // when P.SI == SE, as the end iterator will additionally have an empty
      // split slices list, but the prior may have the same P.SI and a tail of
      // split slices.
      if (P.SI == RHS.P.SI &&
          P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
        assert(P.SJ == RHS.P.SJ &&
               "Same set of slices formed two different sized partitions!");
        assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
               "Same slice position with differently sized non-empty split "
               "slice tails!");
        return true;
      }
      return false;
    }

    partition_iterator &operator++() {
      advance();
      return *this;
    }

    Partition &operator*() { return P; }
  };
  /// \brief A forward range over the partitions of the alloca's slices.
  ///
  /// This accesses an iterator range over the partitions of the alloca's
  /// slices. It computes these partitions on the fly based on the overlapping
  /// offsets of the slices and the ability to split them. It will visit "empty"
  /// partitions to cover regions of the alloca only accessed via split
  /// slices.
  iterator_range<partition_iterator> partitions() {
    return make_range(partition_iterator(begin(), end()),
                      partition_iterator(end(), end()));
  }
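  // Illustrative example of the partitioning (hypothetical slices): for the
  // sorted slices [0,8) unsplittable, [0,16) splittable, and [12,16)
  // unsplittable, the iterator produces three partitions: [0,8) containing the
  // first two slices, an "empty" partition [8,12) covered only by the split
  // tail of the [0,16) slice, and [12,16) containing the last slice together
  // with that split tail.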
  /// \brief Access the dead users for this alloca.
  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }

  /// \brief Access the dead operands referring to this alloca.
  ///
  /// These are operands which cannot actually be used to refer to the
  /// alloca as they are outside its range and the user doesn't correct for
  /// that. These mostly consist of PHI node inputs and the like which we just
  /// need to replace with undef.
  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
  void printSlice(raw_ostream &OS, const_iterator I,
                  StringRef Indent = "  ") const;
  void printUse(raw_ostream &OS, const_iterator I,
                StringRef Indent = "  ") const;
  void print(raw_ostream &OS) const;
  void dump(const_iterator I) const;
  void dump() const;
#endif
private:
  template <typename DerivedT, typename RetT = void> class BuilderBase;
  class SliceBuilder;
  friend class AllocaSlices::SliceBuilder;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// \brief Handle to alloca instruction to simplify method interfaces.
  AllocaInst &AI;
#endif

  /// \brief The instruction responsible for this alloca not having a known set
  /// of slices.
  ///
  /// When an instruction (potentially) escapes the pointer to the alloca, we
  /// store a pointer to that here and abort trying to form slices of the
  /// alloca. This will be null if the alloca slices are analyzed successfully.
  Instruction *PointerEscapingInstr;

  /// \brief The slices of the alloca.
  ///
  /// We store a vector of the slices formed by uses of the alloca here. This
  /// vector is sorted by increasing begin offset, and then the unsplittable
  /// slices before the splittable ones. See the Slice inner class for more
  /// details.
  SmallVector<Slice, 8> Slices;

  /// \brief Instructions which will become dead if we rewrite the alloca.
  ///
  /// Note that these are not separated by slice. This is because we expect an
  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
  /// all these instructions can simply be removed and replaced with undef as
  /// they come from outside of the allocated space.
  SmallVector<Instruction *, 8> DeadUsers;

  /// \brief Operands which will become dead if we rewrite the alloca.
  ///
  /// These are operands that in their particular use can be replaced with
  /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
  /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
  /// want to swap this particular input for undef to simplify the use lists of
  /// the alloca.
  SmallVector<Use *, 8> DeadOperands;
};
static Value *foldSelectInst(SelectInst &SI) {
  // If the condition being selected on is a constant or the same value is
  // being selected between, fold the select. Yes this does (rarely) happen.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
    return SI.getOperand(1 + CI->isZero());
  if (SI.getOperand(1) == SI.getOperand(2))
    return SI.getOperand(1);

  return nullptr;
}

/// \brief A helper that folds a PHI node or a select.
static Value *foldPHINodeOrSelectInst(Instruction &I) {
  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
    // If PN merges together the same value, return that value.
    return PN->hasConstantValue();
  }
  return foldSelectInst(cast<SelectInst>(I));
}
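// Illustrative examples of the folds above (hypothetical IR): "select i1 true,
// i32* %a, i32* %b" folds to %a, "select i1 false, i32* %a, i32* %b" folds to
// %b, and "select i1 %c, i32* %a, i32* %a" folds to %a. For a PHI whose every
// incoming value is %a, hasConstantValue() returns %a.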
/// \brief Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
/// of an alloca and making a slice for each load and store at each offset.
class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
  friend class PtrUseVisitor<SliceBuilder>;
  friend class InstVisitor<SliceBuilder>;
  typedef PtrUseVisitor<SliceBuilder> Base;

  const uint64_t AllocSize;
  AllocaSlices &AS;

  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;

  /// \brief Set to de-duplicate dead instructions found in the use walk.
  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;

public:
  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
      : PtrUseVisitor<SliceBuilder>(DL),
        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}

private:
  void markAsDead(Instruction &I) {
    if (VisitedDeadInsts.insert(&I).second)
      AS.DeadUsers.push_back(&I);
  }
  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
                 bool IsSplittable = false) {
    // Completely skip uses which have a zero size or start either before or
    // past the end of the allocation.
    if (Size == 0 || Offset.uge(AllocSize)) {
      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
                   << " which has zero size or starts outside of the "
                   << AllocSize << " byte alloca:\n"
                   << "    alloca: " << AS.AI << "\n"
                   << "       use: " << I << "\n");
      return markAsDead(I);
    }

    uint64_t BeginOffset = Offset.getZExtValue();
    uint64_t EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    // This may appear superficially to be something we could ignore entirely,
    // but that is not so! There may be widened loads or PHI-node uses where
    // some instructions are dead but not others. We can't completely ignore
    // them, and so have to record at least the information here.
    assert(AllocSize >= BeginOffset); // Established above.
    if (Size > AllocSize - BeginOffset) {
      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
                   << " to remain within the " << AllocSize << " byte alloca:\n"
                   << "    alloca: " << AS.AI << "\n"
                   << "       use: " << I << "\n");
      EndOffset = AllocSize;
    }

    AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
  }
  void visitBitCastInst(BitCastInst &BC) {
    if (BC.use_empty())
      return markAsDead(BC);

    return Base::visitBitCastInst(BC);
  }

  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    if (GEPI.use_empty())
      return markAsDead(GEPI);

    if (SROAStrictInbounds && GEPI.isInBounds()) {
      // FIXME: This is a manually un-factored variant of the basic code inside
      // of GEPs with checking of the inbounds invariant specified in the
      // langref in a very strict sense. If we ever want to enable
      // SROAStrictInbounds, this code should be factored cleanly into
      // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
      // by writing out the code here where we have the underlying allocation
      // size readily available.
      APInt GEPOffset = Offset;
      for (gep_type_iterator GTI = gep_type_begin(GEPI),
                             GTE = gep_type_end(GEPI);
           GTI != GTE; ++GTI) {
        ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
        if (!OpC)
          break;

        // Handle a struct index, which adds its field offset to the pointer.
        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
          unsigned ElementIdx = OpC->getZExtValue();
          const StructLayout *SL = DL.getStructLayout(STy);
          GEPOffset +=
              APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
        } else {
          // For array or vector indices, scale the index by the size of the
          // type.
          APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
          GEPOffset += Index * APInt(Offset.getBitWidth(),
                                     DL.getTypeAllocSize(GTI.getIndexedType()));
        }

        // If this index has computed an intermediate pointer which is not
        // inbounds, then the result of the GEP is a poison value and we can
        // delete it and all uses.
        if (GEPOffset.ugt(AllocSize))
          return markAsDead(GEPI);
      }
    }

    return Base::visitGetElementPtrInst(GEPI);
  }
  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                         uint64_t Size, bool IsVolatile) {
    // We allow splitting of non-volatile loads and stores where the type is an
    // integer type. These may be used to implement 'memcpy' or other "transfer
    // of bits" patterns.
    bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;

    insertUse(I, Offset, Size, IsSplittable);
  }

  void visitLoadInst(LoadInst &LI) {
    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
           "All simple FCA loads should have been pre-split");

    if (!IsOffsetKnown)
      return PI.setAborted(&LI);

    uint64_t Size = DL.getTypeStoreSize(LI.getType());
    return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
  }
  void visitStoreInst(StoreInst &SI) {
    Value *ValOp = SI.getValueOperand();
    if (ValOp == *U)
      return PI.setEscapedAndAborted(&SI);
    if (!IsOffsetKnown)
      return PI.setAborted(&SI);

    uint64_t Size = DL.getTypeStoreSize(ValOp->getType());

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse. We also try to handle cases which might run the
    // risk of overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
    if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
      DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
                   << " which extends past the end of the " << AllocSize
                   << " byte alloca:\n"
                   << "    alloca: " << AS.AI << "\n"
                   << "       use: " << SI << "\n");
      return markAsDead(SI);
    }

    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
           "All simple FCA stores should have been pre-split");
    handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
  }
  void visitMemSetInst(MemSetInst &II) {
    assert(II.getRawDest() == *U && "Pointer use is not the destination?");
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if ((Length && Length->getValue() == 0) ||
        (IsOffsetKnown && Offset.uge(AllocSize)))
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return markAsDead(II);

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    insertUse(II, Offset, Length ? Length->getLimitedValue()
                                 : AllocSize - Offset.getLimitedValue(),
              (bool)Length);
  }
  void visitMemTransferInst(MemTransferInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if (Length && Length->getValue() == 0)
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return markAsDead(II);

    // Because we can visit these intrinsics twice, also check to see if the
    // first time marked this instruction as dead. If so, skip it.
    if (VisitedDeadInsts.count(&II))
      return;

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    // This side of the transfer is completely out-of-bounds, and so we can
    // nuke the entire transfer. However, we also need to nuke the other side
    // if already added to our partitions.
    // FIXME: Yet another place we really should bypass this when
    // instrumenting for ASan.
    if (Offset.uge(AllocSize)) {
      SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
          MemTransferSliceMap.find(&II);
      if (MTPI != MemTransferSliceMap.end())
        AS.Slices[MTPI->second].kill();
      return markAsDead(II);
    }

    uint64_t RawOffset = Offset.getLimitedValue();
    uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;

    // Check for the special case where the same exact value is used for both
    // the source and dest.
    if (*U == II.getRawDest() && *U == II.getRawSource()) {
      // For non-volatile transfers this is a no-op.
      if (!II.isVolatile())
        return markAsDead(II);

      return insertUse(II, Offset, Size, /*IsSplittable=*/false);
    }

    // If we have seen both source and destination for a mem transfer, then
    // they both point to the same alloca.
    bool Inserted;
    SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
    std::tie(MTPI, Inserted) =
        MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
    unsigned PrevIdx = MTPI->second;
    if (!Inserted) {
      Slice &PrevP = AS.Slices[PrevIdx];

      // Check if the begin offsets match and this is a non-volatile transfer.
      // In that case, we can completely elide the transfer.
      if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
        PrevP.kill();
        return markAsDead(II);
      }

      // Otherwise we have an offset transfer within the same alloca. We can't
      // split those.
      PrevP.makeUnsplittable();
    }

    // Insert the use now that we've fixed up the splittable nature.
    insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);

    // Check that we ended up with a valid index in the map.
    assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
           "Map index doesn't point back to a slice with this user.");
  }
  // Disable SRoA for any intrinsics except for lifetime invariants.
  // FIXME: What about debug intrinsics? This matches old behavior, but
  // doesn't make sense.
  void visitIntrinsicInst(IntrinsicInst &II) {
    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
        II.getIntrinsicID() == Intrinsic::lifetime_end) {
      ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
      uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
                               Length->getLimitedValue());
      insertUse(II, Offset, Size, true);
      return;
    }

    Base::visitIntrinsicInst(II);
  }
  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
    // We consider any PHI or select that results in a direct load or store of
    // the same offset to be a viable use for slicing purposes. These uses
    // are considered unsplittable and the size is the maximum loaded or stored
    // size.
    SmallPtrSet<Instruction *, 4> Visited;
    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
    Visited.insert(Root);
    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
    // If there are no loads or stores, the access is dead. We mark that as
    // a size zero access.
    Size = 0;
    do {
      Instruction *I, *UsedI;
      std::tie(UsedI, I) = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        Value *Op = SI->getOperand(0);
        if (Op == UsedI)
          return SI;
        Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
        continue;
      }

      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
        if (!GEP->hasAllZeroIndices())
          return GEP;
      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
                 !isa<SelectInst>(I)) {
        return I;
      }

      for (User *U : I->users())
        if (Visited.insert(cast<Instruction>(U)).second)
          Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
    } while (!Uses.empty());

    return nullptr;
  }
  void visitPHINodeOrSelectInst(Instruction &I) {
    assert(isa<PHINode>(I) || isa<SelectInst>(I));
    if (I.use_empty())
      return markAsDead(I);

    // TODO: We could use SimplifyInstruction here to fold PHINodes and
    // SelectInsts. However, doing so requires to change the current
    // dead-operand-tracking mechanism. For instance, suppose neither loading
    // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
    // trap either. However, if we simply replace %U with undef using the
    // current dead-operand-tracking mechanism, "load (select undef, undef,
    // %other)" may trap because the select may return the first operand
    // "undef".
    if (Value *Result = foldPHINodeOrSelectInst(I)) {
      if (Result == *U)
        // If the result of the constant fold will be the pointer, recurse
        // through the PHI/select as if we had RAUW'ed it.
        enqueueUsers(I);
      else
        // Otherwise the operand to the PHI/select is dead, and we can replace
        // it with undef.
        AS.DeadOperands.push_back(U);

      return;
    }

    if (!IsOffsetKnown)
      return PI.setAborted(&I);

    // See if we already have computed info on this node.
    uint64_t &Size = PHIOrSelectSizes[&I];
    if (!Size) {
      // This is a new PHI/Select, check for an unsafe use of it.
      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
        return PI.setAborted(UnsafeI);
    }

    // For PHI and select operands outside the alloca, we can't nuke the entire
    // phi or select -- the other side might still be relevant, so we special
    // case them here and use a separate structure to track the operands
    // themselves which should be replaced with undef.
    // FIXME: This should instead be escaped in the event we're instrumenting
    // for address sanitization.
    if (Offset.uge(AllocSize)) {
      AS.DeadOperands.push_back(U);
      return;
    }

    insertUse(I, Offset, Size);
  }

  void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }

  void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }

  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
  void visitInstruction(Instruction &I) { PI.setAborted(&I); }
};
AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
    :
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      AI(AI),
#endif
      PointerEscapingInstr(nullptr) {
  SliceBuilder PB(DL, AI, *this);
  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
  if (PtrI.isEscaped() || PtrI.isAborted()) {
    // FIXME: We should sink the escape vs. abort info into the caller nicely,
    // possibly by just storing the PtrInfo in the AllocaSlices.
    PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
                                                  : PtrI.getAbortingInst();
    assert(PointerEscapingInstr && "Did not track a bad instruction");
    return;
  }

  Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
                              [](const Slice &S) { return S.isDead(); }),
               Slices.end());

#if __cplusplus >= 201103L && !defined(NDEBUG)
  if (SROARandomShuffleSlices) {
    std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec()));
    std::shuffle(Slices.begin(), Slices.end(), MT);
  }
#endif

  // Sort the uses. This arranges for the offsets to be in ascending order,
  // and the sizes to be in descending order.
  std::sort(Slices.begin(), Slices.end());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

void AllocaSlices::print(raw_ostream &OS, const_iterator I,
                         StringRef Indent) const {
  printSlice(OS, I, Indent);
  OS << "\n";
  printUse(OS, I, Indent);
}

void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
                              StringRef Indent) const {
  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
     << " slice #" << (I - begin())
     << (I->isSplittable() ? " (splittable)" : "");
}

void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
                            StringRef Indent) const {
  OS << Indent << "  used by: " << *I->getUse()->getUser() << "\n";
}

void AllocaSlices::print(raw_ostream &OS) const {
  if (PointerEscapingInstr) {
    OS << "Can't analyze slices for alloca: " << AI << "\n"
       << "  A pointer to this alloca escaped by:\n"
       << "  " << *PointerEscapingInstr << "\n";
    return;
  }

  OS << "Slices of alloca: " << AI << "\n";
  for (const_iterator I = begin(), E = end(); I != E; ++I)
    print(OS, I);
}

LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
  print(dbgs(), I);
}
LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }

#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
namespace {
/// \brief Implementation of LoadAndStorePromoter for promoting allocas.
///
/// This subclass of LoadAndStorePromoter adds overrides to handle promoting
/// the loads and stores of an alloca instruction, as well as updating its
/// debug information. This is used when a domtree is unavailable and thus
/// mem2reg in its full form can't be used to handle promotion of allocas to
/// scalar values.
class AllocaPromoter : public LoadAndStorePromoter {
  AllocaInst &AI;
  DIBuilder &DIB;

  SmallVector<DbgDeclareInst *, 4> DDIs;
  SmallVector<DbgValueInst *, 4> DVIs;

public:
  AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S,
                 AllocaInst &AI, DIBuilder &DIB)
      : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}

  void run(const SmallVectorImpl<Instruction *> &Insts) {
    // Retain the debug information attached to the alloca for use when
    // rewriting loads and stores.
    if (auto *L = LocalAsMetadata::getIfExists(&AI)) {
      if (auto *DebugNode = MetadataAsValue::getIfExists(AI.getContext(), L)) {
        for (User *U : DebugNode->users())
          if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
            DDIs.push_back(DDI);
          else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
            DVIs.push_back(DVI);
      }
    }

    LoadAndStorePromoter::run(Insts);

    // While we have the debug information, clear it off of the alloca. The
    // caller takes care of deleting the alloca.
    while (!DDIs.empty())
      DDIs.pop_back_val()->eraseFromParent();
    while (!DVIs.empty())
      DVIs.pop_back_val()->eraseFromParent();
  }

  bool
  isInstInList(Instruction *I,
               const SmallVectorImpl<Instruction *> &Insts) const override {
    Value *Ptr;
    if (LoadInst *LI = dyn_cast<LoadInst>(I))
      Ptr = LI->getOperand(0);
    else
      Ptr = cast<StoreInst>(I)->getPointerOperand();

    // Only used to detect cycles, which will be rare and quickly found as
    // we're walking up a chain of defs rather than down through uses.
    SmallPtrSet<Value *, 4> Visited;

    do {
      if (Ptr == &AI)
        return true;

      if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr))
        Ptr = BCI->getOperand(0);
      else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
        Ptr = GEPI->getPointerOperand();
      else
        return false;
    } while (Visited.insert(Ptr).second);

    return false;
  }

  void updateDebugInfo(Instruction *Inst) const override {
    for (DbgDeclareInst *DDI : DDIs)
      if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
        ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
      else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
        ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
    for (DbgValueInst *DVI : DVIs) {
      Value *Arg = nullptr;
      if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
        // If an argument is zero extended then use argument directly. The ZExt
        // may be zapped by an optimization pass in future.
        if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
          Arg = dyn_cast<Argument>(ZExt->getOperand(0));
        else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
          Arg = dyn_cast<Argument>(SExt->getOperand(0));
        if (!Arg)
          Arg = SI->getValueOperand();
      } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
        Arg = LI->getPointerOperand();
      } else {
        continue;
      }
      Instruction *DbgVal =
          DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
                                      DIExpression(DVI->getExpression()), Inst);
      DbgVal->setDebugLoc(DVI->getDebugLoc());
    }
  }
};
} // end anon namespace
namespace {
/// \brief An optimization pass providing Scalar Replacement of Aggregates.
///
/// This pass takes allocations which can be completely analyzed (that is, they
/// don't escape) and tries to turn them into scalar SSA values. There are
/// a few steps to this process.
///
/// 1) It takes allocations of aggregates and analyzes the ways in which they
///    are used to try to split them into smaller allocations, ideally of
///    a single scalar data type. It will split up memcpy and memset accesses
///    as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
///    promotion. This can be replacing a memset with a scalar store of an
///    integer value, or it can involve speculating operations on a PHI or
///    select to be a PHI or select of the results.
/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
///    onto insert and extract operations on a vector value, and convert them to
///    this form. By doing so, it will enable promotion of vector aggregates to
///    SSA vector values.
class SROA : public FunctionPass {
  const bool RequiresDomTree;

  LLVMContext *C;
  const DataLayout *DL;
  DominatorTree *DT;
  AssumptionCache *AC;

  /// \brief Worklist of alloca instructions to simplify.
  ///
  /// Each alloca in the function is added to this. Each new alloca formed gets
  /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if not
  /// already present to ensure it is re-visited.
  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> Worklist;

  /// \brief A collection of instructions to delete.
  /// We try to batch deletions to simplify code and make things a bit more
  /// efficient.
  SetVector<Instruction *, SmallVector<Instruction *, 8>> DeadInsts;

  /// \brief Post-promotion worklist.
  ///
  /// Sometimes we discover an alloca which has a high probability of becoming
  /// viable for SROA after a round of promotion takes place. In those cases,
  /// the alloca is enqueued here for re-processing.
  ///
  /// Note that we have to be very careful to clear allocas out of this list in
  /// the event they are deleted.
  SetVector<AllocaInst *, SmallVector<AllocaInst *, 16>> PostPromotionWorklist;

  /// \brief A collection of alloca instructions we can directly promote.
  std::vector<AllocaInst *> PromotableAllocas;

  /// \brief A worklist of PHIs to speculate prior to promoting allocas.
  ///
  /// All of these PHIs have been checked for the safety of speculation and by
  /// being speculated will allow promoting allocas currently in the promotable
  /// queue.
  SetVector<PHINode *, SmallVector<PHINode *, 2>> SpeculatablePHIs;

  /// \brief A worklist of select instructions to speculate prior to promoting
  /// allocas.
  ///
  /// All of these select instructions have been checked for the safety of
  /// speculation and by being speculated will allow promoting allocas
  /// currently in the promotable queue.
  SetVector<SelectInst *, SmallVector<SelectInst *, 2>> SpeculatableSelects;
public:
  SROA(bool RequiresDomTree = true)
      : FunctionPass(ID), RequiresDomTree(RequiresDomTree), C(nullptr),
        DL(nullptr), DT(nullptr) {
    initializeSROAPass(*PassRegistry::getPassRegistry());
  }
  bool runOnFunction(Function &F) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  const char *getPassName() const override { return "SROA"; }
  static char ID;
private:
  friend class PHIOrSelectSpeculator;
  friend class AllocaSliceRewriter;

  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
  bool rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                        AllocaSlices::Partition &P);
  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
  bool runOnAlloca(AllocaInst &AI);
  void clobberUse(Use &U);
  void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
  bool promoteAllocas(Function &F);
};
}

char SROA::ID = 0;

FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
  return new SROA(RequiresDomTree);
}

INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", false,
                    false)
/// Walk the range of a partitioning looking for a common type to cover this
/// sequence of slices.
static Type *findCommonType(AllocaSlices::const_iterator B,
                            AllocaSlices::const_iterator E,
                            uint64_t EndOffset) {
  Type *Ty = nullptr;
  bool TyIsCommon = true;
  IntegerType *ITy = nullptr;

  // Note that we need to look at *every* alloca slice's Use to ensure we
  // always get consistent results regardless of the order of slices.
  for (AllocaSlices::const_iterator I = B; I != E; ++I) {
    Use *U = I->getUse();
    if (isa<IntrinsicInst>(*U->getUser()))
      continue;
    if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
      continue;

    Type *UserTy = nullptr;
    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
      UserTy = LI->getType();
    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
      UserTy = SI->getValueOperand()->getType();
    }

    if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
      // If the type is larger than the partition, skip it. We only encounter
      // this for split integer operations where we want to use the type of the
      // entity causing the split. Also skip if the type is not a byte width
      // multiple.
      if (UserITy->getBitWidth() % 8 != 0 ||
          UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
        continue;

      // Track the largest bitwidth integer type used in this way in case there
      // is no common type.
      if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
        ITy = UserITy;
    }

    // To avoid depending on the order of slices, Ty and TyIsCommon must not
    // depend on types skipped above.
    if (!UserTy || (Ty && Ty != UserTy))
      TyIsCommon = false; // Give up on anything but an iN type.
    else
      Ty = UserTy;
  }

  return TyIsCommon ? Ty : ITy;
}
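// Illustrative example (hypothetical users): if a partition [0,4) is loaded as
// i32 in one place and stored as float in another, no common type exists and
// the function falls back to the widest byte-width integer type seen (i32
// here); if every covering load and store used i32, i32 would be returned as
// the common type.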
/// PHI instructions that use an alloca and are subsequently loaded can be
/// rewritten to load both input pointers in the pred blocks and then PHI the
/// results, allowing the load of the alloca to be promoted.
/// From this:
///   %P2 = phi [i32* %Alloca, i32* %Other]
///   %V = load i32* %P2
/// to:
///   %V1 = load i32* %Alloca      -> will be mem2reg'd
///   ...
///   %V2 = load i32* %Other
///   ...
///   %V = phi [i32 %V1, i32 %V2]
///
/// We can do this to a select if its only uses are loads and if the operands
/// to the select can be loaded unconditionally.
///
/// FIXME: This should be hoisted into a generic utility, likely in
/// Transforms/Util/Local.h
static bool isSafePHIToSpeculate(PHINode &PN, const DataLayout *DL = nullptr) {
  // For now, we can only do this promotion if the load is in the same block
  // as the PHI, and if there are no stores between the phi and load.
  // TODO: Allow recursive phi users.
  // TODO: Allow stores.
  BasicBlock *BB = PN.getParent();
  unsigned MaxAlign = 0;
  bool HaveLoad = false;
  for (User *U : PN.users()) {
    LoadInst *LI = dyn_cast<LoadInst>(U);
    if (!LI || !LI->isSimple())
      return false;

    // For now we only allow loads in the same block as the PHI. This is
    // a common case that happens when instcombine merges two loads through
    // a PHI.
    if (LI->getParent() != BB)
      return false;

    // Ensure that there are no instructions between the PHI and the load that
    // could store.
    for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
      if (BBI->mayWriteToMemory())
        return false;

    MaxAlign = std::max(MaxAlign, LI->getAlignment());
    HaveLoad = true;
  }

  if (!HaveLoad)
    return false;

  // We can only transform this if it is safe to push the loads into the
  // predecessor blocks. The only thing to watch out for is that we can't put
  // a possibly trapping load in the predecessor if it is a critical edge.
  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
    TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
    Value *InVal = PN.getIncomingValue(Idx);

    // If the value is produced by the terminator of the predecessor (an
    // invoke) or it has side-effects, there is no valid place to put a load
    // in the predecessor.
    if (TI == InVal || TI->mayHaveSideEffects())
      return false;

    // If the predecessor has a single successor, then the edge isn't
    // critical.
    if (TI->getNumSuccessors() == 1)
      continue;

    // If this pointer is always safe to load, or if we can prove that there
    // is already a load in the block, then we can move the load to the pred
    // block.
    if (InVal->isDereferenceablePointer(DL) ||
        isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL))
      continue;

    return false;
  }

  return true;
}
static void speculatePHINodeLoads(PHINode &PN) {
  DEBUG(dbgs() << "    original: " << PN << "\n");

  Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
  IRBuilderTy PHIBuilder(&PN);
  PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
                                        PN.getName() + ".sroa.speculated");

  // Get the AA tags and alignment to use from one of the loads. It doesn't
  // matter which one we get and if any differ.
  LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());

  AAMDNodes AATags;
  SomeLoad->getAAMetadata(AATags);
  unsigned Align = SomeLoad->getAlignment();

  // Rewrite all loads of the PN to use the new PHI.
  while (!PN.use_empty()) {
    LoadInst *LI = cast<LoadInst>(PN.user_back());
    LI->replaceAllUsesWith(NewPN);
    LI->eraseFromParent();
  }

  // Inject loads into all of the pred blocks.
  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
    BasicBlock *Pred = PN.getIncomingBlock(Idx);
    TerminatorInst *TI = Pred->getTerminator();
    Value *InVal = PN.getIncomingValue(Idx);
    IRBuilderTy PredBuilder(TI);

    LoadInst *Load = PredBuilder.CreateLoad(
        InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
    ++NumLoadsSpeculated;
    Load->setAlignment(Align);
    if (AATags)
      Load->setAAMetadata(AATags);
    NewPN->addIncoming(Load, Pred);
  }

  DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
  PN.eraseFromParent();
}
/// Select instructions that use an alloca and are subsequently loaded can be
/// rewritten to load both input pointers and then select between the result,
/// allowing the load of the alloca to be promoted.
/// From this:
///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
///   %V = load i32* %P2
/// to:
///   %V1 = load i32* %Alloca      -> will be mem2reg'd
///   %V2 = load i32* %Other
///   %V = select i1 %cond, i32 %V1, i32 %V2
///
/// We can do this to a select if its only uses are loads and if the operand
/// to the select can be loaded unconditionally.
static bool isSafeSelectToSpeculate(SelectInst &SI,
                                    const DataLayout *DL = nullptr) {
  Value *TValue = SI.getTrueValue();
  Value *FValue = SI.getFalseValue();
  bool TDerefable = TValue->isDereferenceablePointer(DL);
  bool FDerefable = FValue->isDereferenceablePointer(DL);

  for (User *U : SI.users()) {
    LoadInst *LI = dyn_cast<LoadInst>(U);
    if (!LI || !LI->isSimple())
      return false;

    // Both operands to the select need to be dereferencable, either
    // absolutely (e.g. allocas) or at this point because we can see other
    // accesses to it.
    if (!TDerefable &&
        !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL))
      return false;
    if (!FDerefable &&
        !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL))
      return false;
  }

  return true;
}
static void speculateSelectInstLoads(SelectInst &SI) {
  DEBUG(dbgs() << "    original: " << SI << "\n");

  IRBuilderTy IRB(&SI);
  Value *TV = SI.getTrueValue();
  Value *FV = SI.getFalseValue();
  // Replace the loads of the select with a select of two loads.
  while (!SI.use_empty()) {
    LoadInst *LI = cast<LoadInst>(SI.user_back());
    assert(LI->isSimple() && "We only speculate simple loads");

    IRB.SetInsertPoint(LI);
    LoadInst *TL =
        IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
    LoadInst *FL =
        IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
    NumLoadsSpeculated += 2;

    // Transfer alignment and AA info if present.
    TL->setAlignment(LI->getAlignment());
    FL->setAlignment(LI->getAlignment());

    AAMDNodes Tags;
    LI->getAAMetadata(Tags);
    if (Tags) {
      TL->setAAMetadata(Tags);
      FL->setAAMetadata(Tags);
    }

    Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
                                LI->getName() + ".sroa.speculated");

    DEBUG(dbgs() << "          speculated to: " << *V << "\n");
    LI->replaceAllUsesWith(V);
    LI->eraseFromParent();
  }
  SI.eraseFromParent();
}
/// \brief Build a GEP out of a base pointer and indices.
///
/// This will return the BasePtr if that is valid, or build a new GEP
/// instruction using the IRBuilder if GEP-ing is needed.
static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
                       SmallVectorImpl<Value *> &Indices, Twine NamePrefix) {
  if (Indices.empty())
    return BasePtr;

  // A single zero index is a no-op, so check for this and avoid building a GEP
  // in that case.
  if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
    return BasePtr;

  return IRB.CreateInBoundsGEP(BasePtr, Indices, NamePrefix + "sroa_idx");
}
/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
/// TargetTy without changing the offset of the pointer.
///
/// This routine assumes we've already established a properly offset GEP with
/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
/// zero-indices down through type layers until we find one the same as
/// TargetTy. If we can't find one with the same type, we at least try to use
/// one with the same size. If none of that works, we just produce the GEP as
/// indicated by Indices to have the correct offset.
static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
                                    Value *BasePtr, Type *Ty, Type *TargetTy,
                                    SmallVectorImpl<Value *> &Indices,
                                    Twine NamePrefix) {
  if (Ty == TargetTy)
    return buildGEP(IRB, BasePtr, Indices, NamePrefix);

  // Pointer size to use for the indices.
  unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType());

  // See if we can descend into a struct and locate a field with the correct
  // type.
  unsigned NumLayers = 0;
  Type *ElementTy = Ty;
  do {
    if (ElementTy->isPointerTy())
      break;

    if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
      ElementTy = ArrayTy->getElementType();
      Indices.push_back(IRB.getIntN(PtrSize, 0));
    } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
      ElementTy = VectorTy->getElementType();
      Indices.push_back(IRB.getInt32(0));
    } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
      if (STy->element_begin() == STy->element_end())
        break; // Nothing left to descend into.
      ElementTy = *STy->element_begin();
      Indices.push_back(IRB.getInt32(0));
    } else {
      break;
    }
    ++NumLayers;
  } while (ElementTy != TargetTy);
  if (ElementTy != TargetTy)
    Indices.erase(Indices.end() - NumLayers, Indices.end());

  return buildGEP(IRB, BasePtr, Indices, NamePrefix);
}
/// \brief Recursively compute indices for a natural GEP.
///
/// This is the recursive step for getNaturalGEPWithOffset that walks down the
/// element types adding appropriate indices for the GEP.
static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
                                       Value *Ptr, Type *Ty, APInt &Offset,
                                       Type *TargetTy,
                                       SmallVectorImpl<Value *> &Indices,
                                       Twine NamePrefix) {
  if (Offset == 0)
    return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
                                 NamePrefix);

  // We can't recurse through pointer types.
  if (Ty->isPointerTy())
    return nullptr;

  // We try to analyze GEPs over vectors here, but note that these GEPs are
  // extremely poorly defined currently. The long-term goal is to remove GEPing
  // over a vector from the IR completely.
  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
    unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
    if (ElementSizeInBits % 8 != 0) {
      // GEPs over non-multiple of 8 size vector elements are invalid.
      return nullptr;
    }
    APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
    APInt NumSkippedElements = Offset.sdiv(ElementSize);
    if (NumSkippedElements.ugt(VecTy->getNumElements()))
      return nullptr;
    Offset -= NumSkippedElements * ElementSize;
    Indices.push_back(IRB.getInt(NumSkippedElements));
    return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
                                    Offset, TargetTy, Indices, NamePrefix);
  }

  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
    Type *ElementTy = ArrTy->getElementType();
    APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
    APInt NumSkippedElements = Offset.sdiv(ElementSize);
    if (NumSkippedElements.ugt(ArrTy->getNumElements()))
      return nullptr;

    Offset -= NumSkippedElements * ElementSize;
    Indices.push_back(IRB.getInt(NumSkippedElements));
    return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
                                    Indices, NamePrefix);
  }

  StructType *STy = dyn_cast<StructType>(Ty);
  if (!STy)
    return nullptr;

  const StructLayout *SL = DL.getStructLayout(STy);
  uint64_t StructOffset = Offset.getZExtValue();
  if (StructOffset >= SL->getSizeInBytes())
    return nullptr;
  unsigned Index = SL->getElementContainingOffset(StructOffset);
  Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
  Type *ElementTy = STy->getElementType(Index);
  if (Offset.uge(DL.getTypeAllocSize(ElementTy)))
    return nullptr; // The offset points into alignment padding.

  Indices.push_back(IRB.getInt32(Index));
  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
                                  Indices, NamePrefix);
}
/// \brief Get a natural GEP from a base pointer to a particular offset and
/// resulting in a particular type.
///
/// The goal is to produce a "natural" looking GEP that works with the existing
/// composite types to arrive at the appropriate offset and element type for
/// a pointer. TargetTy is the element type the returned GEP should point-to if
/// possible. We recurse by decreasing Offset, adding the appropriate index to
/// Indices, and setting Ty to the result subtype.
///
/// If no natural GEP can be constructed, this function returns null.
static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
                                      Value *Ptr, APInt Offset, Type *TargetTy,
                                      SmallVectorImpl<Value *> &Indices,
                                      Twine NamePrefix) {
  PointerType *Ty = cast<PointerType>(Ptr->getType());

  // Don't consider any GEPs through an i8* as natural unless the TargetTy is
  // an i8 pointer.
  if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
    return nullptr;

  Type *ElementTy = Ty->getElementType();
  if (!ElementTy->isSized())
    return nullptr; // We can't GEP through an unsized element.
  APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
  if (ElementSize == 0)
    return nullptr; // Zero-length arrays can't help us build a natural GEP.
  APInt NumSkippedElements = Offset.sdiv(ElementSize);

  Offset -= NumSkippedElements * ElementSize;
  Indices.push_back(IRB.getInt(NumSkippedElements));
  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
                                  Indices, NamePrefix);
}
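// Illustrative sketch (assumed types, not from the original source): for a
// base pointer of type { i32, [8 x float] }* and Offset == 8 with TargetTy
// float, the recursion above emits the indices [0, 1, 1], i.e. roughly
//   getelementptr inbounds { i32, [8 x float] }* %base, i64 0, i32 1, i64 1
// which lands exactly on element 1 of the array with no leftover byte offset.
// Index types and names are schematic assumptions here.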
/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
/// resulting pointer has PointerTy.
///
/// This tries very hard to compute a "natural" GEP which arrives at the offset
/// and produces the pointer type desired. Where it cannot, it will try to use
/// the natural GEP to arrive at the offset and bitcast to the type. Where that
/// fails, it will try to use an existing i8* and GEP to the byte offset and
/// bitcast to the type.
///
/// The strategy for finding the more natural GEPs is to peel off layers of the
/// pointer, walking back through bit casts and GEPs, searching for a base
/// pointer from which we can compute a natural GEP with the desired
/// properties. The algorithm tries to fold as many constant indices into
/// a single GEP as possible, thus making each GEP more independent of the
/// surrounding code.
static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
                             APInt Offset, Type *PointerTy, Twine NamePrefix) {
  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(Ptr);
  SmallVector<Value *, 4> Indices;

  // We may end up computing an offset pointer that has the wrong type. If we
  // never are able to compute one directly that has the correct type, we'll
  // fall back to it, so keep it and the base it was computed from around here.
  Value *OffsetPtr = nullptr;
  Value *OffsetBasePtr;

  // Remember any i8 pointer we come across to re-use if we need to do a raw
  // byte offset.
  Value *Int8Ptr = nullptr;
  APInt Int8PtrOffset(Offset.getBitWidth(), 0);

  Type *TargetTy = PointerTy->getPointerElementType();

  do {
    // First fold any existing GEPs into the offset.
    while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
      APInt GEPOffset(Offset.getBitWidth(), 0);
      if (!GEP->accumulateConstantOffset(DL, GEPOffset))
        break;
      Offset += GEPOffset;
      Ptr = GEP->getPointerOperand();
      if (!Visited.insert(Ptr).second)
        break;
    }

    // See if we can perform a natural GEP here.
    Indices.clear();
    if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
                                           Indices, NamePrefix)) {
      // If we have a new natural pointer at the offset, clear out any old
      // offset pointer we computed. Unless it is the base pointer or
      // a non-instruction, we built a GEP we don't need. Zap it.
      if (OffsetPtr && OffsetPtr != OffsetBasePtr)
        if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
          assert(I->use_empty() && "Built a GEP with uses somehow!");
          I->eraseFromParent();
        }
      OffsetPtr = P;
      OffsetBasePtr = Ptr;
      // If we also found a pointer of the right type, we're done.
      if (P->getType() == PointerTy)
        return P;
    }

    // Stash this pointer if we've found an i8*.
    if (Ptr->getType()->isIntegerTy(8)) {
      Int8Ptr = Ptr;
      Int8PtrOffset = Offset;
    }

    // Peel off a layer of the pointer and update the offset appropriately.
    if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
      Ptr = cast<Operator>(Ptr)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
      if (GA->mayBeOverridden())
        break;
      Ptr = GA->getAliasee();
    } else {
      break;
    }
    assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(Ptr).second);

  if (!OffsetPtr) {
    if (!Int8Ptr) {
      Int8Ptr = IRB.CreateBitCast(
          Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
          NamePrefix + "sroa_raw_cast");
      Int8PtrOffset = Offset;
    }

    OffsetPtr = Int8PtrOffset == 0
                    ? Int8Ptr
                    : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
                                            NamePrefix + "sroa_raw_idx");
  }
  Ptr = OffsetPtr;

  // On the off chance we were targeting i8*, guard the bitcast here.
  if (Ptr->getType() != PointerTy)
    Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast");

  return Ptr;
}
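// Illustrative sketch (assumed values): asked for Offset 6 into a
// { i16, i16, i16, i16 }* with PointerTy == i16*, the natural-GEP path above
// yields roughly "getelementptr inbounds %p, 0, 3". When no natural GEP can
// be formed (for example the offset lands in padding), the fallback path
// instead emits an i8* bitcast, a raw byte GEP of 6, and a final bitcast to
// i16*. The concrete struct type is an assumption for illustration.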
/// \brief Compute the adjusted alignment for a load or store from an offset.
static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset,
                                     const DataLayout &DL) {
  unsigned Alignment;
  Type *Ty;
  if (auto *LI = dyn_cast<LoadInst>(I)) {
    Alignment = LI->getAlignment();
    Ty = LI->getType();
  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
    Alignment = SI->getAlignment();
    Ty = SI->getValueOperand()->getType();
  } else {
    llvm_unreachable("Only loads and stores are allowed!");
  }

  if (!Alignment)
    Alignment = DL.getABITypeAlignment(Ty);

  return MinAlign(Alignment, Offset);
}
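// Worked example (illustrative numbers only): a load with alignment 8 whose
// rewritten access starts 4 bytes into the old location can rely on at most
// MinAlign(8, 4) == 4 bytes of alignment, while with Offset == 16 the full
// alignment is preserved since MinAlign(8, 16) == 8.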
/// \brief Test whether we can convert a value from the old to the new type.
///
/// This predicate should be used to guard calls to convertValue in order to
/// ensure that we only try to convert viable values. The strategy is that we
/// will peel off single element struct and array wrappings to get to an
/// underlying value, and convert that value.
static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
  if (OldTy == NewTy)
    return true;
  if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
    if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
      if (NewITy->getBitWidth() >= OldITy->getBitWidth())
        return true;
  if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
    return false;
  if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
    return false;

  // We can convert pointers to integers and vice-versa. Same for vectors
  // of pointers and integers.
  OldTy = OldTy->getScalarType();
  NewTy = NewTy->getScalarType();
  if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
    if (NewTy->isPointerTy() && OldTy->isPointerTy())
      return true;
    if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
      return true;
    return false;
  }

  return true;
}
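// Illustrative examples of the predicate above (assuming 64-bit pointers):
// i64 <-> i8*, <2 x i32> <-> i64, and float <-> i32 are all convertible
// because the bit sizes match and both sides are single-value types;
// i32 -> i64 is accepted via the integer widening special case, while
// i64 -> i32 is rejected because it would narrow the value.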
/// \brief Generic routine to convert an SSA value to a value of a different
/// type.
///
/// This will try various different casting techniques, such as bitcasts,
/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
/// two types for viability with this routine.
static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                           Type *NewTy) {
  Type *OldTy = V->getType();
  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");

  if (OldTy == NewTy)
    return V;

  if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
    if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
      if (NewITy->getBitWidth() > OldITy->getBitWidth())
        return IRB.CreateZExt(V, NewITy);

  // See if we need inttoptr for this type pair. A cast involving both scalars
  // and vectors requires an additional bitcast.
  if (OldTy->getScalarType()->isIntegerTy() &&
      NewTy->getScalarType()->isPointerTy()) {
    // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
                                NewTy);

    // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
                                NewTy);

    return IRB.CreateIntToPtr(V, NewTy);
  }

  // See if we need ptrtoint for this type pair. A cast involving both scalars
  // and vectors requires an additional bitcast.
  if (OldTy->getScalarType()->isPointerTy() &&
      NewTy->getScalarType()->isIntegerTy()) {
    // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
                               NewTy);

    // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
                               NewTy);

    return IRB.CreatePtrToInt(V, NewTy);
  }

  return IRB.CreateBitCast(V, NewTy);
}
/// \brief Test whether the given slice use can be promoted to a vector.
///
/// This function is called to test each entry in a partitioning which is
/// slated for a single slice.
static bool isVectorPromotionViableForSlice(AllocaSlices::Partition &P,
                                            const Slice &S, VectorType *Ty,
                                            uint64_t ElementSize,
                                            const DataLayout &DL) {
  // First validate the slice offsets.
  uint64_t BeginOffset =
      std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
  uint64_t BeginIndex = BeginOffset / ElementSize;
  if (BeginIndex * ElementSize != BeginOffset ||
      BeginIndex >= Ty->getNumElements())
    return false;
  uint64_t EndOffset =
      std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
  uint64_t EndIndex = EndOffset / ElementSize;
  if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
    return false;

  assert(EndIndex > BeginIndex && "Empty vector!");
  uint64_t NumElements = EndIndex - BeginIndex;
  Type *SliceTy = (NumElements == 1)
                      ? Ty->getElementType()
                      : VectorType::get(Ty->getElementType(), NumElements);

  Type *SplitIntTy =
      Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);

  Use *U = S.getUse();

  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
    if (MI->isVolatile())
      return false;
    if (!S.isSplittable())
      return false; // Skip any unsplittable intrinsics.
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
        II->getIntrinsicID() != Intrinsic::lifetime_end)
      return false;
  } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
    // Disable vector promotion when there are loads or stores of an FCA.
    return false;
  } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
    if (LI->isVolatile())
      return false;
    Type *LTy = LI->getType();
    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
      assert(LTy->isIntegerTy());
      LTy = SplitIntTy;
    }
    if (!canConvertValue(DL, SliceTy, LTy))
      return false;
  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
    if (SI->isVolatile())
      return false;
    Type *STy = SI->getValueOperand()->getType();
    if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
      assert(STy->isIntegerTy());
      STy = SplitIntTy;
    }
    if (!canConvertValue(DL, STy, SliceTy))
      return false;
  } else {
    return false;
  }

  return true;
}
/// \brief Test whether the given alloca partitioning and range of slices can
/// be promoted to a vector.
///
/// This is a quick test to check whether we can rewrite a particular alloca
/// partition (and its newly formed alloca) into a vector alloca with only
/// whole-vector loads and stores such that it could be promoted to a vector
/// SSA value. We only can ensure this for a limited set of operations, and we
/// don't want to do the rewrites unless we are confident that the result will
/// be promotable, so we have an early test here.
static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P,
                                           const DataLayout &DL) {
  // Collect the candidate types for vector-based promotion. Also track whether
  // we have different element types.
  SmallVector<VectorType *, 4> CandidateTys;
  Type *CommonEltTy = nullptr;
  bool HaveCommonEltTy = true;
  auto CheckCandidateType = [&](Type *Ty) {
    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
      CandidateTys.push_back(VTy);
      if (!CommonEltTy)
        CommonEltTy = VTy->getElementType();
      else if (CommonEltTy != VTy->getElementType())
        HaveCommonEltTy = false;
    }
  };
  // Consider any loads or stores that are the exact size of the slice.
  for (const Slice &S : P)
    if (S.beginOffset() == P.beginOffset() &&
        S.endOffset() == P.endOffset()) {
      if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
        CheckCandidateType(LI->getType());
      else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
        CheckCandidateType(SI->getValueOperand()->getType());
    }

  // If we didn't find a vector type, nothing to do here.
  if (CandidateTys.empty())
    return nullptr;

  // Remove non-integer vector types if we had multiple common element types.
  // FIXME: It'd be nice to replace them with integer vector types, but we can't
  // do that until all the backends are known to produce good code for all
  // integer vector types.
  if (!HaveCommonEltTy) {
    CandidateTys.erase(std::remove_if(CandidateTys.begin(), CandidateTys.end(),
                                      [](VectorType *VTy) {
                         return !VTy->getElementType()->isIntegerTy();
                       }),
                       CandidateTys.end());

    // If there were no integer vector types, give up.
    if (CandidateTys.empty())
      return nullptr;

    // Rank the remaining candidate vector types. This is easy because we know
    // they're all integer vectors. We sort by ascending number of elements.
    auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
      assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
             "Cannot have vector types of different sizes!");
      assert(RHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      assert(LHSTy->getElementType()->isIntegerTy() &&
             "All non-integer types eliminated!");
      return RHSTy->getNumElements() < LHSTy->getNumElements();
    };
    std::sort(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes);
    CandidateTys.erase(
        std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
        CandidateTys.end());
  } else {
// The only way to have the same element type in every vector type is to
// have the same vector type. Check that and remove all but one.
#ifndef NDEBUG
    for (VectorType *VTy : CandidateTys) {
      assert(VTy->getElementType() == CommonEltTy &&
             "Unaccounted for element type!");
      assert(VTy == CandidateTys[0] &&
             "Different vector types with the same element type!");
    }
#endif
    CandidateTys.resize(1);
  }

  // Try each vector type, and return the one which works.
  auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
    uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());

    // While the definition of LLVM vectors is bitpacked, we don't support sizes
    // that aren't byte sized.
    if (ElementSize % 8)
      return false;
    assert((DL.getTypeSizeInBits(VTy) % 8) == 0 &&
           "vector size not a multiple of element size?");
    ElementSize /= 8;

    for (const Slice &S : P)
      if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
        return false;

    for (const Slice *S : P.splitSliceTails())
      if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
        return false;

    return true;
  };
  for (VectorType *VTy : CandidateTys)
    if (CheckVectorTypeForPromotion(VTy))
      return VTy;

  return nullptr;
}
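// Illustrative sketch (assumed IR, not from the original source): if a
// 16-byte partition is covered by one whole-partition "store <4 x float>"
// plus a "load float" of bytes 4 to 8, the <4 x float> candidate survives the
// per-slice checks above (the small load simply maps to element 1), so the
// partition is rewritten onto a vector alloca.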
/// \brief Test whether a slice of an alloca is valid for integer widening.
///
/// This implements the necessary checking for the \c isIntegerWideningViable
/// test below on a single slice of the alloca.
static bool isIntegerWideningViableForSlice(const Slice &S,
                                            uint64_t AllocBeginOffset,
                                            Type *AllocaTy,
                                            const DataLayout &DL,
                                            bool &WholeAllocaOp) {
  uint64_t Size = DL.getTypeStoreSize(AllocaTy);

  uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
  uint64_t RelEnd = S.endOffset() - AllocBeginOffset;

  // We can't reasonably handle cases where the load or store extends past
  // the end of the alloca's type and into its padding.
  if (RelEnd > Size)
    return false;

  Use *U = S.getUse();

  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
    if (LI->isVolatile())
      return false;
    // Note that we don't count vector loads or stores as whole-alloca
    // operations which enable integer widening because we would prefer to use
    // vector widening instead.
    if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
      WholeAllocaOp = true;
    if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
        return false;
    } else if (RelBegin != 0 || RelEnd != Size ||
               !canConvertValue(DL, AllocaTy, LI->getType())) {
      // Non-integer loads need to be convertible from the alloca type so that
      // they are promotable.
      return false;
    }
  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
    Type *ValueTy = SI->getValueOperand()->getType();
    if (SI->isVolatile())
      return false;
    // Note that we don't count vector loads or stores as whole-alloca
    // operations which enable integer widening because we would prefer to use
    // vector widening instead.
    if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
      WholeAllocaOp = true;
    if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
        return false;
    } else if (RelBegin != 0 || RelEnd != Size ||
               !canConvertValue(DL, ValueTy, AllocaTy)) {
      // Non-integer stores need to be convertible to the alloca type so that
      // they are promotable.
      return false;
    }
  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
    if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
      return false;
    if (!S.isSplittable())
      return false; // Skip any unsplittable intrinsics.
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
        II->getIntrinsicID() != Intrinsic::lifetime_end)
      return false;
  } else {
    return false;
  }

  return true;
}
/// \brief Test whether the given alloca partition's integer operations can be
/// widened to promotable ones.
///
/// This is a quick test to check whether we can rewrite the integer loads and
/// stores to a particular alloca into wider loads and stores and be able to
/// promote the resulting alloca.
static bool isIntegerWideningViable(AllocaSlices::Partition &P, Type *AllocaTy,
                                    const DataLayout &DL) {
  uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
  // Don't create integer types larger than the maximum bitwidth.
  if (SizeInBits > IntegerType::MAX_INT_BITS)
    return false;

  // Don't try to handle allocas with bit-padding.
  if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy))
    return false;

  // We need to ensure that an integer type with the appropriate bitwidth can
  // be converted to the alloca type, whatever that is. We don't want to force
  // the alloca itself to have an integer type if there is a more suitable one.
  Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
  if (!canConvertValue(DL, AllocaTy, IntTy) ||
      !canConvertValue(DL, IntTy, AllocaTy))
    return false;

  // While examining uses, we ensure that the alloca has a covering load or
  // store. We don't want to widen the integer operations only to fail to
  // promote due to some other unsplittable entry (which we may make splittable
  // later). However, if there are only splittable uses, go ahead and assume
  // that we cover the alloca.
  // FIXME: We shouldn't consider split slices that happen to start in the
  // partition here...
  bool WholeAllocaOp =
      P.begin() != P.end() ? false : DL.isLegalInteger(SizeInBits);

  for (const Slice &S : P)
    if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
                                         WholeAllocaOp))
      return false;

  for (const Slice *S : P.splitSliceTails())
    if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
                                         WholeAllocaOp))
      return false;

  return WholeAllocaOp;
}
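// Illustrative sketch (assumed uses): for an i32 alloca accessed by one full
// "load i32" and one "store i16" to its first two bytes, the full load sets
// WholeAllocaOp and the narrow store passes the per-slice test, so the
// partition is widened; the i16 store is later rewritten as a load of the
// whole i32, an insertInteger of the i16, and a store of the combined i32.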
static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
                             IntegerType *Ty, uint64_t Offset,
                             const Twine &Name) {
  DEBUG(dbgs() << "       start: " << *V << "\n");
  IntegerType *IntTy = cast<IntegerType>(V->getType());
  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
         "Element extends past full value");
  uint64_t ShAmt = 8 * Offset;
  if (DL.isBigEndian())
    ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
  if (ShAmt) {
    V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
    DEBUG(dbgs() << "     shifted: " << *V << "\n");
  }
  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
         "Cannot extract to a larger integer!");
  if (Ty != IntTy) {
    V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
    DEBUG(dbgs() << "     trunced: " << *V << "\n");
  }
  return V;
}
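// Worked example (illustrative, little-endian): extracting an i8 at byte
// Offset 2 from an i32 value computes ShAmt = 16, so the result is roughly
//   %shift = lshr i32 %v, 16
//   %trunc = trunc i32 %shift to i8
// On a big-endian target the shift amount would instead be 8 * (4 - 1 - 2)
// = 8. The value names here are assumptions for illustration.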
static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
                            Value *V, uint64_t Offset, const Twine &Name) {
  IntegerType *IntTy = cast<IntegerType>(Old->getType());
  IntegerType *Ty = cast<IntegerType>(V->getType());
  assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
         "Cannot insert a larger integer!");
  DEBUG(dbgs() << "       start: " << *V << "\n");
  if (Ty != IntTy) {
    V = IRB.CreateZExt(V, IntTy, Name + ".ext");
    DEBUG(dbgs() << "    extended: " << *V << "\n");
  }
  assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
         "Element store outside of alloca store");
  uint64_t ShAmt = 8 * Offset;
  if (DL.isBigEndian())
    ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
  if (ShAmt) {
    V = IRB.CreateShl(V, ShAmt, Name + ".shift");
    DEBUG(dbgs() << "     shifted: " << *V << "\n");
  }

  if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
    APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
    Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
    DEBUG(dbgs() << "      masked: " << *Old << "\n");
    V = IRB.CreateOr(Old, V, Name + ".insert");
    DEBUG(dbgs() << "    inserted: " << *V << "\n");
  }
  return V;
}
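// Worked example (illustrative, little-endian): inserting an i8 at byte
// Offset 1 into an i32 Old value zero-extends the i8, shifts it left by 8,
// masks Old with ~(0xFF << 8) == 0xFFFF00FF, and ORs the two together, i.e.
//   %ext    = zext i8 %v to i32
//   %shift  = shl i32 %ext, 8
//   %mask   = and i32 %old, -65281        ; 0xFFFF00FF
//   %insert = or i32 %mask, %shift
// Names and the concrete widths are assumptions for illustration.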
static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
                            unsigned EndIndex, const Twine &Name) {
  VectorType *VecTy = cast<VectorType>(V->getType());
  unsigned NumElements = EndIndex - BeginIndex;
  assert(NumElements <= VecTy->getNumElements() && "Too many elements!");

  if (NumElements == VecTy->getNumElements())
    return V;

  if (NumElements == 1) {
    V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
                                 Name + ".extract");
    DEBUG(dbgs() << "     extract: " << *V << "\n");
    return V;
  }

  SmallVector<Constant *, 8> Mask;
  Mask.reserve(NumElements);
  for (unsigned i = BeginIndex; i != EndIndex; ++i)
    Mask.push_back(IRB.getInt32(i));
  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
                              ConstantVector::get(Mask), Name + ".extract");
  DEBUG(dbgs() << "     shuffle: " << *V << "\n");
  return V;
}
static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
                           unsigned BeginIndex, const Twine &Name) {
  VectorType *VecTy = cast<VectorType>(Old->getType());
  assert(VecTy && "Can only insert a vector into a vector");

  VectorType *Ty = dyn_cast<VectorType>(V->getType());
  if (!Ty) {
    // Single element to insert.
    V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
                                Name + ".insert");
    DEBUG(dbgs() << "     insert: " << *V << "\n");
    return V;
  }

  assert(Ty->getNumElements() <= VecTy->getNumElements() &&
         "Too many elements!");
  if (Ty->getNumElements() == VecTy->getNumElements()) {
    assert(V->getType() == VecTy && "Vector type mismatch");
    return V;
  }
  unsigned EndIndex = BeginIndex + Ty->getNumElements();

  // When inserting a smaller vector into the larger to store, we first
  // use a shuffle vector to widen it with undef elements, and then
  // a second shuffle vector to select between the loaded vector and the
  // incoming vector.
  SmallVector<Constant *, 8> Mask;
  Mask.reserve(VecTy->getNumElements());
  for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
    if (i >= BeginIndex && i < EndIndex)
      Mask.push_back(IRB.getInt32(i - BeginIndex));
    else
      Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
  V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
                              ConstantVector::get(Mask), Name + ".expand");
  DEBUG(dbgs() << "    shuffle: " << *V << "\n");

  Mask.clear();
  for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
    Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));

  V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");

  DEBUG(dbgs() << "    blend: " << *V << "\n");
  return V;
}
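// Illustrative sketch (assumed types): inserting a <2 x i32> into elements
// 1 and 2 of a <4 x i32> first widens the small vector with undef lanes,
//   %expand = shufflevector <2 x i32> %v, <2 x i32> undef,
//                           <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
// and then blends it with the old value,
//   %blend = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>,
//                   <4 x i32> %expand, <4 x i32> %old
// Element counts and names here are assumptions for illustration.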
namespace {
/// \brief Visitor to rewrite instructions using a particular slice of an
/// alloca to use a new alloca.
///
/// Also implements the rewriting to vector-based accesses when the partition
/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
/// lives here.
class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
  // Befriend the base class so it can delegate to private visit methods.
  friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;
  typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;

  const DataLayout &DL;
  AllocaSlices &AS;
  SROA &Pass;
  AllocaInst &OldAI, &NewAI;
  const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
  Type *NewAllocaTy;

  // This is a convenience and flag variable that will be null unless the new
  // alloca's integer operations should be widened to this integer type due to
  // passing isIntegerWideningViable above. If it is non-null, the desired
  // integer type will be stored here for easy access during rewriting.
  IntegerType *IntTy;

  // If we are rewriting an alloca partition which can be written as pure
  // vector operations, we stash extra information here. When VecTy is
  // non-null, we have some strict guarantees about the rewritten alloca:
  //   - The new alloca is exactly the size of the vector type here.
  //   - The accesses all either map to the entire vector or to a single
  //     element.
  //   - The set of accessing instructions is only one of those handled above
  //     in isVectorPromotionViable. Generally these are the same access kinds
  //     which are promotable via mem2reg.
  VectorType *VecTy;
  Type *ElementTy;
  uint64_t ElementSize;

  // The original offset of the slice currently being rewritten relative to
  // the original alloca.
  uint64_t BeginOffset, EndOffset;
  // The new offsets of the slice currently being rewritten relative to the
  // original alloca.
  uint64_t NewBeginOffset, NewEndOffset;

  uint64_t SliceSize;
  bool IsSplittable;
  bool IsSplit;
  Use *OldUse;
  Instruction *OldPtr;

  // Track post-rewrite users which are PHI nodes and Selects.
  SmallPtrSetImpl<PHINode *> &PHIUsers;
  SmallPtrSetImpl<SelectInst *> &SelectUsers;

  // Utility IR builder, whose name prefix is setup for each visited use, and
  // the insertion point is set to point to the user.
  IRBuilderTy IRB;

public:
  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
                      AllocaInst &OldAI, AllocaInst &NewAI,
                      uint64_t NewAllocaBeginOffset,
                      uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
                      VectorType *PromotableVecTy,
                      SmallPtrSetImpl<PHINode *> &PHIUsers,
                      SmallPtrSetImpl<SelectInst *> &SelectUsers)
      : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
        NewAllocaBeginOffset(NewAllocaBeginOffset),
        NewAllocaEndOffset(NewAllocaEndOffset),
        NewAllocaTy(NewAI.getAllocatedType()),
        IntTy(IsIntegerPromotable
                  ? Type::getIntNTy(
                        NewAI.getContext(),
                        DL.getTypeSizeInBits(NewAI.getAllocatedType()))
                  : nullptr),
        VecTy(PromotableVecTy),
        ElementTy(VecTy ? VecTy->getElementType() : nullptr),
        ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
        BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
        OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
        IRB(NewAI.getContext(), ConstantFolder()) {
    if (VecTy) {
      assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
             "Only multiple-of-8 sized vector elements are viable");
      ++NumVectorized;
    }
    assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
  }
  bool visit(AllocaSlices::const_iterator I) {
    bool CanSROA = true;
    BeginOffset = I->beginOffset();
    EndOffset = I->endOffset();
    IsSplittable = I->isSplittable();
    IsSplit =
        BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
    DEBUG(dbgs() << "  rewriting " << (IsSplit ? "split " : ""));
    DEBUG(AS.printSlice(dbgs(), I, ""));
    DEBUG(dbgs() << "\n");

    // Compute the intersecting offset range.
    assert(BeginOffset < NewAllocaEndOffset);
    assert(EndOffset > NewAllocaBeginOffset);
    NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
    NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);

    SliceSize = NewEndOffset - NewBeginOffset;

    OldUse = I->getUse();
    OldPtr = cast<Instruction>(OldUse->get());

    Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
    IRB.SetInsertPoint(OldUserI);
    IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
    IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");

    CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
    if (VecTy || IntTy)
      assert(CanSROA);
    return CanSROA;
  }

private:
  // Make sure the other visit overloads are visible.
  using Base::visit;

  // Every instruction which can end up as a user must have a rewrite rule.
  bool visitInstruction(Instruction &I) {
    DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
    llvm_unreachable("No rewrite rule for this instruction!");
  }
  Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
    // Note that the offset computation can use BeginOffset or NewBeginOffset
    // interchangeably for unsplit slices.
    assert(IsSplit || BeginOffset == NewBeginOffset);
    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;

#ifndef NDEBUG
    StringRef OldName = OldPtr->getName();
    // Skip through the last '.sroa.' component of the name.
    size_t LastSROAPrefix = OldName.rfind(".sroa.");
    if (LastSROAPrefix != StringRef::npos) {
      OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
      // Look for an SROA slice index.
      size_t IndexEnd = OldName.find_first_not_of("0123456789");
      if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
        // Strip the index and look for the offset.
        OldName = OldName.substr(IndexEnd + 1);
        size_t OffsetEnd = OldName.find_first_not_of("0123456789");
        if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
          // Strip the offset.
          OldName = OldName.substr(OffsetEnd + 1);
      }
    }
    // Strip any SROA suffixes as well.
    OldName = OldName.substr(0, OldName.find(".sroa_"));
#endif

    return getAdjustedPtr(IRB, DL, &NewAI,
                          APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
#ifndef NDEBUG
                          Twine(OldName) + "."
#else
                          Twine()
#endif
                          );
  }
  /// \brief Compute suitable alignment to access this slice of the *new*
  /// alloca.
  ///
  /// You can optionally pass a type to this routine and if that type's ABI
  /// alignment is itself suitable, this will return zero.
  unsigned getSliceAlign(Type *Ty = nullptr) {
    unsigned NewAIAlign = NewAI.getAlignment();
    if (!NewAIAlign)
      NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
    unsigned Align =
        MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
    return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align;
  }

  unsigned getIndex(uint64_t Offset) {
    assert(VecTy && "Can only call getIndex when rewriting a vector");
    uint64_t RelOffset = Offset - NewAllocaBeginOffset;
    assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
    uint32_t Index = RelOffset / ElementSize;
    assert(Index * ElementSize == RelOffset);
    return Index;
  }
  void deleteIfTriviallyDead(Value *V) {
    Instruction *I = cast<Instruction>(V);
    if (isInstructionTriviallyDead(I))
      Pass.DeadInsts.insert(I);
  }

  Value *rewriteVectorizedLoadInst() {
    unsigned BeginIndex = getIndex(NewBeginOffset);
    unsigned EndIndex = getIndex(NewEndOffset);
    assert(EndIndex > BeginIndex && "Empty vector!");

    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
    return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
  }
  Value *rewriteIntegerLoad(LoadInst &LI) {
    assert(IntTy && "We cannot insert an integer to the alloca");
    assert(!LI.isVolatile());
    Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
    V = convertValue(DL, IRB, V, IntTy);
    assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
    if (Offset > 0 || NewEndOffset < NewAllocaEndOffset)
      V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset,
                         "extract");
    return V;
  }
  bool visitLoadInst(LoadInst &LI) {
    DEBUG(dbgs() << "    original: " << LI << "\n");
    Value *OldOp = LI.getOperand(0);
    assert(OldOp == OldPtr);

    Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
                             : LI.getType();
    bool IsPtrAdjusted = false;
    Value *V;
    if (VecTy) {
      V = rewriteVectorizedLoadInst();
    } else if (IntTy && LI.getType()->isIntegerTy()) {
      V = rewriteIntegerLoad(LI);
    } else if (NewBeginOffset == NewAllocaBeginOffset &&
               canConvertValue(DL, NewAllocaTy, LI.getType())) {
      V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(),
                                LI.getName());
    } else {
      Type *LTy = TargetTy->getPointerTo();
      V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
                                getSliceAlign(TargetTy), LI.isVolatile(),
                                LI.getName());
      IsPtrAdjusted = true;
    }
    V = convertValue(DL, IRB, V, TargetTy);

    if (IsSplit) {
      assert(!LI.isVolatile());
      assert(LI.getType()->isIntegerTy() &&
             "Only integer type loads and stores are split");
      assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
             "Split load isn't smaller than original load");
      assert(LI.getType()->getIntegerBitWidth() ==
                 DL.getTypeStoreSizeInBits(LI.getType()) &&
             "Non-byte-multiple bit width");
      // Move the insertion point just past the load so that we can refer to it.
      IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI)));
      // Create a placeholder value with the same type as LI to use as the
      // basis for the new value. This allows us to replace the uses of LI with
      // the computed value, and then replace the placeholder with LI, leaving
      // LI only used for this computation.
      Value *Placeholder =
          new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
                        "insert");
      LI.replaceAllUsesWith(V);
      Placeholder->replaceAllUsesWith(&LI);
      delete Placeholder;
    } else {
      LI.replaceAllUsesWith(V);
    }

    Pass.DeadInsts.insert(&LI);
    deleteIfTriviallyDead(OldOp);
    DEBUG(dbgs() << "          to: " << *V << "\n");
    return !LI.isVolatile() && !IsPtrAdjusted;
  }
  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) {
    if (V->getType() != VecTy) {
      unsigned BeginIndex = getIndex(NewBeginOffset);
      unsigned EndIndex = getIndex(NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
      unsigned NumElements = EndIndex - BeginIndex;
      assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
      Type *SliceTy = (NumElements == 1)
                          ? ElementTy
                          : VectorType::get(ElementTy, NumElements);
      if (V->getType() != SliceTy)
        V = convertValue(DL, IRB, V, SliceTy);

      // Mix in the existing elements.
      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
      V = insertVector(IRB, Old, V, BeginIndex, "vec");
    }
    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
    Pass.DeadInsts.insert(&SI);

    (void)Store;
    DEBUG(dbgs() << "          to: " << *Store << "\n");
    return true;
  }
  bool rewriteIntegerStore(Value *V, StoreInst &SI) {
    assert(IntTy && "We cannot extract an integer from the alloca");
    assert(!SI.isVolatile());
    if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
      Value *Old =
          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
      Old = convertValue(DL, IRB, Old, IntTy);
      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
      V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
    }
    V = convertValue(DL, IRB, V, NewAllocaTy);
    StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
    Pass.DeadInsts.insert(&SI);
    (void)Store;
    DEBUG(dbgs() << "          to: " << *Store << "\n");
    return true;
  }
  bool visitStoreInst(StoreInst &SI) {
    DEBUG(dbgs() << "    original: " << SI << "\n");
    Value *OldOp = SI.getOperand(1);
    assert(OldOp == OldPtr);

    Value *V = SI.getValueOperand();

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after promoting this alloca.
    if (V->getType()->isPointerTy())
      if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
        Pass.PostPromotionWorklist.insert(AI);

    if (SliceSize < DL.getTypeStoreSize(V->getType())) {
      assert(!SI.isVolatile());
      assert(V->getType()->isIntegerTy() &&
             "Only integer type loads and stores are split");
      assert(V->getType()->getIntegerBitWidth() ==
                 DL.getTypeStoreSizeInBits(V->getType()) &&
             "Non-byte-multiple bit width");
      IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
                         "extract");
    }

    if (VecTy)
      return rewriteVectorizedStoreInst(V, SI, OldOp);
    if (IntTy && V->getType()->isIntegerTy())
      return rewriteIntegerStore(V, SI);

    StoreInst *NewSI;
    if (NewBeginOffset == NewAllocaBeginOffset &&
        NewEndOffset == NewAllocaEndOffset &&
        canConvertValue(DL, V->getType(), NewAllocaTy)) {
      V = convertValue(DL, IRB, V, NewAllocaTy);
      NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
                                     SI.isVolatile());
    } else {
      Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
      NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
                                     SI.isVolatile());
    }
    Pass.DeadInsts.insert(&SI);
    deleteIfTriviallyDead(OldOp);

    DEBUG(dbgs() << "          to: " << *NewSI << "\n");
    return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
  }
  /// \brief Compute an integer value from splatting an i8 across the given
  /// number of bytes.
  ///
  /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
  /// call this routine.
  /// FIXME: Heed the advice above.
  ///
  /// \param V The i8 value to splat.
  /// \param Size The number of bytes in the output (assuming i8 is one byte)
  Value *getIntegerSplat(Value *V, unsigned Size) {
    assert(Size > 0 && "Expected a positive number of bytes.");
    IntegerType *VTy = cast<IntegerType>(V->getType());
    assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
    if (Size == 1)
      return V;

    Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
    V = IRB.CreateMul(
        IRB.CreateZExt(V, SplatIntTy, "zext"),
        ConstantExpr::getUDiv(
            Constant::getAllOnesValue(SplatIntTy),
            ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
                                  SplatIntTy)),
        "isplat");
    return V;
  }
  /// \brief Compute a vector splat for a given element value.
  Value *getVectorSplat(Value *V, unsigned NumElements) {
    V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
    DEBUG(dbgs() << "       splat: " << *V << "\n");
    return V;
  }
  bool visitMemSetInst(MemSetInst &II) {
    DEBUG(dbgs() << "    original: " << II << "\n");
    assert(II.getRawDest() == OldPtr);

    // If the memset has a variable size, it cannot be split, just adjust the
    // pointer to the new alloca.
    if (!isa<Constant>(II.getLength())) {
      assert(!IsSplit);
      assert(NewBeginOffset == BeginOffset);
      II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
      Type *CstTy = II.getAlignmentCst()->getType();
      II.setAlignment(ConstantInt::get(CstTy, getSliceAlign()));

      deleteIfTriviallyDead(OldPtr);
      return false;
    }

    // Record this instruction for deletion.
    Pass.DeadInsts.insert(&II);

    Type *AllocaTy = NewAI.getAllocatedType();
    Type *ScalarTy = AllocaTy->getScalarType();

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memset.
    if (!VecTy && !IntTy &&
        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
         SliceSize != DL.getTypeStoreSize(AllocaTy) ||
         !AllocaTy->isSingleValueType() ||
         !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
         DL.getTypeSizeInBits(ScalarTy) % 8 != 0)) {
      Type *SizeTy = II.getLength()->getType();
      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
      CallInst *New = IRB.CreateMemSet(
          getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
          getSliceAlign(), II.isVolatile());
      (void)New;
      DEBUG(dbgs() << "          to: " << *New << "\n");
      return false;
    }

    // If we can represent this as a simple value, we have to build the actual
    // value to store, which requires expanding the byte present in memset to
    // a sensible representation for the alloca type. This is essentially
    // splatting the byte to a sufficiently wide integer, splatting it across
    // any desired vector width, and bitcasting to the final type.
    Value *V;

    if (VecTy) {
      // If this is a memset of a vectorized alloca, insert it.
      assert(ElementTy == ScalarTy);

      unsigned BeginIndex = getIndex(NewBeginOffset);
      unsigned EndIndex = getIndex(NewEndOffset);
      assert(EndIndex > BeginIndex && "Empty vector!");
      unsigned NumElements = EndIndex - BeginIndex;
      assert(NumElements <= VecTy->getNumElements() && "Too many elements!");

      Value *Splat =
          getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8);
      Splat = convertValue(DL, IRB, Splat, ElementTy);
      if (NumElements > 1)
        Splat = getVectorSplat(Splat, NumElements);

      Value *Old =
          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
      V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
    } else if (IntTy) {
      // If this is a memset on an alloca where we can widen stores, insert the
      // set integer.
      assert(!II.isVolatile());

      uint64_t Size = NewEndOffset - NewBeginOffset;
      V = getIntegerSplat(II.getValue(), Size);

      if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
                    EndOffset != NewAllocaBeginOffset)) {
        Value *Old =
            IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
        Old = convertValue(DL, IRB, Old, IntTy);
        uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
        V = insertInteger(DL, IRB, Old, V, Offset, "insert");
      } else {
        assert(V->getType() == IntTy &&
               "Wrong type for an alloca wide integer!");
      }
      V = convertValue(DL, IRB, V, AllocaTy);
    } else {
      // Established these invariants above.
      assert(NewBeginOffset == NewAllocaBeginOffset);
      assert(NewEndOffset == NewAllocaEndOffset);

      V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8);
      if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
        V = getVectorSplat(V, AllocaVecTy->getNumElements());

      V = convertValue(DL, IRB, V, AllocaTy);
    }

    Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
                                        II.isVolatile());
    (void)New;
    DEBUG(dbgs() << "          to: " << *New << "\n");
    return !II.isVolatile();
  }
  bool visitMemTransferInst(MemTransferInst &II) {
    // Rewriting of memory transfer instructions can be a bit tricky. We break
    // them into two categories: split intrinsics and unsplit intrinsics.

    DEBUG(dbgs() << "    original: " << II << "\n");

    bool IsDest = &II.getRawDestUse() == OldUse;
    assert((IsDest && II.getRawDest() == OldPtr) ||
           (!IsDest && II.getRawSource() == OldPtr));

    unsigned SliceAlign = getSliceAlign();

    // For unsplit intrinsics, we simply modify the source and destination
    // pointers in place. This isn't just an optimization, it is a matter of
    // correctness. With unsplit intrinsics we may be dealing with transfers
    // within a single alloca before SROA ran, or with transfers that have
    // a variable length. We may also be dealing with memmove instead of
    // memcpy, and so simply updating the pointers is necessary for us to
    // update both source and dest of a single call.
    if (!IsSplittable) {
      Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      if (IsDest)
        II.setDest(AdjustedPtr);
      else
        II.setSource(AdjustedPtr);

      if (II.getAlignment() > SliceAlign) {
        Type *CstTy = II.getAlignmentCst()->getType();
        II.setAlignment(
            ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign)));
      }

      DEBUG(dbgs() << "          to: " << II << "\n");
      deleteIfTriviallyDead(OldPtr);
      return false;
    }
    // For split transfer intrinsics we have an incredibly useful assurance:
    // the source and destination do not reside within the same alloca, and at
    // least one of them does not escape. This means that we can replace
    // memmove with memcpy, and we don't need to worry about all manner of
    // downsides to splitting and transforming the operations.

    // If this doesn't map cleanly onto the alloca type, and that type isn't
    // a single value type, just emit a memcpy.
    bool EmitMemCpy =
        !VecTy && !IntTy &&
        (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
         SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
         !NewAI.getAllocatedType()->isSingleValueType());

    // If we're just going to emit a memcpy, the alloca hasn't changed, and the
    // size hasn't been shrunk based on analysis of the viable range, this is
    // a no-op.
    if (EmitMemCpy && &OldAI == &NewAI) {
      // Ensure the start lines up.
      assert(NewBeginOffset == BeginOffset);

      // Rewrite the size as needed.
      if (NewEndOffset != EndOffset)
        II.setLength(ConstantInt::get(II.getLength()->getType(),
                                      NewEndOffset - NewBeginOffset));
      return false;
    }
    // Record this instruction for deletion.
    Pass.DeadInsts.insert(&II);

    // Strip all inbounds GEPs and pointer casts to try to dig out any root
    // alloca that should be re-examined after rewriting this instruction.
    Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
    if (AllocaInst *AI =
            dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
      assert(AI != &OldAI && AI != &NewAI &&
             "Splittable transfers cannot reach the same alloca on both ends.");
      Pass.Worklist.insert(AI);
    }

    Type *OtherPtrTy = OtherPtr->getType();
    unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();

    // Compute the relative offset for the other pointer within the transfer.
    unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS);
    APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
    unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1,
                                   OtherOffset.zextOrTrunc(64).getZExtValue());

    if (EmitMemCpy) {
      // Compute the other pointer, folding as much as possible to produce
      // a single, simple GEP in most cases.
      OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
                                OtherPtr->getName() + ".");

      Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      Type *SizeTy = II.getLength()->getType();
      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);

      CallInst *New = IRB.CreateMemCpy(
          IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size,
          MinAlign(SliceAlign, OtherAlign), II.isVolatile());
      (void)New;
      DEBUG(dbgs() << "          to: " << *New << "\n");
      return false;
    }

    bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
                         NewEndOffset == NewAllocaEndOffset;
    uint64_t Size = NewEndOffset - NewBeginOffset;
    unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
    unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
    unsigned NumElements = EndIndex - BeginIndex;
    IntegerType *SubIntTy =
        IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;

    // Reset the other pointer type to match the register type we're going to
    // use, but using the address space of the original other pointer.
    if (VecTy && !IsWholeAlloca) {
      if (NumElements == 1)
        OtherPtrTy = VecTy->getElementType();
      else
        OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements);

      OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS);
    } else if (IntTy && !IsWholeAlloca) {
      OtherPtrTy = SubIntTy->getPointerTo(OtherAS);
    } else {
      OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS);
    }

    Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
                                   OtherPtr->getName() + ".");
    unsigned SrcAlign = OtherAlign;
    Value *DstPtr = &NewAI;
    unsigned DstAlign = SliceAlign;
    if (!IsDest) {
      std::swap(SrcPtr, DstPtr);
      std::swap(SrcAlign, DstAlign);
    }

    Value *Src;
    if (VecTy && !IsWholeAlloca && !IsDest) {
      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
      Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
    } else if (IntTy && !IsWholeAlloca && !IsDest) {
      Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "load");
      Src = convertValue(DL, IRB, Src, IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
    } else {
      Src =
          IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), "copyload");
    }

    if (VecTy && !IsWholeAlloca && IsDest) {
      Value *Old =
          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
      Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
    } else if (IntTy && !IsWholeAlloca && IsDest) {
      Value *Old =
          IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), "oldload");
      Old = convertValue(DL, IRB, Old, IntTy);
      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
      Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
      Src = convertValue(DL, IRB, Src, NewAllocaTy);
    }

    StoreInst *Store = cast<StoreInst>(
        IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
    (void)Store;
    DEBUG(dbgs() << "          to: " << *Store << "\n");
    return !II.isVolatile();
  }
  bool visitIntrinsicInst(IntrinsicInst &II) {
    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
           II.getIntrinsicID() == Intrinsic::lifetime_end);
    DEBUG(dbgs() << "    original: " << II << "\n");
    assert(II.getArgOperand(1) == OldPtr);

    // Record this instruction for deletion.
    Pass.DeadInsts.insert(&II);

    ConstantInt *Size =
        ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
                         NewEndOffset - NewBeginOffset);
    Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
    Value *New;
    if (II.getIntrinsicID() == Intrinsic::lifetime_start)
      New = IRB.CreateLifetimeStart(Ptr, Size);
    else
      New = IRB.CreateLifetimeEnd(Ptr, Size);

    (void)New;
    DEBUG(dbgs() << "          to: " << *New << "\n");
    return true;
  }
  bool visitPHINode(PHINode &PN) {
    DEBUG(dbgs() << "    original: " << PN << "\n");
    assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
    assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");

    // We would like to compute a new pointer in only one place, but have it be
    // as local as possible to the PHI. To do that, we re-use the location of
    // the old pointer, which necessarily must be in the right position to
    // dominate the PHI.
    IRBuilderTy PtrBuilder(IRB);
    if (isa<PHINode>(OldPtr))
      PtrBuilder.SetInsertPoint(OldPtr->getParent()->getFirstInsertionPt());
    else
      PtrBuilder.SetInsertPoint(OldPtr);
    PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());

    Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
    // Replace the operands which were using the old pointer.
    std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);

    DEBUG(dbgs() << "          to: " << PN << "\n");
    deleteIfTriviallyDead(OldPtr);

    // PHIs can't be promoted on their own, but often can be speculated. We
    // check the speculation outside of the rewriter so that we see the
    // fully-rewritten alloca.
    PHIUsers.insert(&PN);
    return true;
  }
&SI
) {
3092 DEBUG(dbgs() << " original: " << SI
<< "\n");
3093 assert((SI
.getTrueValue() == OldPtr
|| SI
.getFalseValue() == OldPtr
) &&
3094 "Pointer isn't an operand!");
3095 assert(BeginOffset
>= NewAllocaBeginOffset
&& "Selects are unsplittable");
3096 assert(EndOffset
<= NewAllocaEndOffset
&& "Selects are unsplittable");
3098 Value
*NewPtr
= getNewAllocaSlicePtr(IRB
, OldPtr
->getType());
3099 // Replace the operands which were using the old pointer.
3100 if (SI
.getOperand(1) == OldPtr
)
3101 SI
.setOperand(1, NewPtr
);
3102 if (SI
.getOperand(2) == OldPtr
)
3103 SI
.setOperand(2, NewPtr
);
3105 DEBUG(dbgs() << " to: " << SI
<< "\n");
3106 deleteIfTriviallyDead(OldPtr
);
3108 // Selects can't be promoted on their own, but often can be speculated. We
3109 // check the speculation outside of the rewriter so that we see the
3110 // fully-rewritten alloca.
3111 SelectUsers
.insert(&SI
);
/// \brief Visitor to rewrite aggregate loads and stores as scalar.
///
/// This pass aggressively rewrites all aggregate loads and stores on
/// a particular pointer (or any pointer derived from it which we can identify)
/// with scalar loads and stores.
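///
/// For example (illustrative IR, not taken verbatim from a test case), a
/// first-class aggregate load such as:
///
///   %agg = load { i32, float }* %ptr
///
/// is rewritten into per-element operations along the lines of:
///
///   %gep0 = getelementptr inbounds { i32, float }* %ptr, i32 0, i32 0
///   %elt0 = load i32* %gep0
///   %ins0 = insertvalue { i32, float } undef, i32 %elt0, 0
///   %gep1 = getelementptr inbounds { i32, float }* %ptr, i32 0, i32 1
///   %elt1 = load float* %gep1
///   %agg  = insertvalue { i32, float } %ins0, float %elt1, 1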
class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
  // Befriend the base class so it can delegate to private visit methods.
  friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;

  const DataLayout &DL;

  /// Queue of pointer uses to analyze and potentially rewrite.
  SmallVector<Use *, 8> Queue;

  /// Set to prevent us from cycling with phi nodes and loops.
  SmallPtrSet<User *, 8> Visited;

  /// The current pointer use being rewritten. This is used to dig up the used
  /// value (as opposed to the user).
  Use *U;

public:
  AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}

  /// Rewrite loads and stores through a pointer and all pointers derived from
  /// it.
  bool rewrite(Instruction &I) {
    DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
    enqueueUsers(I);
    bool Changed = false;
    while (!Queue.empty()) {
      U = Queue.pop_back_val();
      Changed |= visit(cast<Instruction>(U->getUser()));
    }
    return Changed;
  }

private:
  /// Enqueue all the users of the given instruction for further processing.
  /// This uses a set to de-duplicate users.
  void enqueueUsers(Instruction &I) {
    for (Use &U : I.uses())
      if (Visited.insert(U.getUser()).second)
        Queue.push_back(&U);
  }

  // Conservative default is to not rewrite anything.
  bool visitInstruction(Instruction &I) { return false; }
  /// \brief Generic recursive split emission class.
  template <typename Derived> class OpSplitter {
  protected:
    /// The builder used to form new instructions.
    IRBuilderTy IRB;
    /// The indices to be used with insertvalue or extractvalue to select the
    /// appropriate value within the aggregate.
    SmallVector<unsigned, 4> Indices;
    /// The indices to a GEP instruction which will move Ptr to the correct slot
    /// within the aggregate.
    SmallVector<Value *, 4> GEPIndices;
    /// The base pointer of the original op, used as a base for GEPing the
    /// split operations.
    Value *Ptr;

    /// Initialize the splitter with an insertion point, Ptr and start with a
    /// single zero GEP index.
    OpSplitter(Instruction *InsertionPoint, Value *Ptr)
        : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}

  public:
    /// \brief Generic recursive split emission routine.
    ///
    /// This method recursively splits an aggregate op (load or store) into
    /// scalar or vector ops. It splits recursively until it hits a single value
    /// and emits that single value operation via the template argument.
    ///
    /// The logic of this routine relies on GEPs and insertvalue and
    /// extractvalue all operating with the same fundamental index list, merely
    /// formatted differently (GEPs need actual values).
    ///
    /// \param Ty  The type being split recursively into smaller ops.
    /// \param Agg The aggregate value being built up or stored, depending on
    /// whether this is splitting a load or a store respectively.
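    ///
    /// For example (illustrative, assuming a hypothetical type
    /// { i32, [2 x float] }): the leaf reached with Indices = {1, 0} is the
    /// first float, and the corresponding GEPIndices are the i32 constants
    /// {0, 1, 0}, where the extra leading zero is the pointer-level index
    /// every GEP needs.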
    void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
      if (Ty->isSingleValueType())
        return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);

      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      if (StructType *STy = dyn_cast<StructType>(Ty)) {
        unsigned OldSize = Indices.size();
        (void)OldSize;
        for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
             ++Idx) {
          assert(Indices.size() == OldSize && "Did not return to the old size");
          Indices.push_back(Idx);
          GEPIndices.push_back(IRB.getInt32(Idx));
          emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
          GEPIndices.pop_back();
          Indices.pop_back();
        }
        return;
      }

      llvm_unreachable("Only arrays and structs are aggregate loadable types");
    }
  };
  struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
    LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
        : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}

    /// Emit a leaf load of a single value. This is called at the leaves of the
    /// recursive emission to actually load values.
    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Load the single value and insert it using the indices.
      Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep");
      Value *Load = IRB.CreateLoad(GEP, Name + ".load");
      Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
      DEBUG(dbgs() << "          to: " << *Load << "\n");
    }
  };
  bool visitLoadInst(LoadInst &LI) {
    assert(LI.getPointerOperand() == *U);
    if (!LI.isSimple() || LI.getType()->isSingleValueType())
      return false;

    // We have an aggregate being loaded, split it apart.
    DEBUG(dbgs() << "    original: " << LI << "\n");
    LoadOpSplitter Splitter(&LI, *U);
    Value *V = UndefValue::get(LI.getType());
    Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
    LI.replaceAllUsesWith(V);
    LI.eraseFromParent();
    return true;
  }
  struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
    StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
        : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}

    /// Emit a leaf store of a single value. This is called at the leaves of the
    /// recursive emission to actually produce stores.
    void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
      assert(Ty->isSingleValueType());
      // Extract the single value and store it using the indices.
      Value *Store = IRB.CreateStore(
          IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
          IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
      (void)Store;
      DEBUG(dbgs() << "          to: " << *Store << "\n");
    }
  };
  bool visitStoreInst(StoreInst &SI) {
    if (!SI.isSimple() || SI.getPointerOperand() != *U)
      return false;
    Value *V = SI.getValueOperand();
    if (V->getType()->isSingleValueType())
      return false;

    // We have an aggregate being stored, split it apart.
    DEBUG(dbgs() << "    original: " << SI << "\n");
    StoreOpSplitter Splitter(&SI, *U);
    Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
    SI.eraseFromParent();
    return true;
  }
  bool visitBitCastInst(BitCastInst &BC) {
    enqueueUsers(BC);
    return false;
  }

  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    enqueueUsers(GEPI);
    return false;
  }

  bool visitPHINode(PHINode &PN) {
    enqueueUsers(PN);
    return false;
  }

  bool visitSelectInst(SelectInst &SI) {
    enqueueUsers(SI);
    return false;
  }
};
/// \brief Strip aggregate type wrapping.
///
/// This removes no-op aggregate types wrapping an underlying type. It will
/// strip as many layers of types as it can without changing either the type
/// size or the allocated size.
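///
/// For example (illustrative): a single-element wrapper such as { { i64 } }
/// is stripped down to i64, while [2 x i32] is left alone because removing
/// the array layer would change the size.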
static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
  if (Ty->isSingleValueType())
    return Ty;

  uint64_t AllocSize = DL.getTypeAllocSize(Ty);
  uint64_t TypeSize = DL.getTypeSizeInBits(Ty);

  Type *InnerTy;
  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
    InnerTy = ArrTy->getElementType();
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    const StructLayout *SL = DL.getStructLayout(STy);
    unsigned Index = SL->getElementContainingOffset(0);
    InnerTy = STy->getElementType(Index);
  } else {
    return Ty;
  }

  if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
      TypeSize > DL.getTypeSizeInBits(InnerTy))
    return Ty;

  return stripAggregateTypeWrapping(DL, InnerTy);
}
/// \brief Try to find a partition of the aggregate type passed in for a given
/// offset and size.
///
/// This recurses through the aggregate type and tries to compute a subtype
/// based on the offset and size. When the offset and size span a sub-section
/// of an array, it will even compute a new array type for that sub-section,
/// and the same for structs.
///
/// Note that this routine is very strict and tries to find a partition of the
/// type which produces the *exact* right offset and size. It is not forgiving
/// when the size or offset causes either end of a type-based partition to be
/// off. Also, this is a best-effort routine. It is reasonable to give up and
/// not return a type if necessary.
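///
/// For example (illustrative): asked for offset 4 and size 4 within
/// { i32, i32, i32 }, it returns i32; asked for offset 4 and size 8 within
/// [4 x i32], it returns [2 x i32]; asked for offset 2 and size 4 within
/// { i32, i32 }, it gives up and returns nullptr because the range straddles
/// two elements.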
static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
                              uint64_t Size) {
  if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
    return stripAggregateTypeWrapping(DL, Ty);
  if (Offset > DL.getTypeAllocSize(Ty) ||
      (DL.getTypeAllocSize(Ty) - Offset) < Size)
    return nullptr;

  if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
    // We can't partition pointers...
    if (SeqTy->isPointerTy())
      return nullptr;

    Type *ElementTy = SeqTy->getElementType();
    uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
    uint64_t NumSkippedElements = Offset / ElementSize;
    if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
      if (NumSkippedElements >= ArrTy->getNumElements())
        return nullptr;
    } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) {
      if (NumSkippedElements >= VecTy->getNumElements())
        return nullptr;
    }
    Offset -= NumSkippedElements * ElementSize;

    // First check if we need to recurse.
    if (Offset > 0 || Size < ElementSize) {
      // Bail if the partition ends in a different array element.
      if ((Offset + Size) > ElementSize)
        return nullptr;
      // Recurse through the element type trying to peel off offset bytes.
      return getTypePartition(DL, ElementTy, Offset, Size);
    }
    assert(Offset == 0);

    if (Size == ElementSize)
      return stripAggregateTypeWrapping(DL, ElementTy);
    assert(Size > ElementSize);
    uint64_t NumElements = Size / ElementSize;
    if (NumElements * ElementSize != Size)
      return nullptr;
    return ArrayType::get(ElementTy, NumElements);
  }

  StructType *STy = dyn_cast<StructType>(Ty);
  if (!STy)
    return nullptr;

  const StructLayout *SL = DL.getStructLayout(STy);
  if (Offset >= SL->getSizeInBytes())
    return nullptr;
  uint64_t EndOffset = Offset + Size;
  if (EndOffset > SL->getSizeInBytes())
    return nullptr;

  unsigned Index = SL->getElementContainingOffset(Offset);
  Offset -= SL->getElementOffset(Index);

  Type *ElementTy = STy->getElementType(Index);
  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
  if (Offset >= ElementSize)
    return nullptr; // The offset points into alignment padding.

  // See if any partition must be contained by the element.
  if (Offset > 0 || Size < ElementSize) {
    if ((Offset + Size) > ElementSize)
      return nullptr;
    return getTypePartition(DL, ElementTy, Offset, Size);
  }
  assert(Offset == 0);

  if (Size == ElementSize)
    return stripAggregateTypeWrapping(DL, ElementTy);

  StructType::element_iterator EI = STy->element_begin() + Index,
                               EE = STy->element_end();
  if (EndOffset < SL->getSizeInBytes()) {
    unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
    if (Index == EndIndex)
      return nullptr; // Within a single element and its padding.

    // Don't try to form "natural" types if the elements don't line up with the
    // expected size.
    // FIXME: We could potentially recurse down through the last element in the
    // sub-struct to find a natural end point.
    if (SL->getElementOffset(EndIndex) != EndOffset)
      return nullptr;

    assert(Index < EndIndex);
    EE = STy->element_begin() + EndIndex;
  }

  // Try to build up a sub-structure.
  StructType *SubTy =
      StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
  const StructLayout *SubSL = DL.getStructLayout(SubTy);
  if (Size != SubSL->getSizeInBytes())
    return nullptr; // The sub-struct doesn't have quite the size needed.

  return SubTy;
}
/// \brief Pre-split loads and stores to simplify rewriting.
///
/// We want to break up the splittable load+store pairs as much as
/// possible. This is important to do as a preprocessing step, as once we
/// start rewriting the accesses to partitions of the alloca we lose the
/// necessary information to correctly split apart paired loads and stores
/// which both point into this alloca. The case to consider is something like
/// the following:
///
///   %a = alloca [12 x i8]
///   %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
///   %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
///   %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
///   %iptr1 = bitcast i8* %gep1 to i64*
///   %iptr2 = bitcast i8* %gep2 to i64*
///   %fptr1 = bitcast i8* %gep1 to float*
///   %fptr2 = bitcast i8* %gep2 to float*
///   %fptr3 = bitcast i8* %gep3 to float*
///   store float 0.0, float* %fptr1
///   store float 1.0, float* %fptr2
///   %v = load i64* %iptr1
///   store i64 %v, i64* %iptr2
///   %f1 = load float* %fptr2
///   %f2 = load float* %fptr3
///
/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
/// promote everything so we recover the 2 SSA values that should have been
/// there all along.
///
/// \returns true if any changes are made.
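///
/// Concretely (illustrative, not verbatim output), pre-splitting turns the
/// i64 load/store pair above into two i32 loads and two i32 stores at
/// relative offsets 0 and 4, so that every access is confined to a single
/// 4-byte partition and each partition can then be promoted independently.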
bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
  DEBUG(dbgs() << "Pre-splitting loads and stores\n");

  // Track the loads and stores which are candidates for pre-splitting here, in
  // the order they first appear during the partition scan. These give stable
  // iteration order and a basis for tracking which loads and stores we
  // actually split.
  SmallVector<LoadInst *, 4> Loads;
  SmallVector<StoreInst *, 4> Stores;

  // We need to accumulate the splits required of each load or store where we
  // can find them via a direct lookup. This is important to cross-check loads
  // and stores against each other. We also track the slice so that we can kill
  // all the slices that end up split.
  struct SplitOffsets {
    Slice *S;
    std::vector<uint64_t> Splits;
  };
  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;

  // Track loads out of this alloca which cannot, for any reason, be pre-split.
  // This is important as we also cannot pre-split stores of those loads!
  // FIXME: This is all pretty gross. It means that we can be more aggressive
  // in pre-splitting when the load feeding the store happens to come from
  // a separate alloca. Put another way, the effectiveness of SROA would be
  // decreased by a frontend which just concatenated all of its local allocas
  // into one big flat alloca. But defeating such patterns is exactly the job
  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
  // change store pre-splitting to actually force pre-splitting of the load
  // that feeds it *and all stores*. That makes pre-splitting much harder, but
  // maybe it would make it more principled?
  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
  DEBUG(dbgs() << "  Searching for candidate loads and stores\n");
  for (auto &P : AS.partitions()) {
    for (Slice &S : P) {
      Instruction *I = cast<Instruction>(S.getUse()->getUser());
      if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
        // If this was a load we have to track that it can't participate in any
        // pre-splitting!
        if (auto *LI = dyn_cast<LoadInst>(I))
          UnsplittableLoads.insert(LI);
        continue;
      }
      assert(P.endOffset() > S.beginOffset() &&
             "Empty or backwards partition!");

      // Determine if this is a pre-splittable slice.
      if (auto *LI = dyn_cast<LoadInst>(I)) {
        assert(!LI->isVolatile() && "Cannot split volatile loads!");

        // The load must be used exclusively to store into other pointers for
        // us to be able to arbitrarily pre-split it. The stores must also be
        // simple to avoid changing semantics.
        auto IsLoadSimplyStored = [](LoadInst *LI) {
          for (User *LU : LI->users()) {
            auto *SI = dyn_cast<StoreInst>(LU);
            if (!SI || !SI->isSimple())
              return false;
          }
          return true;
        };
        if (!IsLoadSimplyStored(LI)) {
          UnsplittableLoads.insert(LI);
          continue;
        }

        Loads.push_back(LI);
      } else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) {
        if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
          continue;
        auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
        if (!StoredLoad || !StoredLoad->isSimple())
          continue;
        assert(!SI->isVolatile() && "Cannot split volatile stores!");

        Stores.push_back(SI);
      } else {
        // Other uses cannot be pre-split.
        continue;
      }

      // Record the initial split.
      DEBUG(dbgs() << "    Candidate: " << *I << "\n");
      auto &Offsets = SplitOffsetsMap[I];
      assert(Offsets.Splits.empty() &&
             "Should not have splits the first time we see an instruction!");
      Offsets.S = &S;
      Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
    }
    // Now scan the already split slices, and add a split for any of them which
    // we're going to pre-split.
    for (Slice *S : P.splitSliceTails()) {
      auto SplitOffsetsMapI =
          SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
      if (SplitOffsetsMapI == SplitOffsetsMap.end())
        continue;
      auto &Offsets = SplitOffsetsMapI->second;

      assert(Offsets.S == S && "Found a mismatched slice!");
      assert(!Offsets.Splits.empty() &&
             "Cannot have an empty set of splits on the second partition!");
      assert(Offsets.Splits.back() ==
                 P.beginOffset() - Offsets.S->beginOffset() &&
             "Previous split does not end where this one begins!");

      // Record each split. The last partition's end isn't needed as the size
      // of the slice dictates that.
      if (S->endOffset() > P.endOffset())
        Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
    }
  }
  // We may have split loads where some of their stores are split stores. For
  // such loads and stores, we can only pre-split them if their splits exactly
  // match relative to their starting offset. We have to verify this prior to
  // any rewriting.
  Stores.erase(
      std::remove_if(Stores.begin(), Stores.end(),
                     [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
                       // Lookup the load we are storing in our map of split
                       // offsets.
                       auto *LI = cast<LoadInst>(SI->getValueOperand());
                       // If it was completely unsplittable, then we're done,
                       // and this store can't be pre-split.
                       if (UnsplittableLoads.count(LI))
                         return true;

                       auto LoadOffsetsI = SplitOffsetsMap.find(LI);
                       if (LoadOffsetsI == SplitOffsetsMap.end())
                         return false; // Unrelated loads are definitely safe.
                       auto &LoadOffsets = LoadOffsetsI->second;

                       // Now lookup the store's offsets.
                       auto &StoreOffsets = SplitOffsetsMap[SI];

                       // If the relative offsets of each split in the load and
                       // store match exactly, then we can split them and we
                       // don't need to remove them here.
                       if (LoadOffsets.Splits == StoreOffsets.Splits)
                         return false;

                       DEBUG(dbgs()
                             << "    Mismatched splits for load and store:\n"
                             << "      " << *LI << "\n"
                             << "      " << *SI << "\n");

                       // We've found a store and load that we need to split
                       // with mismatched relative splits. Just give up on them
                       // and remove both instructions from our list of
                       // candidates.
                       UnsplittableLoads.insert(LI);
                       return true;
                     }),
      Stores.end());
  // Now we have to go *back* through all the stores, because a later store may
  // have caused an earlier store's load to become unsplittable and if it is
  // unsplittable for the later store, then we can't rely on it being split in
  // the earlier store either.
  Stores.erase(std::remove_if(Stores.begin(), Stores.end(),
                              [&UnsplittableLoads](StoreInst *SI) {
                                auto *LI =
                                    cast<LoadInst>(SI->getValueOperand());
                                return UnsplittableLoads.count(LI);
                              }),
               Stores.end());
  // Once we've established all the loads that can't be split for some reason,
  // filter any that made it into our list out.
  Loads.erase(std::remove_if(Loads.begin(), Loads.end(),
                             [&UnsplittableLoads](LoadInst *LI) {
                               return UnsplittableLoads.count(LI);
                             }),
              Loads.end());

  // If no loads or stores are left, there is no pre-splitting to be done for
  // this alloca.
  if (Loads.empty() && Stores.empty())
    return false;
  // From here on, we can't fail and will be building new accesses, so rig up
  // an IR builder.
  IRBuilderTy IRB(&AI);

  // Collect the new slices which we will merge into the alloca slices.
  SmallVector<Slice, 4> NewSlices;

  // Track any allocas we end up splitting loads and stores for so we iterate
  // on them.
  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;

  // At this point, we have collected all of the loads and stores we can
  // pre-split, and the specific splits needed for them. We actually do the
  // splitting in a specific order in order to handle the case when one of the
  // loads is the value operand of one of the stores.
  //
  // First, we rewrite all of the split loads, and just accumulate each split
  // load in a parallel structure. We also build the slices for them and append
  // them to the alloca slices.
  SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
  std::vector<LoadInst *> SplitLoads;
  for (LoadInst *LI : Loads) {
    SplitLoads.clear();

    IntegerType *Ty = cast<IntegerType>(LI->getType());
    uint64_t LoadSize = Ty->getBitWidth() / 8;
    assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");

    auto &Offsets = SplitOffsetsMap[LI];
    assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
           "Slice size should always match load size exactly!");
    uint64_t BaseOffset = Offsets.S->beginOffset();
    assert(BaseOffset + LoadSize > BaseOffset &&
           "Cannot represent alloca access size using 64-bit integers!");

    Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
    IRB.SetInsertPoint(BasicBlock::iterator(LI));

    DEBUG(dbgs() << "  Splitting load: " << *LI << "\n");

    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
    int Idx = 0, Size = Offsets.Splits.size();
    for (;;) {
      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
      auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
      LoadInst *PLoad = IRB.CreateAlignedLoad(
          getAdjustedPtr(IRB, *DL, BasePtr,
                         APInt(DL->getPointerSizeInBits(), PartOffset),
                         PartPtrTy, BasePtr->getName() + "."),
          getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
          LI->getName());

      // Append this load onto the list of split loads so we can find it later
      // to rewrite the stores.
      SplitLoads.push_back(PLoad);

      // Now build a new slice for the alloca.
      NewSlices.push_back(
          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
                /*IsSplittable*/ false));
      DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                   << ", " << NewSlices.back().endOffset() << "): " << *PLoad
                   << "\n");

      // See if we've handled all the splits.
      if (Idx >= Size)
        break;

      // Setup the next partition.
      PartOffset = Offsets.Splits[Idx];
      ++Idx;
      PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
    }
    // Now that we have the split loads, do the slow walk over all uses of the
    // load and rewrite them as split stores, or save the split loads to use
    // below if the store is going to be split there anyways.
    bool DeferredStores = false;
    for (User *LU : LI->users()) {
      StoreInst *SI = cast<StoreInst>(LU);
      if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
        DeferredStores = true;
        DEBUG(dbgs() << "    Deferred splitting of store: " << *SI << "\n");
        continue;
      }

      Value *StoreBasePtr = SI->getPointerOperand();
      IRB.SetInsertPoint(BasicBlock::iterator(SI));

      DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n");

      for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
        LoadInst *PLoad = SplitLoads[Idx];
        uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
        auto *PartPtrTy =
            PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());

        StoreInst *PStore = IRB.CreateAlignedStore(
            PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
                                  APInt(DL->getPointerSizeInBits(), PartOffset),
                                  PartPtrTy, StoreBasePtr->getName() + "."),
            getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);
        (void)PStore;
        DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
      }

      // We want to immediately iterate on any allocas impacted by splitting
      // this store, and we have to track any promotable alloca (indicated by
      // a direct store) as needing to be resplit because it is no longer
      // promotable.
      if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
        ResplitPromotableAllocas.insert(OtherAI);
        Worklist.insert(OtherAI);
      } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
                     StoreBasePtr->stripInBoundsOffsets())) {
        Worklist.insert(OtherAI);
      }

      // Mark the original store as dead.
      DeadInsts.insert(SI);
    }

    // Save the split loads if there are deferred stores among the users.
    if (DeferredStores)
      SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));

    // Mark the original load as dead and kill the original slice.
    DeadInsts.insert(LI);
    Offsets.S->kill();
  }
  // Second, we rewrite all of the split stores. At this point, we know that
  // all loads from this alloca have been split already. For stores of such
  // loads, we can simply look up the pre-existing split loads. For stores of
  // other loads, we split those loads first and then write split stores of
  // them.
  for (StoreInst *SI : Stores) {
    auto *LI = cast<LoadInst>(SI->getValueOperand());
    IntegerType *Ty = cast<IntegerType>(LI->getType());
    uint64_t StoreSize = Ty->getBitWidth() / 8;
    assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");

    auto &Offsets = SplitOffsetsMap[SI];
    assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
           "Slice size should always match load size exactly!");
    uint64_t BaseOffset = Offsets.S->beginOffset();
    assert(BaseOffset + StoreSize > BaseOffset &&
           "Cannot represent alloca access size using 64-bit integers!");

    Value *LoadBasePtr = LI->getPointerOperand();
    Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());

    DEBUG(dbgs() << "  Splitting store: " << *SI << "\n");

    // Check whether we have an already split load.
    auto SplitLoadsMapI = SplitLoadsMap.find(LI);
    std::vector<LoadInst *> *SplitLoads = nullptr;
    if (SplitLoadsMapI != SplitLoadsMap.end()) {
      SplitLoads = &SplitLoadsMapI->second;
      assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
             "Too few split loads for the number of splits in the store!");
    } else {
      DEBUG(dbgs() << "          of load: " << *LI << "\n");
    }

    uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
    int Idx = 0, Size = Offsets.Splits.size();
    for (;;) {
      auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
      auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
      // Either lookup a split load or create one.
      LoadInst *PLoad;
      if (SplitLoads) {
        PLoad = (*SplitLoads)[Idx];
      } else {
        IRB.SetInsertPoint(BasicBlock::iterator(LI));
        PLoad = IRB.CreateAlignedLoad(
            getAdjustedPtr(IRB, *DL, LoadBasePtr,
                           APInt(DL->getPointerSizeInBits(), PartOffset),
                           PartPtrTy, LoadBasePtr->getName() + "."),
            getAdjustedAlignment(LI, PartOffset, *DL), /*IsVolatile*/ false,
            LI->getName());
      }

      // And store this partition.
      IRB.SetInsertPoint(BasicBlock::iterator(SI));
      StoreInst *PStore = IRB.CreateAlignedStore(
          PLoad, getAdjustedPtr(IRB, *DL, StoreBasePtr,
                                APInt(DL->getPointerSizeInBits(), PartOffset),
                                PartPtrTy, StoreBasePtr->getName() + "."),
          getAdjustedAlignment(SI, PartOffset, *DL), /*IsVolatile*/ false);

      // Now build a new slice for the alloca.
      NewSlices.push_back(
          Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
                &PStore->getOperandUse(PStore->getPointerOperandIndex()),
                /*IsSplittable*/ false));
      DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
                   << ", " << NewSlices.back().endOffset() << "): " << *PStore
                   << "\n");
      if (!SplitLoads)
        DEBUG(dbgs() << "      of split load: " << *PLoad << "\n");

      // See if we've finished all the splits.
      if (Idx >= Size)
        break;

      // Setup the next partition.
      PartOffset = Offsets.Splits[Idx];
      ++Idx;
      PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
    }
3897 // We want to immediately iterate on any allocas impacted by splitting
3898 // this load, which is only relevant if it isn't a load of this alloca and
3899 // thus we didn't already split the loads above. We also have to keep track
3900 // of any promotable allocas we split loads on as they can no longer be
3903 if (AllocaInst
*OtherAI
= dyn_cast
<AllocaInst
>(LoadBasePtr
)) {
3904 assert(OtherAI
!= &AI
&& "We can't re-split our own alloca!");
3905 ResplitPromotableAllocas
.insert(OtherAI
);
3906 Worklist
.insert(OtherAI
);
3907 } else if (AllocaInst
*OtherAI
= dyn_cast
<AllocaInst
>(
3908 LoadBasePtr
->stripInBoundsOffsets())) {
3909 assert(OtherAI
!= &AI
&& "We can't re-split our own alloca!");
3910 Worklist
.insert(OtherAI
);
3914 // Mark the original store as dead now that we've split it up and kill its
3915 // slice. Note that we leave the original load in place unless this store
3916 // was its ownly use. It may in turn be split up if it is an alloca load
3917 // for some other alloca, but it may be a normal load. This may introduce
3918 // redundant loads, but where those can be merged the rest of the optimizer
3919 // should handle the merging, and this uncovers SSA splits which is more
3920 // important. In practice, the original loads will almost always be fully
3921 // split and removed eventually, and the splits will be merged by any
3922 // trivial CSE, including instcombine.
3923 if (LI
->hasOneUse()) {
3924 assert(*LI
->user_begin() == SI
&& "Single use isn't this store!");
3925 DeadInsts
.insert(LI
);
3927 DeadInsts
.insert(SI
);
  // Remove the killed slices that have been pre-split.
  AS.erase(std::remove_if(AS.begin(), AS.end(),
                          [](const Slice &S) { return S.isDead(); }),
           AS.end());

  // Insert our new slices. This will sort and merge them into the sorted
  // sequence.
  AS.insert(NewSlices);

  DEBUG(dbgs() << "  Pre-split slices:\n");
  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
    DEBUG(AS.print(dbgs(), I, "    "));

  // Finally, don't try to promote any allocas that now require re-splitting.
  // They have already been added to the worklist above.
  PromotableAllocas.erase(
      std::remove_if(
          PromotableAllocas.begin(), PromotableAllocas.end(),
          [&](AllocaInst *AI) { return ResplitPromotableAllocas.count(AI); }),
      PromotableAllocas.end());

  return true;
}
/// \brief Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive for SSA value
/// promotion. If the partition needs a new, more refined alloca, this will
/// build that new alloca, preserving as much type information as possible, and
/// rewrite the uses of the old alloca to point at the new one and have the
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion and if it was successful queues the alloca to be
/// promoted.
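///
/// For example (illustrative): a 4-byte partition whose uses all load and
/// store a float is given a float slice type; when no such common type
/// exists, the rewriter tries a matching sub-type of the original alloca,
/// then a legal integer of the partition's width (e.g. i32), and finally an
/// i8 array of the partition's size.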
bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                            AllocaSlices::Partition &P) {
  // Try to compute a friendly type for this partition of the alloca. This
  // won't always succeed, in which case we fall back to a legal integer type
  // or an i8 array of an appropriate size.
  Type *SliceTy = nullptr;
  if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
    if (DL->getTypeAllocSize(CommonUseTy) >= P.size())
      SliceTy = CommonUseTy;
  if (!SliceTy)
    if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(),
                                                 P.beginOffset(), P.size()))
      SliceTy = TypePartitionTy;
  if ((!SliceTy || (SliceTy->isArrayTy() &&
                    SliceTy->getArrayElementType()->isIntegerTy())) &&
      DL->isLegalInteger(P.size() * 8))
    SliceTy = Type::getIntNTy(*C, P.size() * 8);
  if (!SliceTy)
    SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
  assert(DL->getTypeAllocSize(SliceTy) >= P.size());
  bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, *DL);

  VectorType *VecTy =
      IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, *DL);
  if (VecTy)
    SliceTy = VecTy;

  // Check for the case where we're going to rewrite to a new alloca of the
  // exact same type as the original, and with the same access offsets. In that
  // case, re-use the existing alloca, but still run through the rewriter to
  // perform phi and select speculation.
  AllocaInst *NewAI;
  if (SliceTy == AI.getAllocatedType()) {
    assert(P.beginOffset() == 0 &&
           "Non-zero begin offset but same alloca type");
    NewAI = &AI;
    // FIXME: We should be able to bail at this point with "nothing changed".
    // FIXME: We might want to defer PHI speculation until after here.
  } else {
    unsigned Alignment = AI.getAlignment();
    if (!Alignment) {
      // The minimum alignment which users can rely on when the explicit
      // alignment is omitted or zero is that required by the ABI for this
      // type.
      Alignment = DL->getABITypeAlignment(AI.getAllocatedType());
    }
    Alignment = MinAlign(Alignment, P.beginOffset());
    // If we will get at least this much alignment from the type alone, leave
    // the alloca's alignment unconstrained.
    if (Alignment <= DL->getABITypeAlignment(SliceTy))
      Alignment = 0;
    NewAI = new AllocaInst(
        SliceTy, nullptr, Alignment,
        AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
    ++NumNewAllocas;
  }

  DEBUG(dbgs() << "Rewriting alloca partition "
               << "[" << P.beginOffset() << "," << P.endOffset()
               << ") to: " << *NewAI << "\n");
  // Track the high watermark on the worklist as it is only relevant for
  // promoted allocas. We will reset it to this point if the alloca is not in
  // fact scheduled for promotion.
  unsigned PPWOldSize = PostPromotionWorklist.size();
  unsigned NumUses = 0;
  SmallPtrSet<PHINode *, 8> PHIUsers;
  SmallPtrSet<SelectInst *, 8> SelectUsers;

  AllocaSliceRewriter Rewriter(*DL, AS, *this, AI, *NewAI, P.beginOffset(),
                               P.endOffset(), IsIntegerPromotable, VecTy,
                               PHIUsers, SelectUsers);
  bool Promotable = true;
  for (Slice *S : P.splitSliceTails()) {
    Promotable &= Rewriter.visit(S);
    ++NumUses;
  }
  for (Slice &S : P) {
    Promotable &= Rewriter.visit(&S);
    ++NumUses;
  }

  NumAllocaPartitionUses += NumUses;
  MaxUsesPerAllocaPartition =
      std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
  // Now that we've processed all the slices in the new partition, check if any
  // PHIs or Selects would block promotion.
  for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
                                            E = PHIUsers.end();
       I != E; ++I)
    if (!isSafePHIToSpeculate(**I, DL)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      break;
    }
  for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
                                               E = SelectUsers.end();
       I != E; ++I)
    if (!isSafeSelectToSpeculate(**I, DL)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      break;
    }

  if (Promotable) {
    if (PHIUsers.empty() && SelectUsers.empty()) {
      // Promote the alloca.
      PromotableAllocas.push_back(NewAI);
    } else {
      // If we have either PHIs or Selects to speculate, add them to those
      // worklists and re-queue the new alloca so that we promote it on the
      // next iteration of the pipeline.
      for (PHINode *PHIUser : PHIUsers)
        SpeculatablePHIs.insert(PHIUser);
      for (SelectInst *SelectUser : SelectUsers)
        SpeculatableSelects.insert(SelectUser);
      Worklist.insert(NewAI);
    }
  } else {
    // If we can't promote the alloca, iterate on it to check for new
    // refinements exposed by splitting the current alloca. Don't iterate on an
    // alloca which didn't actually change and didn't get promoted.
    if (NewAI != &AI)
      Worklist.insert(NewAI);

    // Drop any post-promotion work items if promotion didn't happen.
    while (PostPromotionWorklist.size() > PPWOldSize)
      PostPromotionWorklist.pop_back();
  }

  return true;
}
/// \brief Walks the slices of an alloca and forms partitions based on them,
/// rewriting each of their uses.
bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
  if (AS.begin() == AS.end())
    return false;

  unsigned NumPartitions = 0;
  bool Changed = false;

  // First try to pre-split loads and stores.
  Changed |= presplitLoadsAndStores(AI, AS);

  // Now that we have identified any pre-splitting opportunities, mark any
  // splittable (non-whole-alloca) loads and stores as unsplittable. If we fail
  // to split these during pre-splitting, we want to force them to be
  // rewritten into a partition.
  bool IsSorted = true;
  for (Slice &S : AS) {
    if (!S.isSplittable())
      continue;

    // FIXME: We currently leave whole-alloca splittable loads and stores. This
    // used to be the only splittable loads and stores and we need to be
    // confident that the above handling of splittable loads and stores is
    // completely sufficient before we forcibly disable the remaining handling.
    if (S.beginOffset() == 0 &&
        S.endOffset() >= DL->getTypeAllocSize(AI.getAllocatedType()))
      continue;

    if (isa<LoadInst>(S.getUse()->getUser()) ||
        isa<StoreInst>(S.getUse()->getUser())) {
      S.makeUnsplittable();
      IsSorted = false;
    }
  }
  if (!IsSorted)
    std::sort(AS.begin(), AS.end());

  // Rewrite each partition.
  for (auto &P : AS.partitions()) {
    Changed |= rewritePartition(AI, AS, P);
    ++NumPartitions;
  }

  NumAllocaPartitions += NumPartitions;
  MaxPartitionsPerAlloca =
      std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);

  return Changed;
}
/// \brief Clobber a use with undef, deleting the used value if it becomes dead.
void SROA::clobberUse(Use &U) {
  Value *OldV = U;
  // Replace the use with an undef value.
  U = UndefValue::get(OldV->getType());

  // Check for this making an instruction dead. We have to garbage collect
  // all the dead instructions to ensure the uses of any alloca end up being
  // minimal.
  if (Instruction *OldI = dyn_cast<Instruction>(OldV))
    if (isInstructionTriviallyDead(OldI)) {
      DeadInsts.insert(OldI);
    }
}
/// \brief Analyze an alloca for SROA.
///
/// This analyzes the alloca to ensure we can reason about it, builds
/// the slices of the alloca, and then hands it off to be split and
/// rewritten as needed.
bool SROA::runOnAlloca(AllocaInst &AI) {
  DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
  ++NumAllocasAnalyzed;

  // Special case dead allocas, as they're trivial.
  if (AI.use_empty()) {
    AI.eraseFromParent();
    return true;
  }

  // Skip alloca forms that this analysis can't handle.
  if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
      DL->getTypeAllocSize(AI.getAllocatedType()) == 0)
    return false;

  bool Changed = false;

  // First, split any FCA loads and stores touching this alloca to promote
  // better splitting and promotion opportunities.
  AggLoadStoreRewriter AggRewriter(*DL);
  Changed |= AggRewriter.rewrite(AI);

  // Build the slices using a recursive instruction-visiting builder.
  AllocaSlices AS(*DL, AI);
  DEBUG(AS.print(dbgs()));
  if (AS.isEscaped())
    return Changed;

  // Delete all the dead users of this alloca before splitting and rewriting it.
  for (Instruction *DeadUser : AS.getDeadUsers()) {
    // Free up everything used by this instruction.
    for (Use &DeadOp : DeadUser->operands())
      clobberUse(DeadOp);

    // Now replace the uses of this instruction.
    DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));

    // And mark it for deletion.
    DeadInsts.insert(DeadUser);
    Changed = true;
  }
  for (Use *DeadOp : AS.getDeadOperands()) {
    clobberUse(*DeadOp);
    Changed = true;
  }

  // No slices to split. Leave the dead alloca for a later pass to clean up.
  if (AS.begin() == AS.end())
    return Changed;

  Changed |= splitAlloca(AI, AS);

  DEBUG(dbgs() << "  Speculating PHIs\n");
  while (!SpeculatablePHIs.empty())
    speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());

  DEBUG(dbgs() << "  Speculating Selects\n");
  while (!SpeculatableSelects.empty())
    speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());

  return Changed;
}
/// \brief Delete the dead instructions accumulated in this run.
///
/// Recursively deletes the dead instructions we've accumulated. This is done
/// at the very end to maximize locality of the recursive delete and to
/// minimize the problems of invalidated instruction pointers as such pointers
/// are used heavily in the intermediate stages of the algorithm.
///
/// We also record the alloca instructions deleted here so that they aren't
/// subsequently handed to mem2reg to promote.
void SROA::deleteDeadInstructions(
    SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
  while (!DeadInsts.empty()) {
    Instruction *I = DeadInsts.pop_back_val();
    DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");

    I->replaceAllUsesWith(UndefValue::get(I->getType()));

    for (Use &Operand : I->operands())
      if (Instruction *U = dyn_cast<Instruction>(Operand)) {
        // Zero out the operand and see if it becomes trivially dead.
        Operand = nullptr;
        if (isInstructionTriviallyDead(U))
          DeadInsts.insert(U);
      }

    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      DeletedAllocas.insert(AI);

    ++NumDeleted;
    I->eraseFromParent();
  }
}
static void enqueueUsersInWorklist(Instruction &I,
                                   SmallVectorImpl<Instruction *> &Worklist,
                                   SmallPtrSetImpl<Instruction *> &Visited) {
  for (User *U : I.users())
    if (Visited.insert(cast<Instruction>(U)).second)
      Worklist.push_back(cast<Instruction>(U));
}
/// \brief Promote the allocas, using the best available technique.
///
/// This attempts to promote whatever allocas have been identified as viable in
/// the PromotableAllocas list. If that list is empty, there is nothing to do.
/// If there is a domtree available, we attempt to promote using the full power
/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
/// based on the SSAUpdater utilities. This function returns whether any
/// promotion occurred.
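///
/// For example (illustrative IR): promoting
///
///   %x = alloca i32
///   store i32 %v, i32* %x
///   %y = load i32* %x
///
/// deletes the alloca, the store, and the load, and rewrites every use of %y
/// to use %v directly, inserting PHI nodes where control flow requires them.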
bool SROA::promoteAllocas(Function &F) {
  if (PromotableAllocas.empty())
    return false;

  NumPromoted += PromotableAllocas.size();

  if (DT && !ForceSSAUpdater) {
    DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
    PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
    PromotableAllocas.clear();
    return true;
  }

  DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
  SSAUpdater SSA;
  DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
  SmallVector<Instruction *, 64> Insts;

  // We need a worklist to walk the uses of each alloca.
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  SmallVector<Instruction *, 32> DeadInsts;

  for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
    AllocaInst *AI = PromotableAllocas[Idx];
    Insts.clear();
    Worklist.clear();
    Visited.clear();

    enqueueUsersInWorklist(*AI, Worklist, Visited);

    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();

      // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
      // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
      // leading to them) here. Eventually it should use them to optimize the
      // scalar values produced.
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
        assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
               II->getIntrinsicID() == Intrinsic::lifetime_end);
        II->eraseFromParent();
        continue;
      }

      // Push the loads and stores we find onto the list. SROA will already
      // have validated that all loads and stores are viable candidates for
      // promotion.
      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        assert(LI->getType() == AI->getAllocatedType());
        Insts.push_back(LI);
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        assert(SI->getValueOperand()->getType() == AI->getAllocatedType());
        Insts.push_back(SI);
        continue;
      }

      // For everything else, we know that only no-op bitcasts and GEPs will
      // make it this far, just recurse through them and recall them for later
      // removal.
      DeadInsts.push_back(I);
      enqueueUsersInWorklist(*I, Worklist, Visited);
    }
    AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
    while (!DeadInsts.empty())
      DeadInsts.pop_back_val()->eraseFromParent();
    AI->eraseFromParent();
  }

  PromotableAllocas.clear();
  return true;
}
bool SROA::runOnFunction(Function &F) {
  if (skipOptnoneFunction(F))
    return false;

  DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
  C = &F.getContext();
  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
  if (!DLP) {
    DEBUG(dbgs() << "  Skipping SROA -- no target data!\n");
    return false;
  }
  DL = &DLP->getDataLayout();
  DominatorTreeWrapperPass *DTWP =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTWP ? &DTWP->getDomTree() : nullptr;
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  BasicBlock &EntryBB = F.getEntryBlock();
  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
       I != E; ++I)
    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
      Worklist.insert(AI);

  bool Changed = false;
  // A set of deleted alloca instruction pointers which should be removed from
  // the list of promotable allocas.
  SmallPtrSet<AllocaInst *, 4> DeletedAllocas;

  do {
    while (!Worklist.empty()) {
      Changed |= runOnAlloca(*Worklist.pop_back_val());
      deleteDeadInstructions(DeletedAllocas);

      // Remove the deleted allocas from various lists so that we don't try to
      // continue processing them.
      if (!DeletedAllocas.empty()) {
        auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
        Worklist.remove_if(IsInSet);
        PostPromotionWorklist.remove_if(IsInSet);
        PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
                                               PromotableAllocas.end(),
                                               IsInSet),
                                PromotableAllocas.end());
        DeletedAllocas.clear();
      }
    }

    Changed |= promoteAllocas(F);

    Worklist = PostPromotionWorklist;
    PostPromotionWorklist.clear();
  } while (!Worklist.empty());

  return Changed;
}
void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AssumptionCacheTracker>();
  if (RequiresDomTree)
    AU.addRequired<DominatorTreeWrapperPass>();
  AU.setPreservesCFG();