From 4e063ed79073db778e7ddbbadd53b21798b2de25 Mon Sep 17 00:00:00 2001 From: bcheng0127 Date: Tue, 2 Sep 2025 18:09:21 +0000 Subject: [PATCH 1/5] Changes in code. (cherry picked from commit 6ee64d4cbb20915f4f86a4d8ad3ef479d0138c42) --- .../PromoteToPredicatedMemoryAccess/LoadSubDW.cl | 2 ++ visa/LocalScheduler/Dependencies_G4IR.cpp | 9 --------- visa/LocalScheduler/Dependencies_G4IR.h | 2 -- visa/LocalScheduler/LocalScheduler_G4IR.cpp | 14 +------------- 4 files changed, 3 insertions(+), 24 deletions(-) diff --git a/IGC/ocloc_tests/optimizations/PromoteToPredicatedMemoryAccess/LoadSubDW.cl b/IGC/ocloc_tests/optimizations/PromoteToPredicatedMemoryAccess/LoadSubDW.cl index c2ede424bc56..805d278a40c3 100644 --- a/IGC/ocloc_tests/optimizations/PromoteToPredicatedMemoryAccess/LoadSubDW.cl +++ b/IGC/ocloc_tests/optimizations/PromoteToPredicatedMemoryAccess/LoadSubDW.cl @@ -13,6 +13,8 @@ SPDX-License-Identifier: MIT // RUN: ocloc compile -file %s -device pvc -options "-igc_opts 'EnablePromoteToPredicatedMemoryAccess=1 VISAOptions=-asmToConsole'" 2>&1 | FileCheck %s --check-prefixes=CHECK-ASM // CHECK-ASM: kernel test_i8_0 +// CHECK-ASM: cmp (32|M0) (lt)[[F1:f[0-9\.]+]] +// CHECK-ASM: ([[F1]]) goto (32|M0) // CHECK-ASM: {{[_a-z0-9A-Z]+}}: // CHECK-ASM-DAG: cmp (32|M0) (le)[[F2:f[0-9\.]+]] null<1>:d r{{[0-9\.]+}} // CHECK-ASM-DAG: cmp (32|M0) (ge)[[F3:f[0-9\.]+]] null<1>:d r{{[0-9\.]+}} diff --git a/visa/LocalScheduler/Dependencies_G4IR.cpp b/visa/LocalScheduler/Dependencies_G4IR.cpp index 9cc823f884ad..3f2181802d66 100644 --- a/visa/LocalScheduler/Dependencies_G4IR.cpp +++ b/visa/LocalScheduler/Dependencies_G4IR.cpp @@ -189,15 +189,6 @@ DepType vISA::getDepScratchSend(G4_INST *curInst, G4_INST *liveInst) { return NODEP; } -bool vISA::isNotLatencyBarrier (DepType type) { - if (type == CONTROL_FLOW_BARRIER || - type == OPT_BARRIER || - type == SEND_BARRIER || - type == MSG_BARRIER) - return true; - return false; -} - DepType vISA::CheckBarrier(G4_INST *inst) { if 
(inst->isOptBarrier() || inst->isAtomicInst() || inst->opcode() == G4_madm) { return OPT_BARRIER; diff --git a/visa/LocalScheduler/Dependencies_G4IR.h b/visa/LocalScheduler/Dependencies_G4IR.h index eec65ba3112a..5e6587ceaee5 100644 --- a/visa/LocalScheduler/Dependencies_G4IR.h +++ b/visa/LocalScheduler/Dependencies_G4IR.h @@ -38,8 +38,6 @@ DepType getDepSend(G4_INST *curInst, G4_INST *liveInst, bool BTIIsRestrict); DepType getDepScratchSend(G4_INST *curInst, G4_INST *liveInst); -bool isNotLatencyBarrier(DepType type); - DepType CheckBarrier(G4_INST *inst); DepType getDepForOpnd(Gen4_Operand_Number cur, Gen4_Operand_Number liv); diff --git a/visa/LocalScheduler/LocalScheduler_G4IR.cpp b/visa/LocalScheduler/LocalScheduler_G4IR.cpp index 7af545b7403b..ccb766c6ec7d 100644 --- a/visa/LocalScheduler/LocalScheduler_G4IR.cpp +++ b/visa/LocalScheduler/LocalScheduler_G4IR.cpp @@ -1887,21 +1887,9 @@ void DDD::collectRoots() { void DDD::setPriority(Node *pred, const Edge &edge) { // Calculate PRED's priority (pred->priority), based on SUCC's priority Node *succ = edge.getNode(); - DepType type = edge.getType(); vISA_ASSERT(succ->priority != Node::PRIORITY_UNINIT, "succ node has no priority?"); - int newPriority = succ->priority; - // Note that, node->isBarrier cannot be used here. Because there may be - // non-barrier dep. - bool isSend = false; - if (!pred->getInstructions()->empty()) { - isSend = pred->getInstructions()->front()->isSend(); - } - if (isNotLatencyBarrier(type) && - !isSend) // send has long latency, it's better be scheduled to the front. - newPriority += pred->getOccupancy(); - else - newPriority += edge.getLatency(); + int newPriority = succ->priority + edge.getLatency(); pred->priority = (newPriority > pred->priority) ? 
newPriority : pred->priority; } From 5fff125e4873fcf5b5d40442e5837c35e6d97388 Mon Sep 17 00:00:00 2001 From: "Dmitrichenko, Aleksei" Date: Mon, 18 Aug 2025 13:04:32 +0000 Subject: [PATCH 2/5] Enable CodeScheduling Improve CodeScheduling and enable it by default - Support handling of the remated instructions - Various heuristics added to handle situations with small (splitted) loads - Heuristic to populate the same vector added --- IGC/Compiler/CISACodeGen/CodeScheduling.cpp | 315 +++++++++++++++--- .../CISACodeGen/CodeSchedulingOptionsDef.h | 18 + .../CISACodeGen/RematChainsAnalysis.cpp | 15 +- IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp | 3 +- .../CodeScheduling/remat-inst-handling.ll | 45 ++- IGC/common/igc_flags.h | 2 +- 6 files changed, 339 insertions(+), 59 deletions(-) diff --git a/IGC/Compiler/CISACodeGen/CodeScheduling.cpp b/IGC/Compiler/CISACodeGen/CodeScheduling.cpp index 96ac483ebe1b..0b5f47fcab9c 100644 --- a/IGC/Compiler/CISACodeGen/CodeScheduling.cpp +++ b/IGC/Compiler/CISACodeGen/CodeScheduling.cpp @@ -465,6 +465,21 @@ class RegisterPressureTracker { return V; } + DenseSet getHangingS2VInstructions() { + // return all the vectors that are created of scalars, but not fully populated yet + DenseSet HangingInstructions; + for (const auto &HangingLiveVar : HangingLiveVarsVec) { + if (HangingLiveVar->Type == HangingLiveVarsType::HANGING_SCALARS_TO_VECTOR) { + for (auto *V : HangingLiveVar->LiveVars) { + if (Instruction *I = dyn_cast(V)) { + HangingInstructions.insert(I); + } + } + } + } + return HangingInstructions; + } + private: BasicBlock *BB; Function *F; @@ -492,7 +507,7 @@ class RegisterPressureTracker { llvm::DenseMap> RealUsesCache; llvm::DenseMap, int32_t> ValueSizeCache; - typedef enum { HANGING_SCALARS, HANGING_VECTORS, HANGING_NOOP_VECTORS } HangingLiveVarsType; + typedef enum { HANGING_SCALARS_TO_VECTOR, HANGING_VECTOR_TO_SCALARS, HANGING_VECTORS, HANGING_NOOP_VECTORS } HangingLiveVarsType; // POD structure to keep information about hanging 
values struct HangingLiveVarsInfo { @@ -564,7 +579,7 @@ class RegisterPressureTracker { if (RCA && !Update) { RematChainPattern *RCP = RCA->getRematChainPattern(I); if (RCP && (RCP->getFirstInst() == I)) { - // if it's a remat chain we are going to use the remat target instruction (usually load or store) + // if it's a remat chain we are going to use the remat target instruction (if it's load or store) Instruction *TargetInst = RCP->getRematTargetInst(); return estimateOrUpdateImpl(TargetInst, false); } @@ -707,7 +722,7 @@ class RegisterPressureTracker { auto *FirstIE = DTI->getFirstIE(); auto *FirstScalar = FirstIE->getOperand(1); if (!HangingLiveVars.count(FirstScalar)) { - HangingLiveVarsVec.emplace_back(std::make_unique(0, HANGING_SCALARS)); + HangingLiveVarsVec.emplace_back(std::make_unique(0, HANGING_SCALARS_TO_VECTOR)); auto *HLV = HangingLiveVarsVec.back().get(); for (Value *V : DTI->getSourceScalars()) { @@ -749,7 +764,7 @@ class RegisterPressureTracker { if (!HangingLiveVars.count(I)) { IGC_ASSERT(V2SP->getSourceVec() == EE->getVectorOperand()); HangingLiveVarsVec.emplace_back(std::make_unique( - computeSizeInBytes(V2SP->getSourceVec(), SIMD, WI, *DL), HANGING_SCALARS)); + computeSizeInBytes(V2SP->getSourceVec(), SIMD, WI, *DL), HANGING_VECTOR_TO_SCALARS)); auto *HLV = HangingLiveVarsVec.back().get(); for (Value *V : V2SP->getEEs()) { IGC_ASSERT(!HLV->LiveVars.count(V)); @@ -851,7 +866,8 @@ class RegisterPressureTracker { { if (Update) PrintDumpLevel(VerbosityLevel::High, " (hanging vector dies)"); - if (HLV->Type == HANGING_SCALARS) { + if (HLV->Type == HANGING_SCALARS_TO_VECTOR || + HLV->Type == HANGING_VECTOR_TO_SCALARS) { // only scalars die RPDecrease = HLV->Size; } else { @@ -864,7 +880,8 @@ class RegisterPressureTracker { " (hanging vector, left vars: " << (HLV->LiveVars.count(RealOp) ? 
HLV->LiveVars.size() - 1 : HLV->LiveVars.size()) << ")"); - if (HLV->Type == HANGING_SCALARS) { + if (HLV->Type == HANGING_SCALARS_TO_VECTOR || + HLV->Type == HANGING_VECTOR_TO_SCALARS) { RPDecrease = 0; // We don't decrease pressure, because the vector is still alive } } @@ -1356,7 +1373,7 @@ class BBScheduler { AdditionalWeight; } case GenISAIntrinsic::GenISA_WaveAll: - return C[Option::WeightWaveAllDstDep]; + return HighRP ? C[Option::WeightWaveAllDstDepHighRP] : C[Option::WeightWaveAllDstDep]; default: break; } @@ -1830,10 +1847,15 @@ class BBScheduler { // Sort in ascending order using RT->estimate(Node->I) as a key std::sort(Nodes.begin(), Nodes.end(), [&](InstructionNode *A, InstructionNode *B) { return RT.estimate(A->I) < RT.estimate(B->I); }); - auto LowestRP = RT.estimate(Nodes.front()->I); + int32_t LowestRP = RT.estimate(Nodes.front()->I); InstNodePtrList LowestRPNodes; + if (C[Option::AllowLargerRPWindowRPThreshold] > 0 && + LowestRP >= static_cast(C[Option::AllowLargerRPWindowRPThreshold])) { + // If the lowest RP is larger than the threshold, we can allow larger RP window + LowestRP += static_cast(C[Option::AllowLargerRPWindowSize]); + } for (InstructionNode *Node : Nodes) { - if (RT.estimate(Node->I) == LowestRP) { + if (RT.estimate(Node->I) <= LowestRP) { LowestRPNodes.push_back(Node); } else { break; @@ -1899,42 +1921,67 @@ class BBScheduler { return Nodes; }; - auto getLoadsThatUnlockDPASes = [&](InstNodePtrList &Nodes, uint MaxLoadSize) -> InstNodePtrList & { - std::function(Instruction *)> getRealUsesThroughVS; - getRealUsesThroughVS = [&](Instruction *I) -> llvm::DenseSet { - llvm::DenseSet Uses; - - std::function collectUses = [&](Value *V) { - for (auto *U : RT.getRealUses(V)) { - auto *DV = VSA->getDestVector(U); - if (DV && DV->isVectorShuffle()) { - collectUses(DV->getLastIE()); - } else { - Uses.insert(U); - } - } - }; + auto getRealOpThroughVS = [&](Instruction *I) -> Instruction * { + Instruction *OpI = dyn_cast(RT.getRealOp(I)); + if 
(!OpI) { + return nullptr; + } + auto *DV = VSA->getDestVector(OpI); + if (DV && DV->isVectorShuffle()) { + auto *SourceVec = dyn_cast(DV->getSourceVec()); + if (!SourceVec) { + return nullptr; + } + return dyn_cast(RT.getRealOp(SourceVec)); + } + return OpI; + }; - collectUses(I); - return Uses; - }; + std::function(Instruction *)> getRealUsesThroughVS; + getRealUsesThroughVS = [&](Instruction *I) -> llvm::DenseSet { + llvm::DenseSet Uses; - auto getRealOpThroughVS = [&](Instruction *I) -> Instruction * { - Instruction *OpI = dyn_cast(RT.getRealOp(I)); - if (!OpI) { - return nullptr; + std::function collectUses = [&](Value *V) { + for (auto *U : RT.getRealUses(V)) { + auto *DV = VSA->getDestVector(U); + if (DV && DV->isVectorShuffle()) { + collectUses(DV->getLastIE()); + } else { + Uses.insert(U); + } } - auto *DV = VSA->getDestVector(OpI); - if (DV && DV->isVectorShuffle()) { - auto *SourceVec = dyn_cast(DV->getSourceVec()); - if (!SourceVec) { - return nullptr; + }; + + collectUses(I); + return Uses; + }; + + std::function(Instruction *)> getRealUsesThroughRematChains; + getRealUsesThroughRematChains = [&](Instruction *I) -> llvm::DenseSet { + llvm::DenseSet Uses; + + std::function collectUses = [&](Value *V) { + for (auto *U : RT.getRealUses(V)) { + auto *RematChainPattern = RCA->getRematChainPattern(U); + if (RematChainPattern) { + // If the use is a remat chain, collect the last instruction in the chain + Uses.insert(RematChainPattern->getRematTargetInst()); + } else { + Uses.insert(U); } - return dyn_cast(RT.getRealOp(SourceVec)); } - return OpI; }; + collectUses(I); + return Uses; + }; + + auto getLoadsThatUnlockDPASes = [&](InstNodePtrList &Nodes, uint MaxLoadSize) -> InstNodePtrList & { + // We first prioritize the DPASes that don't increase regpressure + // if there are loads that unlock these DPASes - filter out all ther instructions + // But if there are no DPASes that don't increase regpressure + // - we can also consider the ones that do increase + 
auto getLoadWidth = [&](Instruction *I) -> uint { if (GenIntrinsicInst *Intr = dyn_cast(I)) { if (Intr->getIntrinsicID() == GenISAIntrinsic::GenISA_LSC2DBlockRead || @@ -1949,6 +1996,8 @@ class BBScheduler { }; InstNodePtrList LoadsThatUnlockDPASes; + InstNodePtrList LoadsThatUnlockDPASesNoRPIncreasing; + for (InstructionNode *Node : Nodes) { if (!is2dBlockRead(Node->I) || getLoadWidth(Node->I) > MaxLoadSize) { continue; @@ -1962,6 +2011,13 @@ class BBScheduler { if (isDPAS(I)) { bool OneOpIsDPAS = false; + bool FirstOpIsZero = false; + + auto *FirstOp = dyn_cast(I->getOperand(0)); + if (FirstOp && (isa(FirstOp) || FirstOp->isNullValue())) { + FirstOpIsZero = true; + } + int NumOps = static_cast(I->getNumOperands()); for (auto &Op : I->operands()) { Instruction *OpI = dyn_cast(Op.get()); @@ -1980,12 +2036,18 @@ class BBScheduler { } if (NumOps == 0) { LoadsThatUnlockDPASes.push_back(Node); + if (!FirstOpIsZero) { + LoadsThatUnlockDPASesNoRPIncreasing.push_back(Node); + } break; } } } } - if (LoadsThatUnlockDPASes.size() > 0) { + + if (LoadsThatUnlockDPASesNoRPIncreasing.size() > 0) { + Nodes = std::move(LoadsThatUnlockDPASesNoRPIncreasing); + } else if (LoadsThatUnlockDPASes.size() > 0) { Nodes = std::move(LoadsThatUnlockDPASes); } return Nodes; @@ -2050,6 +2112,161 @@ class BBScheduler { return Nodes; }; + auto filterOutNotReadyIcmp = [&](InstNodePtrList &Nodes) -> InstNodePtrList & { + // Heuristic in order not to put ICMP that is used by a select too early. 
+ // Schedule it only when the select is ready + + InstNodePtrList NonFilteredNodes; + for (InstructionNode *Node : Nodes) { + if (isa(Node->I)) { + bool IsReady = true; + User *U = IGCLLVM::getUniqueUndroppableUser(Node->I); + if (!U) { + NonFilteredNodes.push_back(Node); + continue; + } + SelectInst *SI = dyn_cast(U); + if (!SI) { + NonFilteredNodes.push_back(Node); + continue; + } + // If the select instruction is not ready, we need to filter out the icmp instruction + InstructionNode *SelectNode = G.InstToNode[SI]; + for (const auto &PN : SelectNode->Preds) { + if (PN->Src->I == Node->I) { + continue; + } + if (isa(PN->Src->I) || isa(PN->Src->I)) { + continue; + } + Instruction *OpI = dyn_cast(PN->Src->I); + if (!OpI) { + continue; + } + + if (!RT.inBBCurrent(OpI)) { + // if the instruction is in BBCurrent, then it is ready + IsReady = false; + break; + } + } + if (IsReady) { + NonFilteredNodes.push_back(Node); + } + // else it's filtered out, until the operand of the select is ready + } + else { + NonFilteredNodes.push_back(Node); + } + } + if (NonFilteredNodes.size() > 0) { + Nodes = std::move(NonFilteredNodes); + } + return Nodes; + }; + + auto focusLoadsOnOneDPAS = [&](InstNodePtrList &Nodes) -> InstNodePtrList & { + // If all Nodes are 2d block loads, choose the dpas user with the lowest initial number and filter out + // all the remaining loads. 
This is needed to avoid a situation when we schedule a lot of small loads first, + // but all the DPASes wait for some load that is in the end + if (Nodes.size() == 1) { + return Nodes; + } + + InstNodePtrList NonFilteredNodes; + if (std::all_of(Nodes.begin(), Nodes.end(), + [&](InstructionNode *Node) { return is2dBlockRead(Node->I); })) { + + // Get the first DPAS user + InstructionNode *FirstDPASUser = nullptr; + for (InstructionNode *Node : Nodes) { + for (auto *U : getRealUsesThroughVS(Node->I)) { + auto *I = dyn_cast(U); + if (!I) { + continue; + } + + if (isDPAS(I)) { + if (!FirstDPASUser || (G.InstToNode[I]->OriginalPosition < FirstDPASUser->OriginalPosition)) { + FirstDPASUser = G.InstToNode[I]; + + NonFilteredNodes = {Node}; + } else if (G.InstToNode[I] == FirstDPASUser) { + NonFilteredNodes.push_back(Node); + } + } + } + } + + if (NonFilteredNodes.size() > 0) { + Nodes = std::move(NonFilteredNodes); + } + } + + return Nodes; + }; + + auto filterOutNotUnblockingExistingVectorInst = [&](InstNodePtrList &Nodes) -> InstNodePtrList & { + // If some values are currently hanging because of creating a vector instruction out of scalars + // we prioritize the candidates that unblock the other elements of the vector + + // This helps to resolve the issue when we schedule several IEs to the 0th element of different vectors + // increasing the regpressure, because the GRF space for the other elements is immediately reserved + // but the vectors are not fully populated and we can't use them + + DenseSet HangingElements = RT.getHangingS2VInstructions(); + if (HangingElements.empty()) { + // If there are no hanging elements, we don't need to filter out anything + return Nodes; + } + + InstNodePtrList NonFilteredNodes; + for (InstructionNode *Node : Nodes) { + if (HangingElements.count(Node->I) > 0) { + // If the instruction is already hanging, we don't need to filter it out + NonFilteredNodes.push_back(Node); + continue; + } + for (Value *V : 
getRealUsesThroughRematChains(Node->I)) { + if (Instruction *I = dyn_cast(V)) { + if (HangingElements.count(I) > 0) { + NonFilteredNodes.push_back(Node); + break; // No need to check other uses, we already found a use that unblocks the vector + } + } + } + } + if (NonFilteredNodes.size() > 0) { + Nodes = std::move(NonFilteredNodes); + } + return Nodes; + }; + + auto getMaxNumWaveAll = [&](InstNodePtrList &Nodes) -> InstNodePtrList & { + // Experimental heuristic: Add only maxnum (llvm.maxnum) and waveall instructions to the list + // The idea is that maxnum->waveall(max) is a common pattern + // that usually leads to decreasing the register pressure + // because all the lanes converge to the same value + + InstNodePtrList NonFilteredNodes; + for (InstructionNode *Node : Nodes) { + if (GenIntrinsicInst *Intr = dyn_cast(Node->I)) { + if (Intr->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveAll) { + NonFilteredNodes.push_back(Node); + } + } + else if (IntrinsicInst *Intr = llvm::dyn_cast(Node->I)) { + if (Intr->getIntrinsicID() == Intrinsic::maxnum) { + NonFilteredNodes.push_back(Node); + } + } + } + if (NonFilteredNodes.size() > 0) { + Nodes = std::move(NonFilteredNodes); + } + return Nodes; + }; + // === === // === Choosing if we have instructions to schedule immediately === // === === @@ -2155,6 +2372,11 @@ class BBScheduler { IGC_ASSERT(ReadyList.size() > 0); + PrintDumpLevel(VerbosityLevel::Medium, "Choosing from the ready list:\n"); + for (InstructionNode *N : ReadyList) { + PrintInstructionDumpLevel(VerbosityLevel::Medium, N->I); + } + // Filter ReadyList so that only if the instruction is Handicapped // It will remain only if the current regpressure is lower that the Handicapped value InstNodePtrList FilteredReadyList; @@ -2174,6 +2396,7 @@ class BBScheduler { } FilteredReadyList = filterOutNotReadyRematInstructions(FilteredReadyList); + FilteredReadyList = filterOutNotReadyIcmp(FilteredReadyList); IGC_ASSERT(FilteredReadyList.size() > 0); @@ -2189,6 +2412,9 
@@ class BBScheduler { // regpressure, if several, choose the one with the least OriginalPosition FilteredReadyList = getMaxWeightNodes(FilteredReadyList); FilteredReadyList = getLowestRegpressureNodes(FilteredReadyList); + if (C[Option::FocusLoadsOnOneDPAS]) { + FilteredReadyList = focusLoadsOnOneDPAS(FilteredReadyList); + } Node = getFirstNode(FilteredReadyList); bool IsRegpressureCritical = RT.isRegpressureCritical(Node->I); CanClone = RT.isRegpressureHigh(Node->I) || isLargeLoad(Node->I); @@ -2208,6 +2434,9 @@ class BBScheduler { FilteredReadyList = getLargeBlockLoadsIfExist(FilteredReadyList); } + if (C[Option::PrioritizeMaxnumWaveallHighRP]) { + FilteredReadyList = getMaxNumWaveAll(FilteredReadyList); + } if (C[Option::PrioritizeDPASHighRP]) { // Experimental heuristic: prioritize DPAS and the instructions that make it possible to // schedule DPAS earlier @@ -2219,8 +2448,16 @@ class BBScheduler { FilteredReadyList = getLoadsThatUnlockDPASes(FilteredReadyList, C[Option::PrioritizeLoadsThatUnlockDPASesHighRP_MaxLoadSize]); } + if (C[Option::PrioritizePopulatingOneVectorHighRP]) { + FilteredReadyList = filterOutNotUnblockingExistingVectorInst(FilteredReadyList); + } FilteredReadyList = getLowestRegpressureNodes(FilteredReadyList); + + if (C[Option::FocusLoadsOnOneDPAS]) { + FilteredReadyList = focusLoadsOnOneDPAS(FilteredReadyList); + } + // If we have several nodes with the same regpressure, choose the one with the highest MaxWeight FilteredReadyList = getMaxWeightNodes(FilteredReadyList, C[Option::UseHighRPWeight] == 1); diff --git a/IGC/Compiler/CISACodeGen/CodeSchedulingOptionsDef.h b/IGC/Compiler/CISACodeGen/CodeSchedulingOptionsDef.h index 6f4ba39dc449..f7f66989e113 100644 --- a/IGC/Compiler/CISACodeGen/CodeSchedulingOptionsDef.h +++ b/IGC/Compiler/CISACodeGen/CodeSchedulingOptionsDef.h @@ -32,6 +32,8 @@ DECLARE_SCHEDULING_OPTION(WeightDPASDstDepHighRP, 6000, "Edge weight for DPAS destination dependency under high register pressure") 
DECLARE_SCHEDULING_OPTION(WeightExtendedMathDstDep, 200, "Edge weight for extended math destination dependency") DECLARE_SCHEDULING_OPTION(WeightWaveAllDstDep, 10, "Edge weight for wave all destination dependency") +DECLARE_SCHEDULING_OPTION(WeightWaveAllDstDepHighRP, 20, "Edge weight for wave all destination dependency under high " + "register pressure") DECLARE_SCHEDULING_OPTION(WeightUnknownMemoryReadDstDep, 500, "Edge weight for unknown memory read destination dependency") DECLARE_SCHEDULING_OPTION(WeightUnknownVectorShuffleDstDep, 50, @@ -64,6 +66,22 @@ DECLARE_SCHEDULING_OPTION(PrioritizeLoadsThatUnlockDPASesHighRP, 1, DECLARE_SCHEDULING_OPTION(PrioritizeLoadsThatUnlockDPASesHighRP_MaxLoadSize, 32, "Heuristic: Maximum load size (in number of elements) to consider for " "prioritizing loads that unlock DPAS instructions") +DECLARE_SCHEDULING_OPTION(FocusLoadsOnOneDPAS, 1, + "Heuristic: Focus loads on one DPAS instruction in case we have to choose from " + "many loads") +DECLARE_SCHEDULING_OPTION(AllowLargerRPWindowRPThreshold, 200, + "Heuristic: Allow larger register pressure window if register pressure is higher than " + "a threshold, so allow also the instructions that have not lowest but similar register " + "pressure, the threshold in bytes") +DECLARE_SCHEDULING_OPTION(AllowLargerRPWindowSize, 64, + "Heuristic: Allow larger register pressure window if register pressure is higher than " + "a threshold, so allow also the instructions that have not lowest but similar register " + "pressure, the size of the window in bytes") +DECLARE_SCHEDULING_OPTION(PrioritizeMaxnumWaveallHighRP, 0, + "Heuristic: Maxnum and Waveall instructions are prioritized when register pressure is " + "high") +DECLARE_SCHEDULING_OPTION(PrioritizePopulatingOneVectorHighRP, 1, + "Heuristic: Prioritize populating one vector when register pressure is high") // RP management control options DECLARE_SCHEDULING_OPTION(GreedyRPThresholdDelta, 20, "Threshold delta for greedy register pressure 
scheduling") diff --git a/IGC/Compiler/CISACodeGen/RematChainsAnalysis.cpp b/IGC/Compiler/CISACodeGen/RematChainsAnalysis.cpp index 24d016abc1e3..c4407512fb76 100644 --- a/IGC/Compiler/CISACodeGen/RematChainsAnalysis.cpp +++ b/IGC/Compiler/CISACodeGen/RematChainsAnalysis.cpp @@ -43,7 +43,7 @@ static bool hasRematMetadata(llvm::Value *V) { return false; } -Value *getAddressOperand(llvm::Instruction *I) { +Value *getRematedOperand(llvm::Instruction *I) { if (!I) return nullptr; // Check if the instruction is a Load or Store and return the address operand @@ -51,6 +51,9 @@ Value *getAddressOperand(llvm::Instruction *I) { return LI->getPointerOperand(); } else if (auto *SI = dyn_cast(I)) { return SI->getPointerOperand(); + } else if (auto *SelI = dyn_cast(I)) { + // For SelectInst, return the condition operand + return SelI->getCondition(); } // If it's not a Load or Store, return nullptr @@ -69,7 +72,7 @@ RematChainSet getRematChain(Value *V, Instruction *User) { if (!isa(I) && !isa(I) && !isa(I) && !isa(I) - && !isa(I) && !isa(I)) { + && !isa(I) && !isa(I) && !isa(I)) { return {}; } @@ -96,15 +99,15 @@ RematChainSet getRematChain(Value *V, Instruction *User) { bool RematChainsAnalysis::runOnFunction(llvm::Function &F) { for (auto &BB : F) { for (Instruction &I : BB) { - Value *AddrOperand = getAddressOperand(&I); - if (!AddrOperand) + Value *Operand = getRematedOperand(&I); + if (!Operand) continue; - Instruction *AI = dyn_cast(AddrOperand); + Instruction *AI = dyn_cast(Operand); if (!AI) continue; - RematChainSet Chain = getRematChain(AddrOperand, &I); + RematChainSet Chain = getRematChain(Operand, &I); if (!Chain.empty()) { RematChainPatterns.push_back(std::make_unique(Chain, AI, &I)); diff --git a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp index 19cb3dfb9e57..15594de8b2ad 100644 --- a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp +++ b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp @@ -244,7 +244,8 @@ void 
AddAnalysisPasses(CodeGenContext &ctx, IGCPassManager &mpm) { ctx.m_instrTypes.numInsts >= IGC_GET_FLAG_VALUE(CodeLoopSinkingMinSize)) { mpm.add(new CodeLoopSinking()); } - if (IGC_IS_FLAG_DISABLED(DisableCodeScheduling) && (ctx.type == ShaderType::OPENCL_SHADER)) { + if (IGC_IS_FLAG_DISABLED(DisableCodeScheduling) && (ctx.type == ShaderType::OPENCL_SHADER) && + (ctx.platform.isCoreChildOf(IGFX_XE_HPC_CORE) || ctx.platform.isCoreChildOf(IGFX_XE2_HPG_CORE))) { if (IGC_IS_FLAG_DISABLED(CodeSchedulingOnlyRecompilation) || ctx.m_retryManager.AllowCodeScheduling()) { mpm.add(new CodeScheduling()); } diff --git a/IGC/Compiler/tests/CodeScheduling/remat-inst-handling.ll b/IGC/Compiler/tests/CodeScheduling/remat-inst-handling.ll index 983bd961a46a..8fae2496b9bf 100644 --- a/IGC/Compiler/tests/CodeScheduling/remat-inst-handling.ll +++ b/IGC/Compiler/tests/CodeScheduling/remat-inst-handling.ll @@ -27,22 +27,16 @@ define spir_kernel void @test_remat(ptr addrspace(1) %A, i32 %x) { ; CHECK: entry: ; CHECK: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK: [[REMAT1_1:%.*]] = or i32 [[X:%.*]], 10, !remat !0 -; CHECK: [[REMAT1_2:%.*]] = shl nuw nsw i32 [[REMAT1_1]], 6, !remat !0 -; CHECK: [[REMAT1_3:%.*]] = or i32 [[REMAT1_2]], 16, !remat !0 -; CHECK: [[REMAT1_4:%.*]] = or i32 [[REMAT1_3]], [[REMAT1_2]], !remat !0 -; CHECK: [[REMAT1_5:%.*]] = shl nuw nsw i32 [[REMAT1_4]], 1, !remat !0 -; CHECK: [[CLONED_1:%.*]] = inttoptr i32 [[REMAT1_5]] to ptr addrspace(3), !remat !0 +; ... +; CHECK: [[REMAT1_5:%.*]] = shl nuw nsw i32 [[REMAT1_4:.*]], 1 +; CHECK: [[CLONED_1:%.*]] = inttoptr i32 [[REMAT1_5]] to ptr addrspace(3) ; CHECK: [[LOAD_1:%.*]] = load <8 x i16>, ptr addrspace(3) [[CLONED_1]], align 2 - -; CHECK: [[REMAT2_1:%.*]] = or i32 [[X]], 18 -; CHECK: [[REMAT2_2:%.*]] = shl nuw nsw i32 [[REMAT2_1]], 6 -; CHECK: [[REMAT2_3:%.*]] = or i32 [[REMAT2_2]], 16 -; CHECK: [[REMAT2_4:%.*]] = shl nuw nsw i32 [[REMAT2_3]], 1 +; ... 
+; CHECK: [[REMAT2_4:%.*]] = shl nuw nsw i32 [[REMAT2_3:.*]], 1 ; CHECK: [[CLONED_2:%.*]] = inttoptr i32 [[REMAT2_4]] to ptr addrspace(3) ; CHECK: [[LOAD_2:%.*]] = load <8 x i16>, ptr addrspace(3) [[CLONED_2]], align 2 -; CHECK: [[REMAT3_1:%.*]] = or i32 [[X]], 10, !remat !0 +; CHECK: [[REMAT3_1:%.*]] = or i32 [[X:.*]], 10, !remat !0 ; CHECK: [[REMAT3_2:%.*]] = shl nuw nsw i32 [[REMAT3_1]], 6, !remat !0 ; CHECK: [[REMAT3_3:%.*]] = or i32 [[REMAT3_2]], 16, !remat !0 ; CHECK: [[REMAT3_4:%.*]] = shl nuw nsw i32 [[REMAT3_3]], 1, !remat !0 @@ -108,6 +102,33 @@ bb1: ret void } + +define spir_kernel void @test_remat_select(ptr addrspace(1) %A, i32 %x, i32 %z) { +; CHECK-LABEL: @test_remat_select( +; CHECK: bb1: +; CHECK: [[DPAS2:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> zeroinitializer, <8 x i16> undef, <8 x i32> undef, i32 1, i32 1, i32 1, i32 1, i1 false) +; CHECK: [[REMAT_OR:%.*]] = or i32 [[X:%.*]], 10, !remat !0 +; CHECK: [[REMAT_ICMP:%.*]] = icmp eq i32 [[REMAT_OR]], 15, !remat !0 +; CHECK: [[SEL:%.*]] = select i1 [[REMAT_ICMP]], i32 [[X]], i32 [[Z:%.*]] +; CHECK: [[ADD:%.*]] = add i32 [[REMAT_OR]], 2000 +; CHECK: ret void +; +entry: + br label %bb1 + +bb1: + %remat_or = or i32 %x, 10, !remat !0 + %remat_icmp = icmp eq i32 %remat_or, 15, !remat !0 + %sel = select i1 %remat_icmp, i32 %x, i32 %z + %add = add i32 %remat_or, 2000 + + + %dpas2 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32( + <8 x float> zeroinitializer, <8 x i16> undef, <8 x i32> undef, + i32 1, i32 1, i32 1, i32 1, i1 false) + ret void +} + declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32( <8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) diff --git a/IGC/common/igc_flags.h b/IGC/common/igc_flags.h index bed2dda36fc7..41608f013f81 100644 --- a/IGC/common/igc_flags.h +++ b/IGC/common/igc_flags.h @@ -279,7 +279,7 @@ DECLARE_IGC_REGKEY(DWORD, LS_splitThresholdDelta_GRF, 2, "Register 
pressure must exceed total GRFs by this much for the load splitting to fire up.", false) // Code Scheduling -DECLARE_IGC_REGKEY(bool, DisableCodeScheduling, true, "Disable local code scheduling", true) +DECLARE_IGC_REGKEY(bool, DisableCodeScheduling, false, "Disable local code scheduling", true) DECLARE_IGC_REGKEY(bool, CodeSchedulingOnlyRecompilation, false, "Enable code scheduling only on 2nd try", true) DECLARE_IGC_REGKEY(bool, EnableCodeSchedulingIfNoSpills, false, "Try rescheduling also when there are no spills", true) From 22dc83aac699cbd7a99ae4dc18b858ccf0063743 Mon Sep 17 00:00:00 2001 From: "Liou, Jhe-Yu" Date: Thu, 4 Sep 2025 16:37:30 +0000 Subject: [PATCH 3/5] Disable PromoteLoopUnrollWithAlloc for OCL Disable PromoteLoopUnrollWithAlloc for OCL (cherry picked from commit bab52a98ce97205380fbe9c64c4246d943368cb4) --- IGC/Compiler/GenTTI.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/IGC/Compiler/GenTTI.cpp b/IGC/Compiler/GenTTI.cpp index 4cedf6300685..a9321ff95ecb 100644 --- a/IGC/Compiler/GenTTI.cpp +++ b/IGC/Compiler/GenTTI.cpp @@ -359,15 +359,17 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (AllocaFound) { // LLVM default only to 10, boost to UnrollMaxCountForAlloca UP.MaxIterationsCountToAnalyze = UnrollMaxCountForAlloca; - UP.Threshold += ThresholdBoost; UP.UpperBound = true; UP.Force = UnrollLoopForCodeSizeOnly ? 
false : true; - LLVM_DEBUG(dbgs() << "Increasing L:" << L->getName() << " threshold to " << UP.Threshold - << " due to Alloca accessed by:"); - for (const auto &pair : isGEPLoopInduction) - LLVM_DEBUG(dbgs() << " " << pair.first->getName()); - LLVM_DEBUG(dbgs() << " \n"); + if (ctx->type != ShaderType::OPENCL_SHADER) { + UP.Threshold += ThresholdBoost; + LLVM_DEBUG(dbgs() << "Increasing L:" << L->getName() << " threshold to " << UP.Threshold + << " due to Alloca accessed by:"); + for (const auto &pair : isGEPLoopInduction) + LLVM_DEBUG(dbgs() << " " << pair.first->getName()); + LLVM_DEBUG(dbgs() << " \n"); + } } } From 300f146e62494c68dd373e35e7436f93e77360e1 Mon Sep 17 00:00:00 2001 From: "Sukhov, Egor" Date: Tue, 2 Sep 2025 09:22:48 +0000 Subject: [PATCH 4/5] Fix for IGCVectorizer insertpoint Now for small blocks consisting of 2 special case instructions PHI & Terminator (BR or RET for example) we return not firstnonPHI but the last PHI. (cherry picked from commit 0b9518e0194b68f8f7172caa12dfe03736930bd9) --- IGC/Compiler/CISACodeGen/IGCVectorizer.cpp | 2 + .../vectorizer-special-bb-to-insert.ll | 70 +++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 IGC/Compiler/tests/IGCVectorizer/vectorizer-special-bb-to-insert.ll diff --git a/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp b/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp index e0b9f1b57917..d49dd16cbf8c 100644 --- a/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp +++ b/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp @@ -460,6 +460,8 @@ Instruction *IGCVectorizer::getInsertPointForVector(VecArr &Arr) { // if insert point is PHI, shift it to the first nonPHI to be safe if (llvm::isa(InsertPoint)) InsertPoint = InsertPoint->getParent()->getFirstNonPHI(); + if (InsertPoint->isTerminator()) + InsertPoint = InsertPoint->getPrevNonDebugInstruction(); return InsertPoint; } diff --git a/IGC/Compiler/tests/IGCVectorizer/vectorizer-special-bb-to-insert.ll 
b/IGC/Compiler/tests/IGCVectorizer/vectorizer-special-bb-to-insert.ll new file mode 100644 index 000000000000..005a1ed0e31c --- /dev/null +++ b/IGC/Compiler/tests/IGCVectorizer/vectorizer-special-bb-to-insert.ll @@ -0,0 +1,70 @@ +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2025 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= + +; RUN: igc_opt -S --igc-vectorizer -dce --regkey=VectorizerDepWindowMultiplier=6 < %s 2>&1 | FileCheck %s + +; CHECK-LABEL: bb3: +; CHECK-NEXT: [[PHI:%.*]] = phi float +; CHECK-NEXT: [[VECTOR_0:%.*]] = insertelement <8 x float> undef, float [[PHI]], i32 0 +; CHECK-NEXT: [[VECTOR_1:%.*]] = insertelement <8 x float> [[VECTOR_0]], float [[PHI]], i32 1 +; CHECK-NEXT: [[VECTOR_2:%.*]] = insertelement <8 x float> [[VECTOR_1]], float [[PHI]], i32 2 +; CHECK-NEXT: [[VECTOR_3:%.*]] = insertelement <8 x float> [[VECTOR_2]], float [[PHI]], i32 3 +; CHECK-NEXT: [[VECTOR_4:%.*]] = insertelement <8 x float> [[VECTOR_3]], float [[PHI]], i32 4 +; CHECK-NEXT: [[VECTOR_5:%.*]] = insertelement <8 x float> [[VECTOR_4]], float [[PHI]], i32 5 +; CHECK-NEXT: [[VECTOR_6:%.*]] = insertelement <8 x float> [[VECTOR_5]], float [[PHI]], i32 6 +; CHECK-NEXT: [[VECTOR_7:%.*]] = insertelement <8 x float> [[VECTOR_6]], float [[PHI]], i32 7 +; CHECK-NEXT: br i1 {{%.*}}, label {{%.*}}, label {{%.*}} + + +define spir_kernel void @barney() { +bb: + %tmp = fcmp une float 0.000000e+00, 0.000000e+00 + br label %bb1 + +bb1: ; preds = %bb + br i1 false, label %bb3, label %bb2 + +bb2: ; preds = %bb1 + br label %bb3 + +bb3: ; preds = %bb2, %bb1 + %tmp4 = phi float [ 0.000000e+00, %bb1 ], [ 0.000000e+00, %bb2 ] + br i1 %tmp, label %bb5, label %bb6 + +bb5: ; preds = %bb3 + br label %bb6 + +bb6: ; preds = %bb5, %bb3 + %tmp7 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp8 = fmul reassoc nsz arcp contract float 0.000000e+00, 
%tmp4 + %tmp9 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp10 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp11 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp12 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp13 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp14 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp15 = insertelement <8 x float> zeroinitializer, float %tmp7, i64 0 + %tmp16 = insertelement <8 x float> %tmp15, float %tmp8, i64 1 + %tmp17 = insertelement <8 x float> %tmp16, float %tmp9, i64 2 + %tmp18 = insertelement <8 x float> %tmp17, float %tmp10, i64 3 + %tmp19 = insertelement <8 x float> %tmp18, float %tmp11, i64 4 + %tmp20 = insertelement <8 x float> %tmp19, float %tmp12, i64 5 + %tmp21 = insertelement <8 x float> %tmp20, float %tmp13, i64 6 + %tmp22 = insertelement <8 x float> %tmp21, float %tmp14, i64 7 + %tmp23 = bitcast <8 x float> %tmp22 to <8 x i32> + call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false, i32 0, <8 x i32> %tmp23) + ret void +} + +declare void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) + +!igc.functions = !{!0} + +!0 = distinct !{void ()* @barney, !1} +!1 = distinct !{!2} +!2 = distinct !{!"sub_group_size", i32 16} From bd67908e0b06013e83ee02c9f3ff10cf976ed96a Mon Sep 17 00:00:00 2001 From: Anastasia Bodrova Date: Mon, 1 Sep 2025 10:03:34 +0000 Subject: [PATCH 5/5] Changes in code. 
(cherry picked from commit cf2dc92ae5d8c0fc0fb70c00079690fbb35cdbcf) --- IGC/Compiler/CISACodeGen/DeSSA.cpp | 2 +- .../CISACodeGen/VariableReuseAnalysis.cpp | 72 +++++++-------- .../CISACodeGen/VariableReuseAnalysis.hpp | 4 +- .../EmitVISAPass/inline_asm_vectoralias.ll | 89 ------------------- 4 files changed, 36 insertions(+), 131 deletions(-) delete mode 100644 IGC/Compiler/tests/EmitVISAPass/inline_asm_vectoralias.ll diff --git a/IGC/Compiler/CISACodeGen/DeSSA.cpp b/IGC/Compiler/CISACodeGen/DeSSA.cpp index 1f386f0cab84..263e21aea84d 100644 --- a/IGC/Compiler/CISACodeGen/DeSSA.cpp +++ b/IGC/Compiler/CISACodeGen/DeSSA.cpp @@ -1542,7 +1542,7 @@ bool DeSSA::isAliasee(Value *V) const { // c = 2 // ... // L: = a -// = c +// = b // // In this case, if a is aliased to b, a would get 2 at L, but the correct // value should be 1. In order to find out if a can be aliased to b, it diff --git a/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.cpp b/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.cpp index 72ca0b44fe28..5165a6bb85a7 100644 --- a/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.cpp +++ b/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.cpp @@ -887,16 +887,16 @@ bool VariableReuseAnalysis::getAllInsEltsIfAvailable(InsertElementInst *FirstIEI IGC_ASSERT_MESSAGE(IEI_ix < nelts, "ICE: IEI's index out of bound!"); SVecInsEltInfo &InsEltInfo = AllIEIs[IEI_ix]; if (InsEltInfo.IEI) { - // This element is inserted more than once, skip. + // One element is inserted more than once, skip. 
return false; } InsEltInfo.IEI = I; InsEltInfo.Elt = E; InsEltInfo.FromVec = V; InsEltInfo.FromVec_eltIx = V_ix; - - // So far, E is never nullptr (could be in the future) - InsEltInfo.EEI = dyn_cast_or_null(E); + if (E) { + InsEltInfo.EEI = dyn_cast(E); + } if (!I->hasOneUse()) { break; @@ -923,24 +923,19 @@ bool VariableReuseAnalysis::getAllInsEltsIfAvailable(InsertElementInst *FirstIEI if (tV == nullptr) return false; - // Expect all IEIs are in the same DeSSA CC (DeSSA special-handles IEIs) + // Expect node values for all IEIs are identical. In general, if they + // are in the same DeSSA CC, that would be fine. Value *tV_nv = m_DeSSA->getNodeValue(tV); if (V_root != getRootValue(tV_nv)) return false; Value *E = AllIEIs[i].Elt; - if (!E || isa(E)) { - // constant is okay for either non-uniform or uniform. - continue; - } Value *FromVec = AllIEIs[i].FromVec; - if (FromVec) { - Value *FromVec_nv = m_DeSSA->getNodeValue(FromVec); - // check if FromVec has been coalesced with IEI already by DeSSA. - // (Wouldn't happen under current DeSSA, but might happen in future) - if (V_root == getRootValue(FromVec_nv)) - return false; - } + Value *FromVec_nv = m_DeSSA->getNodeValue(FromVec); + // check if FromVec has been coalesced with IEI already by DeSSA. + // (Wouldn't happen under current DeSSA, but might happen in future) + if (V_root == getRootValue(FromVec_nv)) + return false; // Make sure FromVec or E have the same uniformness as V. if ((E && V_dep != m_WIA->whichDepend(E)) || (FromVec && V_dep != m_WIA->whichDepend(FromVec))) @@ -974,13 +969,17 @@ Value *VariableReuseAnalysis::traceAliasValue(Value *V) { } // -// Returns true if there is the following pattern; otherwise return false. +// Returns true if the following is true // IEI = insertElement Vec, S, -// 1. S is from another vector V. -// S = extractElement V, -// In this case, S is the element denoted by (V, V_ix) -// 2. otherwise, V=nullptr, V_ix=0. -// S is a candidate and could be alias to the vector. 
+// Return false, otherwise. +// +// When the above condition is true, V and V_ix are used for the +// following cases: +// 1. S is from another vector V. +// S = extractElement V, +// S is the element denoted by (V, V_ix) +// 2. otherwise, V=nullptr, V_ix=0. +// S is a candidate inserted and could be alias to the vector. // // Input: IEI // Output: IEI_ix, S, V, V_ix @@ -1000,9 +999,9 @@ bool VariableReuseAnalysis::getElementValue(InsertElementInst *IEI, int &IEI_ix, IEI_ix = (int)CI->getZExtValue(); Value *elem0 = IEI->getOperand(1); - if (hasBeenPayloadCoalesced(elem0) || isOrCoalescedWithArg(elem0)) { - // If elem0 has been payload-coalesced or it has been aliased to - // an argument, skip it. + if (hasBeenPayloadCoalesced(elem0) || isa(elem0) || isOrCoalescedWithArg(elem0)) { + // If elem0 has been payload-coalesced, is constant, + // or it has been aliased to an argument, skip it. return false; } @@ -1047,10 +1046,11 @@ void VariableReuseAnalysis::InsertElementAliasing(Function *F) { // IGC Key VectorAlias controls vectorAlias optimiation. // - // VectorAlias (also from m_pCtx->getVectorCoalescingControl()) - // 0x0: disable vector aliasing - // 0x1: subvec aliasing for isolated values (getRootValue()=null) - // 0x2: subvec aliasing for both isolated and non-isolated value) + // Do it if VectorAlias != 0. 
+ // VectorAlias=0x1: subvec aliasing for isolated values + // (getRootValue()=null) + // =0x2: subvec aliasing for both isolated and non-isolated + // value) const auto control = (m_pCtx->getVectorCoalescingControl() & 0x3); // To avoid increasing GRF pressure, skip if F is too large or not an entry const int32_t NumBBThreshold = IGC_GET_FLAG_VALUE(VectorAliasBBThreshold); @@ -1253,7 +1253,6 @@ bool VariableReuseAnalysis::processInsertTo(BasicBlock *BB, VecInsEltInfoTy &All isSubCandidate = false; } - // So far, Elt is never nullptr (could be in the future) if (Elt && Sub == nullptr && skipScalarAliaser(BB, Elt)) { // Skip scalar coalescing isSubCandidate = false; @@ -1434,11 +1433,8 @@ VariableReuseAnalysis::AState VariableReuseAnalysis::getCandidateStateUse(Value } } else if (StoreInst *SI = dyn_cast(Val)) { retSt = AState::TARGET; - } else if (CallInst *CallI = dyn_cast(Val)) { - if (CallI->isInlineAsm()) - retSt = AState::TARGET; - else - return AState::SKIP; + } else if (isa(Val)) { + return AState::SKIP; } } return retSt; @@ -1464,9 +1460,7 @@ VariableReuseAnalysis::AState VariableReuseAnalysis::getCandidateStateDef(Value } } else if (LoadInst *SI = dyn_cast(Val)) { return AState::TARGET; - } else if (CallInst *CallI = dyn_cast(Val)) { - if (CallI->isInlineAsm()) - return AState::TARGET; + } else if (isa(Val)) { return AState::SKIP; } return AState::OK; @@ -1474,7 +1468,7 @@ VariableReuseAnalysis::AState VariableReuseAnalysis::getCandidateStateDef(Value // Vector alias disables extractMask optimization. This function // checks if extractMask optim can be applied. And the caller -// will decide whether to favor extractMask optimization or not. +// will decide whether to favor extractMask optimization. 
bool VariableReuseAnalysis::isExtractMaskCandidate(Value *V) const { auto BIT = [](int n) { return (uint32_t)(1 << n); }; diff --git a/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.hpp b/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.hpp index 612ca1df376a..2ade6f51d087 100644 --- a/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.hpp +++ b/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.hpp @@ -131,8 +131,8 @@ struct SVecInsEltInfo { llvm::InsertElementInst *IEI; llvm::Value *Elt; - // EEI, if not nullptr, is used as scalar operand of IEI and is the same as - // (FromVec, FromVec_eltIx). + // If Elt is null, EEI must not be null. EEI is used as scalar operand + // in IEI and is the same as (FromVec, FromVec_eltIx). llvm::ExtractElementInst *EEI; llvm::Value *FromVec; int FromVec_eltIx; diff --git a/IGC/Compiler/tests/EmitVISAPass/inline_asm_vectoralias.ll b/IGC/Compiler/tests/EmitVISAPass/inline_asm_vectoralias.ll deleted file mode 100644 index ea12ea04c0f3..000000000000 --- a/IGC/Compiler/tests/EmitVISAPass/inline_asm_vectoralias.ll +++ /dev/null @@ -1,89 +0,0 @@ -;=========================== begin_copyright_notice ============================ -; -; Copyright (C) 2023 Intel Corporation -; -; SPDX-License-Identifier: MIT -; -;============================ end_copyright_notice ============================= - -; To vector alias on inline asm - -; REQUIRES: llvm-14-plus, regkeys - -; RUN: igc_opt --opaque-pointers --CheckInstrTypes --igc-update-instrtypes-on-run -inputocl --neo \ -; RUN: -platformpvc -igc-emit-visa -regkey DumpVISAASMToConsole,VectorAlias=1 -simd-mode 16 %s \ -; RUN: | FileCheck %s - -; CHECK-LABEL: .function -; CHECK: lsc_load_block2d.ugm (M1, 1) [[INPUT:.*]]:d8.16x8nt flat[{{.+}}] -; CHECK: mov (M1_NM, 16) [[OUTPUT:.*]](0,0)<2> [[INPUT]](0,0)<4;1,0> -; CHECK: mov (M1_NM, 16) [[OUTPUT]](0,1)<2> [[INPUT]](0,1)<4;1,0> -; CHECK: mov (M1_NM, 16) [[OUTPUT]](1,0)<2> [[INPUT]](0,2)<4;1,0> -; CHECK: mov (M1_NM, 16) [[OUTPUT]](1,1)<2> [[INPUT]](0,3)<4;1,0> -; 
CHECK: mov (M1, 16) [[OUTPUT]](2,0)<1> 0x0:w -; CHECK: mov (M1, 16) [[OUTPUT]](2,16)<1> 0x0:w -; CHECK: mov (M1, 16) [[OUTPUT]](3,0)<1> 0x0:w -; CHECK: mov (M1, 16) [[OUTPUT]](3,16)<1> 0x0:w -; CHECK: lsc_store_block2d.ugm (M1, 1) flat[{{.+}}] [[OUTPUT]]:d16.16x8nn -; CHECK: ret (M1, 1) - -; Function Attrs: convergent nounwind null_pointer_is_valid -define spir_kernel void @test(i8 addrspace(1)* align 1 %a, i16 addrspace(1)* align 2 %b, <8 x i32> %r0, <8 x i32> %payloadHeader, i8 addrspace(2)* %constBase, i32 %bufferOffset, i32 %bufferOffset1) { -entry: - %0 = call <8 x i8> asm "lsc_load_block2d.ugm (M1, 1) $0:d8.16x8nt flat[$1,15,15,15,0,0]", "=rw,rw.u"(i8 addrspace(1)* %a) - %1 = extractelement <8 x i8> %0, i32 0 - %2 = insertelement <4 x i8> undef, i8 %1, i32 0 - %3 = extractelement <8 x i8> %0, i32 1 - %4 = insertelement <4 x i8> %2, i8 %3, i32 1 - %5 = extractelement <8 x i8> %0, i32 2 - %6 = insertelement <4 x i8> %4, i8 %5, i32 2 - %7 = extractelement <8 x i8> %0, i32 3 - %8 = insertelement <4 x i8> %6, i8 %7, i32 3 - %9 = call <4 x i16> asm "mov (M1_NM, 16) $0(0,0)<2> $1(0,0)<4;1,0>\0Amov (M1_NM, 16) $0(0,1)<2> $1(0,1)<4;1,0>\0Amov (M1_NM, 16) $0(1,0)<2> $1(0,2)<4;1,0>\0Amov (M1_NM, 16) $0(1,1)<2> $1(0,3)<4;1,0>\0A", "=rw,rw"(<4 x i8> %8) - %10 = extractelement <4 x i16> %9, i32 0 - %11 = extractelement <4 x i16> %9, i32 1 - %12 = extractelement <4 x i16> %9, i32 2 - %13 = extractelement <4 x i16> %9, i32 3 - %14 = insertelement <8 x i16> undef, i16 %10, i32 0 - %15 = insertelement <8 x i16> %14, i16 %11, i32 1 - %16 = insertelement <8 x i16> %15, i16 %12, i32 2 - %17 = insertelement <8 x i16> %16, i16 %13, i32 3 - %18 = insertelement <8 x i16> %17, i16 0, i32 4 - %19 = insertelement <8 x i16> %18, i16 0, i32 5 - %20 = insertelement <8 x i16> %19, i16 0, i32 6 - %21 = insertelement <8 x i16> %20, i16 0, i32 7 - call void asm sideeffect "lsc_store_block2d.ugm (M1, 1) flat[$1,15,15,15,0,0] $0:d16.16x8nn", "rw,rw.u"(<8 x i16> %21, i16 addrspace(1)* %b) - ret 
void -} - - -!igc.functions = !{!0} -!IGCMetadata = !{!13} - -!0 = !{void (i8 addrspace(1)*, i16 addrspace(1)*, <8 x i32>, <8 x i32>, i8 addrspace(2)*, i32, i32)* @test, !1} -!1 = !{!2, !3} -!2 = !{!"function_type", i32 0} -!3 = !{!"sub_group_size", i32 16} -!13 = !{!"ModuleMD", !14} -!14 = !{!"FuncMD", !15, !16} -!15 = !{!"FuncMDMap[0]", void (i8 addrspace(1)*, i16 addrspace(1)*, <8 x i32>, <8 x i32>, i8 addrspace(2)*, i32, i32)* @test} -!16 = !{!"FuncMDValue[0]", !100, !226} -!100 = !{!"resAllocMD", !183, !184, !185, !186} -!183 = !{!"uavsNumType", i32 0} -!184 = !{!"srvsNumType", i32 0} -!185 = !{!"samplersNumType", i32 0} -!186 = !{!"argAllocMDList", !187, !191, !192, !193, !194, !195, !196} -!187 = !{!"argAllocMDListVec[0]", !188, !189, !190} -!188 = !{!"type", i32 0} -!189 = !{!"extensionType", i32 -1} -!190 = !{!"indexType", i32 -1} -!191 = !{!"argAllocMDListVec[1]", !188, !189, !190} -!192 = !{!"argAllocMDListVec[2]", !188, !189, !190} -!193 = !{!"argAllocMDListVec[3]", !188, !189, !190} -!194 = !{!"argAllocMDListVec[4]", !188, !189, !190} -!195 = !{!"argAllocMDListVec[5]", !188, !189, !190} -!196 = !{!"argAllocMDListVec[6]", !188, !189, !190} -!226 = !{!"m_OpenCLArgTypeQualifiers", !227, !228} -!227 = !{!"m_OpenCLArgTypeQualifiersVec[0]", !""} -!228 = !{!"m_OpenCLArgTypeQualifiersVec[1]", !""} -