From 4e063ed79073db778e7ddbbadd53b21798b2de25 Mon Sep 17 00:00:00 2001 From: bcheng0127 Date: Tue, 2 Sep 2025 18:09:21 +0000 Subject: [PATCH 1/5] Changes in code. (cherry picked from commit 6ee64d4cbb20915f4f86a4d8ad3ef479d0138c42) --- .../PromoteToPredicatedMemoryAccess/LoadSubDW.cl | 2 ++ visa/LocalScheduler/Dependencies_G4IR.cpp | 9 --------- visa/LocalScheduler/Dependencies_G4IR.h | 2 -- visa/LocalScheduler/LocalScheduler_G4IR.cpp | 14 +------------- 4 files changed, 3 insertions(+), 24 deletions(-) diff --git a/IGC/ocloc_tests/optimizations/PromoteToPredicatedMemoryAccess/LoadSubDW.cl b/IGC/ocloc_tests/optimizations/PromoteToPredicatedMemoryAccess/LoadSubDW.cl index c2ede424bc56..805d278a40c3 100644 --- a/IGC/ocloc_tests/optimizations/PromoteToPredicatedMemoryAccess/LoadSubDW.cl +++ b/IGC/ocloc_tests/optimizations/PromoteToPredicatedMemoryAccess/LoadSubDW.cl @@ -13,6 +13,8 @@ SPDX-License-Identifier: MIT // RUN: ocloc compile -file %s -device pvc -options "-igc_opts 'EnablePromoteToPredicatedMemoryAccess=1 VISAOptions=-asmToConsole'" 2>&1 | FileCheck %s --check-prefixes=CHECK-ASM // CHECK-ASM: kernel test_i8_0 +// CHECK-ASM: cmp (32|M0) (lt)[[F1:f[0-9\.]+]] +// CHECK-ASM: ([[F1]]) goto (32|M0) // CHECK-ASM: {{[_a-z0-9A-Z]+}}: // CHECK-ASM-DAG: cmp (32|M0) (le)[[F2:f[0-9\.]+]] null<1>:d r{{[0-9\.]+}} // CHECK-ASM-DAG: cmp (32|M0) (ge)[[F3:f[0-9\.]+]] null<1>:d r{{[0-9\.]+}} diff --git a/visa/LocalScheduler/Dependencies_G4IR.cpp b/visa/LocalScheduler/Dependencies_G4IR.cpp index 9cc823f884ad..3f2181802d66 100644 --- a/visa/LocalScheduler/Dependencies_G4IR.cpp +++ b/visa/LocalScheduler/Dependencies_G4IR.cpp @@ -189,15 +189,6 @@ DepType vISA::getDepScratchSend(G4_INST *curInst, G4_INST *liveInst) { return NODEP; } -bool vISA::isNotLatencyBarrier (DepType type) { - if (type == CONTROL_FLOW_BARRIER || - type == OPT_BARRIER || - type == SEND_BARRIER || - type == MSG_BARRIER) - return true; - return false; -} - DepType vISA::CheckBarrier(G4_INST *inst) { if 
(inst->isOptBarrier() || inst->isAtomicInst() || inst->opcode() == G4_madm) { return OPT_BARRIER; diff --git a/visa/LocalScheduler/Dependencies_G4IR.h b/visa/LocalScheduler/Dependencies_G4IR.h index eec65ba3112a..5e6587ceaee5 100644 --- a/visa/LocalScheduler/Dependencies_G4IR.h +++ b/visa/LocalScheduler/Dependencies_G4IR.h @@ -38,8 +38,6 @@ DepType getDepSend(G4_INST *curInst, G4_INST *liveInst, bool BTIIsRestrict); DepType getDepScratchSend(G4_INST *curInst, G4_INST *liveInst); -bool isNotLatencyBarrier(DepType type); - DepType CheckBarrier(G4_INST *inst); DepType getDepForOpnd(Gen4_Operand_Number cur, Gen4_Operand_Number liv); diff --git a/visa/LocalScheduler/LocalScheduler_G4IR.cpp b/visa/LocalScheduler/LocalScheduler_G4IR.cpp index 7af545b7403b..ccb766c6ec7d 100644 --- a/visa/LocalScheduler/LocalScheduler_G4IR.cpp +++ b/visa/LocalScheduler/LocalScheduler_G4IR.cpp @@ -1887,21 +1887,9 @@ void DDD::collectRoots() { void DDD::setPriority(Node *pred, const Edge &edge) { // Calculate PRED's priority (pred->priority), based on SUCC's priority Node *succ = edge.getNode(); - DepType type = edge.getType(); vISA_ASSERT(succ->priority != Node::PRIORITY_UNINIT, "succ node has no priority?"); - int newPriority = succ->priority; - // Note that, node->isBarrier cannot be used here. Because there may be - // non-barrier dep. - bool isSend = false; - if (!pred->getInstructions()->empty()) { - isSend = pred->getInstructions()->front()->isSend(); - } - if (isNotLatencyBarrier(type) && - !isSend) // send has long latency, it's better be scheduled to the front. - newPriority += pred->getOccupancy(); - else - newPriority += edge.getLatency(); + int newPriority = succ->priority + edge.getLatency(); pred->priority = (newPriority > pred->priority) ? 
newPriority : pred->priority; } From 5fff125e4873fcf5b5d40442e5837c35e6d97388 Mon Sep 17 00:00:00 2001 From: "Dmitrichenko, Aleksei" Date: Mon, 18 Aug 2025 13:04:32 +0000 Subject: [PATCH 2/5] Enable CodeScheduling Improve CodeScheduling and enable it by default - Support handling of the remated instructions - Various heuristics added to handle situations with small (splitted) loads - Heuristic to populate the same vector added --- IGC/Compiler/CISACodeGen/CodeScheduling.cpp | 315 +++++++++++++++--- .../CISACodeGen/CodeSchedulingOptionsDef.h | 18 + .../CISACodeGen/RematChainsAnalysis.cpp | 15 +- IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp | 3 +- .../CodeScheduling/remat-inst-handling.ll | 45 ++- IGC/common/igc_flags.h | 2 +- 6 files changed, 339 insertions(+), 59 deletions(-) diff --git a/IGC/Compiler/CISACodeGen/CodeScheduling.cpp b/IGC/Compiler/CISACodeGen/CodeScheduling.cpp index 96ac483ebe1b..0b5f47fcab9c 100644 --- a/IGC/Compiler/CISACodeGen/CodeScheduling.cpp +++ b/IGC/Compiler/CISACodeGen/CodeScheduling.cpp @@ -465,6 +465,21 @@ class RegisterPressureTracker { return V; } + DenseSet getHangingS2VInstructions() { + // return all the vectors that are created of scalars, but not fully populated yet + DenseSet HangingInstructions; + for (const auto &HangingLiveVar : HangingLiveVarsVec) { + if (HangingLiveVar->Type == HangingLiveVarsType::HANGING_SCALARS_TO_VECTOR) { + for (auto *V : HangingLiveVar->LiveVars) { + if (Instruction *I = dyn_cast(V)) { + HangingInstructions.insert(I); + } + } + } + } + return HangingInstructions; + } + private: BasicBlock *BB; Function *F; @@ -492,7 +507,7 @@ class RegisterPressureTracker { llvm::DenseMap> RealUsesCache; llvm::DenseMap, int32_t> ValueSizeCache; - typedef enum { HANGING_SCALARS, HANGING_VECTORS, HANGING_NOOP_VECTORS } HangingLiveVarsType; + typedef enum { HANGING_SCALARS_TO_VECTOR, HANGING_VECTOR_TO_SCALARS, HANGING_VECTORS, HANGING_NOOP_VECTORS } HangingLiveVarsType; // POD structure to keep information about hanging 
values struct HangingLiveVarsInfo { @@ -564,7 +579,7 @@ class RegisterPressureTracker { if (RCA && !Update) { RematChainPattern *RCP = RCA->getRematChainPattern(I); if (RCP && (RCP->getFirstInst() == I)) { - // if it's a remat chain we are going to use the remat target instruction (usually load or store) + // if it's a remat chain we are going to use the remat target instruction (if it's load or store) Instruction *TargetInst = RCP->getRematTargetInst(); return estimateOrUpdateImpl(TargetInst, false); } @@ -707,7 +722,7 @@ class RegisterPressureTracker { auto *FirstIE = DTI->getFirstIE(); auto *FirstScalar = FirstIE->getOperand(1); if (!HangingLiveVars.count(FirstScalar)) { - HangingLiveVarsVec.emplace_back(std::make_unique(0, HANGING_SCALARS)); + HangingLiveVarsVec.emplace_back(std::make_unique(0, HANGING_SCALARS_TO_VECTOR)); auto *HLV = HangingLiveVarsVec.back().get(); for (Value *V : DTI->getSourceScalars()) { @@ -749,7 +764,7 @@ class RegisterPressureTracker { if (!HangingLiveVars.count(I)) { IGC_ASSERT(V2SP->getSourceVec() == EE->getVectorOperand()); HangingLiveVarsVec.emplace_back(std::make_unique( - computeSizeInBytes(V2SP->getSourceVec(), SIMD, WI, *DL), HANGING_SCALARS)); + computeSizeInBytes(V2SP->getSourceVec(), SIMD, WI, *DL), HANGING_VECTOR_TO_SCALARS)); auto *HLV = HangingLiveVarsVec.back().get(); for (Value *V : V2SP->getEEs()) { IGC_ASSERT(!HLV->LiveVars.count(V)); @@ -851,7 +866,8 @@ class RegisterPressureTracker { { if (Update) PrintDumpLevel(VerbosityLevel::High, " (hanging vector dies)"); - if (HLV->Type == HANGING_SCALARS) { + if (HLV->Type == HANGING_SCALARS_TO_VECTOR || + HLV->Type == HANGING_VECTOR_TO_SCALARS) { // only scalars die RPDecrease = HLV->Size; } else { @@ -864,7 +880,8 @@ class RegisterPressureTracker { " (hanging vector, left vars: " << (HLV->LiveVars.count(RealOp) ? 
HLV->LiveVars.size() - 1 : HLV->LiveVars.size()) << ")"); - if (HLV->Type == HANGING_SCALARS) { + if (HLV->Type == HANGING_SCALARS_TO_VECTOR || + HLV->Type == HANGING_VECTOR_TO_SCALARS) { RPDecrease = 0; // We don't decrease pressure, because the vector is still alive } } @@ -1356,7 +1373,7 @@ class BBScheduler { AdditionalWeight; } case GenISAIntrinsic::GenISA_WaveAll: - return C[Option::WeightWaveAllDstDep]; + return HighRP ? C[Option::WeightWaveAllDstDepHighRP] : C[Option::WeightWaveAllDstDep]; default: break; } @@ -1830,10 +1847,15 @@ class BBScheduler { // Sort in ascending order using RT->estimate(Node->I) as a key std::sort(Nodes.begin(), Nodes.end(), [&](InstructionNode *A, InstructionNode *B) { return RT.estimate(A->I) < RT.estimate(B->I); }); - auto LowestRP = RT.estimate(Nodes.front()->I); + int32_t LowestRP = RT.estimate(Nodes.front()->I); InstNodePtrList LowestRPNodes; + if (C[Option::AllowLargerRPWindowRPThreshold] > 0 && + LowestRP >= static_cast(C[Option::AllowLargerRPWindowRPThreshold])) { + // If the lowest RP is larger than the threshold, we can allow larger RP window + LowestRP += static_cast(C[Option::AllowLargerRPWindowSize]); + } for (InstructionNode *Node : Nodes) { - if (RT.estimate(Node->I) == LowestRP) { + if (RT.estimate(Node->I) <= LowestRP) { LowestRPNodes.push_back(Node); } else { break; @@ -1899,42 +1921,67 @@ class BBScheduler { return Nodes; }; - auto getLoadsThatUnlockDPASes = [&](InstNodePtrList &Nodes, uint MaxLoadSize) -> InstNodePtrList & { - std::function(Instruction *)> getRealUsesThroughVS; - getRealUsesThroughVS = [&](Instruction *I) -> llvm::DenseSet { - llvm::DenseSet Uses; - - std::function collectUses = [&](Value *V) { - for (auto *U : RT.getRealUses(V)) { - auto *DV = VSA->getDestVector(U); - if (DV && DV->isVectorShuffle()) { - collectUses(DV->getLastIE()); - } else { - Uses.insert(U); - } - } - }; + auto getRealOpThroughVS = [&](Instruction *I) -> Instruction * { + Instruction *OpI = dyn_cast(RT.getRealOp(I)); + if 
(!OpI) { + return nullptr; + } + auto *DV = VSA->getDestVector(OpI); + if (DV && DV->isVectorShuffle()) { + auto *SourceVec = dyn_cast(DV->getSourceVec()); + if (!SourceVec) { + return nullptr; + } + return dyn_cast(RT.getRealOp(SourceVec)); + } + return OpI; + }; - collectUses(I); - return Uses; - }; + std::function(Instruction *)> getRealUsesThroughVS; + getRealUsesThroughVS = [&](Instruction *I) -> llvm::DenseSet { + llvm::DenseSet Uses; - auto getRealOpThroughVS = [&](Instruction *I) -> Instruction * { - Instruction *OpI = dyn_cast(RT.getRealOp(I)); - if (!OpI) { - return nullptr; + std::function collectUses = [&](Value *V) { + for (auto *U : RT.getRealUses(V)) { + auto *DV = VSA->getDestVector(U); + if (DV && DV->isVectorShuffle()) { + collectUses(DV->getLastIE()); + } else { + Uses.insert(U); + } } - auto *DV = VSA->getDestVector(OpI); - if (DV && DV->isVectorShuffle()) { - auto *SourceVec = dyn_cast(DV->getSourceVec()); - if (!SourceVec) { - return nullptr; + }; + + collectUses(I); + return Uses; + }; + + std::function(Instruction *)> getRealUsesThroughRematChains; + getRealUsesThroughRematChains = [&](Instruction *I) -> llvm::DenseSet { + llvm::DenseSet Uses; + + std::function collectUses = [&](Value *V) { + for (auto *U : RT.getRealUses(V)) { + auto *RematChainPattern = RCA->getRematChainPattern(U); + if (RematChainPattern) { + // If the use is a remat chain, collect the last instruction in the chain + Uses.insert(RematChainPattern->getRematTargetInst()); + } else { + Uses.insert(U); } - return dyn_cast(RT.getRealOp(SourceVec)); } - return OpI; }; + collectUses(I); + return Uses; + }; + + auto getLoadsThatUnlockDPASes = [&](InstNodePtrList &Nodes, uint MaxLoadSize) -> InstNodePtrList & { + // We first prioritize the DPASes that don't increase regpressure + // if there are loads that unlock these DPASes - filter out all ther instructions + // But if there are no DPASes that don't increase regpressure + // - we can also consider the ones that do increase + 
auto getLoadWidth = [&](Instruction *I) -> uint { if (GenIntrinsicInst *Intr = dyn_cast(I)) { if (Intr->getIntrinsicID() == GenISAIntrinsic::GenISA_LSC2DBlockRead || @@ -1949,6 +1996,8 @@ class BBScheduler { }; InstNodePtrList LoadsThatUnlockDPASes; + InstNodePtrList LoadsThatUnlockDPASesNoRPIncreasing; + for (InstructionNode *Node : Nodes) { if (!is2dBlockRead(Node->I) || getLoadWidth(Node->I) > MaxLoadSize) { continue; @@ -1962,6 +2011,13 @@ class BBScheduler { if (isDPAS(I)) { bool OneOpIsDPAS = false; + bool FirstOpIsZero = false; + + auto *FirstOp = dyn_cast(I->getOperand(0)); + if (FirstOp && (isa(FirstOp) || FirstOp->isNullValue())) { + FirstOpIsZero = true; + } + int NumOps = static_cast(I->getNumOperands()); for (auto &Op : I->operands()) { Instruction *OpI = dyn_cast(Op.get()); @@ -1980,12 +2036,18 @@ class BBScheduler { } if (NumOps == 0) { LoadsThatUnlockDPASes.push_back(Node); + if (!FirstOpIsZero) { + LoadsThatUnlockDPASesNoRPIncreasing.push_back(Node); + } break; } } } } - if (LoadsThatUnlockDPASes.size() > 0) { + + if (LoadsThatUnlockDPASesNoRPIncreasing.size() > 0) { + Nodes = std::move(LoadsThatUnlockDPASesNoRPIncreasing); + } else if (LoadsThatUnlockDPASes.size() > 0) { Nodes = std::move(LoadsThatUnlockDPASes); } return Nodes; @@ -2050,6 +2112,161 @@ class BBScheduler { return Nodes; }; + auto filterOutNotReadyIcmp = [&](InstNodePtrList &Nodes) -> InstNodePtrList & { + // Heuristic in order not to put ICMP that is used by a select too early. 
+ // Schedule it only when the select is ready + + InstNodePtrList NonFilteredNodes; + for (InstructionNode *Node : Nodes) { + if (isa(Node->I)) { + bool IsReady = true; + User *U = IGCLLVM::getUniqueUndroppableUser(Node->I); + if (!U) { + NonFilteredNodes.push_back(Node); + continue; + } + SelectInst *SI = dyn_cast(U); + if (!SI) { + NonFilteredNodes.push_back(Node); + continue; + } + // If the select instruction is not ready, we need to filter out the icmp instruction + InstructionNode *SelectNode = G.InstToNode[SI]; + for (const auto &PN : SelectNode->Preds) { + if (PN->Src->I == Node->I) { + continue; + } + if (isa(PN->Src->I) || isa(PN->Src->I)) { + continue; + } + Instruction *OpI = dyn_cast(PN->Src->I); + if (!OpI) { + continue; + } + + if (!RT.inBBCurrent(OpI)) { + // if the instruction is in BBCurrent, then it is ready + IsReady = false; + break; + } + } + if (IsReady) { + NonFilteredNodes.push_back(Node); + } + // else it's filtered out, until the operand of the select is ready + } + else { + NonFilteredNodes.push_back(Node); + } + } + if (NonFilteredNodes.size() > 0) { + Nodes = std::move(NonFilteredNodes); + } + return Nodes; + }; + + auto focusLoadsOnOneDPAS = [&](InstNodePtrList &Nodes) -> InstNodePtrList & { + // If all Nodes are 2d block loads, choose the dpas user with the lowest initial number and filter out + // all the remaining loads. 
This is needed to avoid a situation when we schedule a lot of small loads first, + // but all the DPASes wait for some load that is in the end + if (Nodes.size() == 1) { + return Nodes; + } + + InstNodePtrList NonFilteredNodes; + if (std::all_of(Nodes.begin(), Nodes.end(), + [&](InstructionNode *Node) { return is2dBlockRead(Node->I); })) { + + // Get the first DPAS user + InstructionNode *FirstDPASUser = nullptr; + for (InstructionNode *Node : Nodes) { + for (auto *U : getRealUsesThroughVS(Node->I)) { + auto *I = dyn_cast(U); + if (!I) { + continue; + } + + if (isDPAS(I)) { + if (!FirstDPASUser || (G.InstToNode[I]->OriginalPosition < FirstDPASUser->OriginalPosition)) { + FirstDPASUser = G.InstToNode[I]; + + NonFilteredNodes = {Node}; + } else if (G.InstToNode[I] == FirstDPASUser) { + NonFilteredNodes.push_back(Node); + } + } + } + } + + if (NonFilteredNodes.size() > 0) { + Nodes = std::move(NonFilteredNodes); + } + } + + return Nodes; + }; + + auto filterOutNotUnblockingExistingVectorInst = [&](InstNodePtrList &Nodes) -> InstNodePtrList & { + // If some values are currently hanging because of creating a vector instruction out of scalars + // we prioritize the candidates that unblock the other elements of the vector + + // This helps to resolve the issue when we schedule several IEs to the 0th element of different vectors + // increasing the regpressure, because the GRF space for the other elements is immediately reserved + // but the vectors are not fully populated and we can't use them + + DenseSet HangingElements = RT.getHangingS2VInstructions(); + if (HangingElements.empty()) { + // If there are no hanging elements, we don't need to filter out anything + return Nodes; + } + + InstNodePtrList NonFilteredNodes; + for (InstructionNode *Node : Nodes) { + if (HangingElements.count(Node->I) > 0) { + // If the instruction is already hanging, we don't need to filter it out + NonFilteredNodes.push_back(Node); + continue; + } + for (Value *V : 
getRealUsesThroughRematChains(Node->I)) { + if (Instruction *I = dyn_cast(V)) { + if (HangingElements.count(I) > 0) { + NonFilteredNodes.push_back(Node); + break; // No need to check other uses, we already found a use that unblocks the vector + } + } + } + } + if (NonFilteredNodes.size() > 0) { + Nodes = std::move(NonFilteredNodes); + } + return Nodes; + }; + + auto getMaxNumWaveAll = [&](InstNodePtrList &Nodes) -> InstNodePtrList & { + // Experimental heuristic: Add only maxnum (llvm.maxnum) and waveall instructions to the list + // The idea is that maxnum->waveall(max) is a common pattern + // that usually leads to decreasing the register pressure + // because all the lanes converge to the same value + + InstNodePtrList NonFilteredNodes; + for (InstructionNode *Node : Nodes) { + if (GenIntrinsicInst *Intr = dyn_cast(Node->I)) { + if (Intr->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveAll) { + NonFilteredNodes.push_back(Node); + } + } + else if (IntrinsicInst *Intr = llvm::dyn_cast(Node->I)) { + if (Intr->getIntrinsicID() == Intrinsic::maxnum) { + NonFilteredNodes.push_back(Node); + } + } + } + if (NonFilteredNodes.size() > 0) { + Nodes = std::move(NonFilteredNodes); + } + return Nodes; + }; + // === === // === Choosing if we have instructions to schedule immediately === // === === @@ -2155,6 +2372,11 @@ class BBScheduler { IGC_ASSERT(ReadyList.size() > 0); + PrintDumpLevel(VerbosityLevel::Medium, "Choosing from the ready list:\n"); + for (InstructionNode *N : ReadyList) { + PrintInstructionDumpLevel(VerbosityLevel::Medium, N->I); + } + // Filter ReadyList so that only if the instruction is Handicapped // It will remain only if the current regpressure is lower that the Handicapped value InstNodePtrList FilteredReadyList; @@ -2174,6 +2396,7 @@ class BBScheduler { } FilteredReadyList = filterOutNotReadyRematInstructions(FilteredReadyList); + FilteredReadyList = filterOutNotReadyIcmp(FilteredReadyList); IGC_ASSERT(FilteredReadyList.size() > 0); @@ -2189,6 +2412,9 
@@ class BBScheduler { // regpressure, if several, choose the one with the least OriginalPosition FilteredReadyList = getMaxWeightNodes(FilteredReadyList); FilteredReadyList = getLowestRegpressureNodes(FilteredReadyList); + if (C[Option::FocusLoadsOnOneDPAS]) { + FilteredReadyList = focusLoadsOnOneDPAS(FilteredReadyList); + } Node = getFirstNode(FilteredReadyList); bool IsRegpressureCritical = RT.isRegpressureCritical(Node->I); CanClone = RT.isRegpressureHigh(Node->I) || isLargeLoad(Node->I); @@ -2208,6 +2434,9 @@ class BBScheduler { FilteredReadyList = getLargeBlockLoadsIfExist(FilteredReadyList); } + if (C[Option::PrioritizeMaxnumWaveallHighRP]) { + FilteredReadyList = getMaxNumWaveAll(FilteredReadyList); + } if (C[Option::PrioritizeDPASHighRP]) { // Experimental heuristic: prioritize DPAS and the instructions that make it possible to // schedule DPAS earlier @@ -2219,8 +2448,16 @@ class BBScheduler { FilteredReadyList = getLoadsThatUnlockDPASes(FilteredReadyList, C[Option::PrioritizeLoadsThatUnlockDPASesHighRP_MaxLoadSize]); } + if (C[Option::PrioritizePopulatingOneVectorHighRP]) { + FilteredReadyList = filterOutNotUnblockingExistingVectorInst(FilteredReadyList); + } FilteredReadyList = getLowestRegpressureNodes(FilteredReadyList); + + if (C[Option::FocusLoadsOnOneDPAS]) { + FilteredReadyList = focusLoadsOnOneDPAS(FilteredReadyList); + } + // If we have several nodes with the same regpressure, choose the one with the highest MaxWeight FilteredReadyList = getMaxWeightNodes(FilteredReadyList, C[Option::UseHighRPWeight] == 1); diff --git a/IGC/Compiler/CISACodeGen/CodeSchedulingOptionsDef.h b/IGC/Compiler/CISACodeGen/CodeSchedulingOptionsDef.h index 6f4ba39dc449..f7f66989e113 100644 --- a/IGC/Compiler/CISACodeGen/CodeSchedulingOptionsDef.h +++ b/IGC/Compiler/CISACodeGen/CodeSchedulingOptionsDef.h @@ -32,6 +32,8 @@ DECLARE_SCHEDULING_OPTION(WeightDPASDstDepHighRP, 6000, "Edge weight for DPAS destination dependency under high register pressure") 
DECLARE_SCHEDULING_OPTION(WeightExtendedMathDstDep, 200, "Edge weight for extended math destination dependency") DECLARE_SCHEDULING_OPTION(WeightWaveAllDstDep, 10, "Edge weight for wave all destination dependency") +DECLARE_SCHEDULING_OPTION(WeightWaveAllDstDepHighRP, 20, "Edge weight for wave all destination dependency under high " + "register pressure") DECLARE_SCHEDULING_OPTION(WeightUnknownMemoryReadDstDep, 500, "Edge weight for unknown memory read destination dependency") DECLARE_SCHEDULING_OPTION(WeightUnknownVectorShuffleDstDep, 50, @@ -64,6 +66,22 @@ DECLARE_SCHEDULING_OPTION(PrioritizeLoadsThatUnlockDPASesHighRP, 1, DECLARE_SCHEDULING_OPTION(PrioritizeLoadsThatUnlockDPASesHighRP_MaxLoadSize, 32, "Heuristic: Maximum load size (in number of elements) to consider for " "prioritizing loads that unlock DPAS instructions") +DECLARE_SCHEDULING_OPTION(FocusLoadsOnOneDPAS, 1, + "Heuristic: Focus loads on one DPAS instruction in case we have to choose from " + "many loads") +DECLARE_SCHEDULING_OPTION(AllowLargerRPWindowRPThreshold, 200, + "Heuristic: Allow larger register pressure window if register pressure is higher than " + "a threshold, so allow also the instructions that have not lowest but similar register " + "pressure, the threshold in bytes") +DECLARE_SCHEDULING_OPTION(AllowLargerRPWindowSize, 64, + "Heuristic: Allow larger register pressure window if register pressure is higher than " + "a threshold, so allow also the instructions that have not lowest but similar register " + "pressure, the size of the window in bytes") +DECLARE_SCHEDULING_OPTION(PrioritizeMaxnumWaveallHighRP, 0, + "Heuristic: Maxnum and Waveall instructions are prioritized when register pressure is " + "high") +DECLARE_SCHEDULING_OPTION(PrioritizePopulatingOneVectorHighRP, 1, + "Heuristic: Prioritize populating one vector when register pressure is high") // RP management control options DECLARE_SCHEDULING_OPTION(GreedyRPThresholdDelta, 20, "Threshold delta for greedy register pressure 
scheduling") diff --git a/IGC/Compiler/CISACodeGen/RematChainsAnalysis.cpp b/IGC/Compiler/CISACodeGen/RematChainsAnalysis.cpp index 24d016abc1e3..c4407512fb76 100644 --- a/IGC/Compiler/CISACodeGen/RematChainsAnalysis.cpp +++ b/IGC/Compiler/CISACodeGen/RematChainsAnalysis.cpp @@ -43,7 +43,7 @@ static bool hasRematMetadata(llvm::Value *V) { return false; } -Value *getAddressOperand(llvm::Instruction *I) { +Value *getRematedOperand(llvm::Instruction *I) { if (!I) return nullptr; // Check if the instruction is a Load or Store and return the address operand @@ -51,6 +51,9 @@ Value *getAddressOperand(llvm::Instruction *I) { return LI->getPointerOperand(); } else if (auto *SI = dyn_cast(I)) { return SI->getPointerOperand(); + } else if (auto *SelI = dyn_cast(I)) { + // For SelectInst, return the condition operand + return SelI->getCondition(); } // If it's not a Load or Store, return nullptr @@ -69,7 +72,7 @@ RematChainSet getRematChain(Value *V, Instruction *User) { if (!isa(I) && !isa(I) && !isa(I) && !isa(I) - && !isa(I) && !isa(I)) { + && !isa(I) && !isa(I) && !isa(I)) { return {}; } @@ -96,15 +99,15 @@ RematChainSet getRematChain(Value *V, Instruction *User) { bool RematChainsAnalysis::runOnFunction(llvm::Function &F) { for (auto &BB : F) { for (Instruction &I : BB) { - Value *AddrOperand = getAddressOperand(&I); - if (!AddrOperand) + Value *Operand = getRematedOperand(&I); + if (!Operand) continue; - Instruction *AI = dyn_cast(AddrOperand); + Instruction *AI = dyn_cast(Operand); if (!AI) continue; - RematChainSet Chain = getRematChain(AddrOperand, &I); + RematChainSet Chain = getRematChain(Operand, &I); if (!Chain.empty()) { RematChainPatterns.push_back(std::make_unique(Chain, AI, &I)); diff --git a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp index 19cb3dfb9e57..15594de8b2ad 100644 --- a/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp +++ b/IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp @@ -244,7 +244,8 @@ void 
AddAnalysisPasses(CodeGenContext &ctx, IGCPassManager &mpm) { ctx.m_instrTypes.numInsts >= IGC_GET_FLAG_VALUE(CodeLoopSinkingMinSize)) { mpm.add(new CodeLoopSinking()); } - if (IGC_IS_FLAG_DISABLED(DisableCodeScheduling) && (ctx.type == ShaderType::OPENCL_SHADER)) { + if (IGC_IS_FLAG_DISABLED(DisableCodeScheduling) && (ctx.type == ShaderType::OPENCL_SHADER) && + (ctx.platform.isCoreChildOf(IGFX_XE_HPC_CORE) || ctx.platform.isCoreChildOf(IGFX_XE2_HPG_CORE))) { if (IGC_IS_FLAG_DISABLED(CodeSchedulingOnlyRecompilation) || ctx.m_retryManager.AllowCodeScheduling()) { mpm.add(new CodeScheduling()); } diff --git a/IGC/Compiler/tests/CodeScheduling/remat-inst-handling.ll b/IGC/Compiler/tests/CodeScheduling/remat-inst-handling.ll index 983bd961a46a..8fae2496b9bf 100644 --- a/IGC/Compiler/tests/CodeScheduling/remat-inst-handling.ll +++ b/IGC/Compiler/tests/CodeScheduling/remat-inst-handling.ll @@ -27,22 +27,16 @@ define spir_kernel void @test_remat(ptr addrspace(1) %A, i32 %x) { ; CHECK: entry: ; CHECK: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK: [[REMAT1_1:%.*]] = or i32 [[X:%.*]], 10, !remat !0 -; CHECK: [[REMAT1_2:%.*]] = shl nuw nsw i32 [[REMAT1_1]], 6, !remat !0 -; CHECK: [[REMAT1_3:%.*]] = or i32 [[REMAT1_2]], 16, !remat !0 -; CHECK: [[REMAT1_4:%.*]] = or i32 [[REMAT1_3]], [[REMAT1_2]], !remat !0 -; CHECK: [[REMAT1_5:%.*]] = shl nuw nsw i32 [[REMAT1_4]], 1, !remat !0 -; CHECK: [[CLONED_1:%.*]] = inttoptr i32 [[REMAT1_5]] to ptr addrspace(3), !remat !0 +; ... +; CHECK: [[REMAT1_5:%.*]] = shl nuw nsw i32 [[REMAT1_4:.*]], 1 +; CHECK: [[CLONED_1:%.*]] = inttoptr i32 [[REMAT1_5]] to ptr addrspace(3) ; CHECK: [[LOAD_1:%.*]] = load <8 x i16>, ptr addrspace(3) [[CLONED_1]], align 2 - -; CHECK: [[REMAT2_1:%.*]] = or i32 [[X]], 18 -; CHECK: [[REMAT2_2:%.*]] = shl nuw nsw i32 [[REMAT2_1]], 6 -; CHECK: [[REMAT2_3:%.*]] = or i32 [[REMAT2_2]], 16 -; CHECK: [[REMAT2_4:%.*]] = shl nuw nsw i32 [[REMAT2_3]], 1 +; ... 
+; CHECK: [[REMAT2_4:%.*]] = shl nuw nsw i32 [[REMAT2_3:.*]], 1 ; CHECK: [[CLONED_2:%.*]] = inttoptr i32 [[REMAT2_4]] to ptr addrspace(3) ; CHECK: [[LOAD_2:%.*]] = load <8 x i16>, ptr addrspace(3) [[CLONED_2]], align 2 -; CHECK: [[REMAT3_1:%.*]] = or i32 [[X]], 10, !remat !0 +; CHECK: [[REMAT3_1:%.*]] = or i32 [[X:.*]], 10, !remat !0 ; CHECK: [[REMAT3_2:%.*]] = shl nuw nsw i32 [[REMAT3_1]], 6, !remat !0 ; CHECK: [[REMAT3_3:%.*]] = or i32 [[REMAT3_2]], 16, !remat !0 ; CHECK: [[REMAT3_4:%.*]] = shl nuw nsw i32 [[REMAT3_3]], 1, !remat !0 @@ -108,6 +102,33 @@ bb1: ret void } + +define spir_kernel void @test_remat_select(ptr addrspace(1) %A, i32 %x, i32 %z) { +; CHECK-LABEL: @test_remat_select( +; CHECK: bb1: +; CHECK: [[DPAS2:%.*]] = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32(<8 x float> zeroinitializer, <8 x i16> undef, <8 x i32> undef, i32 1, i32 1, i32 1, i32 1, i1 false) +; CHECK: [[REMAT_OR:%.*]] = or i32 [[X:%.*]], 10, !remat !0 +; CHECK: [[REMAT_ICMP:%.*]] = icmp eq i32 [[REMAT_OR]], 15, !remat !0 +; CHECK: [[SEL:%.*]] = select i1 [[REMAT_ICMP]], i32 [[X]], i32 [[Z:%.*]] +; CHECK: [[ADD:%.*]] = add i32 [[REMAT_OR]], 2000 +; CHECK: ret void +; +entry: + br label %bb1 + +bb1: + %remat_or = or i32 %x, 10, !remat !0 + %remat_icmp = icmp eq i32 %remat_or, 15, !remat !0 + %sel = select i1 %remat_icmp, i32 %x, i32 %z + %add = add i32 %remat_or, 2000 + + + %dpas2 = call <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32( + <8 x float> zeroinitializer, <8 x i16> undef, <8 x i32> undef, + i32 1, i32 1, i32 1, i32 1, i1 false) + ret void +} + declare <8 x float> @llvm.genx.GenISA.sub.group.dpas.v8f32.v8f32.v8i16.v8i32( <8 x float>, <8 x i16>, <8 x i32>, i32, i32, i32, i32, i1) diff --git a/IGC/common/igc_flags.h b/IGC/common/igc_flags.h index bed2dda36fc7..41608f013f81 100644 --- a/IGC/common/igc_flags.h +++ b/IGC/common/igc_flags.h @@ -279,7 +279,7 @@ DECLARE_IGC_REGKEY(DWORD, LS_splitThresholdDelta_GRF, 2, "Register 
pressure must exceed total GRFs by this much for the load splitting to fire up.", false) // Code Scheduling -DECLARE_IGC_REGKEY(bool, DisableCodeScheduling, true, "Disable local code scheduling", true) +DECLARE_IGC_REGKEY(bool, DisableCodeScheduling, false, "Disable local code scheduling", true) DECLARE_IGC_REGKEY(bool, CodeSchedulingOnlyRecompilation, false, "Enable code scheduling only on 2nd try", true) DECLARE_IGC_REGKEY(bool, EnableCodeSchedulingIfNoSpills, false, "Try rescheduling also when there are no spills", true) From 22dc83aac699cbd7a99ae4dc18b858ccf0063743 Mon Sep 17 00:00:00 2001 From: "Liou, Jhe-Yu" Date: Thu, 4 Sep 2025 16:37:30 +0000 Subject: [PATCH 3/5] Disable PromoteLoopUnrollWithAlloc for OCL Disable PromoteLoopUnrollWithAlloc for OCL (cherry picked from commit bab52a98ce97205380fbe9c64c4246d943368cb4) --- IGC/Compiler/GenTTI.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/IGC/Compiler/GenTTI.cpp b/IGC/Compiler/GenTTI.cpp index 4cedf6300685..a9321ff95ecb 100644 --- a/IGC/Compiler/GenTTI.cpp +++ b/IGC/Compiler/GenTTI.cpp @@ -359,15 +359,17 @@ void GenIntrinsicsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (AllocaFound) { // LLVM default only to 10, boost to UnrollMaxCountForAlloca UP.MaxIterationsCountToAnalyze = UnrollMaxCountForAlloca; - UP.Threshold += ThresholdBoost; UP.UpperBound = true; UP.Force = UnrollLoopForCodeSizeOnly ? 
false : true; - LLVM_DEBUG(dbgs() << "Increasing L:" << L->getName() << " threshold to " << UP.Threshold - << " due to Alloca accessed by:"); - for (const auto &pair : isGEPLoopInduction) - LLVM_DEBUG(dbgs() << " " << pair.first->getName()); - LLVM_DEBUG(dbgs() << " \n"); + if (ctx->type != ShaderType::OPENCL_SHADER) { + UP.Threshold += ThresholdBoost; + LLVM_DEBUG(dbgs() << "Increasing L:" << L->getName() << " threshold to " << UP.Threshold + << " due to Alloca accessed by:"); + for (const auto &pair : isGEPLoopInduction) + LLVM_DEBUG(dbgs() << " " << pair.first->getName()); + LLVM_DEBUG(dbgs() << " \n"); + } } } From 300f146e62494c68dd373e35e7436f93e77360e1 Mon Sep 17 00:00:00 2001 From: "Sukhov, Egor" Date: Tue, 2 Sep 2025 09:22:48 +0000 Subject: [PATCH 4/5] Fix for IGCVectorizer insertpoint Now for small blocks consisting of 2 special case instructions PHI & Terminator (BR or RET for example) we return not firstnonPHI but the last PHI. (cherry picked from commit 0b9518e0194b68f8f7172caa12dfe03736930bd9) --- IGC/Compiler/CISACodeGen/IGCVectorizer.cpp | 2 + .../vectorizer-special-bb-to-insert.ll | 70 +++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 IGC/Compiler/tests/IGCVectorizer/vectorizer-special-bb-to-insert.ll diff --git a/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp b/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp index e0b9f1b57917..d49dd16cbf8c 100644 --- a/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp +++ b/IGC/Compiler/CISACodeGen/IGCVectorizer.cpp @@ -460,6 +460,8 @@ Instruction *IGCVectorizer::getInsertPointForVector(VecArr &Arr) { // if insert point is PHI, shift it to the first nonPHI to be safe if (llvm::isa(InsertPoint)) InsertPoint = InsertPoint->getParent()->getFirstNonPHI(); + if (InsertPoint->isTerminator()) + InsertPoint = InsertPoint->getPrevNonDebugInstruction(); return InsertPoint; } diff --git a/IGC/Compiler/tests/IGCVectorizer/vectorizer-special-bb-to-insert.ll 
b/IGC/Compiler/tests/IGCVectorizer/vectorizer-special-bb-to-insert.ll new file mode 100644 index 000000000000..005a1ed0e31c --- /dev/null +++ b/IGC/Compiler/tests/IGCVectorizer/vectorizer-special-bb-to-insert.ll @@ -0,0 +1,70 @@ +;=========================== begin_copyright_notice ============================ +; +; Copyright (C) 2025 Intel Corporation +; +; SPDX-License-Identifier: MIT +; +;============================ end_copyright_notice ============================= + +; RUN: igc_opt -S --igc-vectorizer -dce --regkey=VectorizerDepWindowMultiplier=6 < %s 2>&1 | FileCheck %s + +; CHECK-LABEL: bb3: +; CHECK-NEXT: [[PHI:%.*]] = phi float +; CHECK-NEXT: [[VECTOR_0:%.*]] = insertelement <8 x float> undef, float [[PHI]], i32 0 +; CHECK-NEXT: [[VECTOR_1:%.*]] = insertelement <8 x float> [[VECTOR_0]], float [[PHI]], i32 1 +; CHECK-NEXT: [[VECTOR_2:%.*]] = insertelement <8 x float> [[VECTOR_1]], float [[PHI]], i32 2 +; CHECK-NEXT: [[VECTOR_3:%.*]] = insertelement <8 x float> [[VECTOR_2]], float [[PHI]], i32 3 +; CHECK-NEXT: [[VECTOR_4:%.*]] = insertelement <8 x float> [[VECTOR_3]], float [[PHI]], i32 4 +; CHECK-NEXT: [[VECTOR_5:%.*]] = insertelement <8 x float> [[VECTOR_4]], float [[PHI]], i32 5 +; CHECK-NEXT: [[VECTOR_6:%.*]] = insertelement <8 x float> [[VECTOR_5]], float [[PHI]], i32 6 +; CHECK-NEXT: [[VECTOR_7:%.*]] = insertelement <8 x float> [[VECTOR_6]], float [[PHI]], i32 7 +; CHECK-NEXT: br i1 {{%.*}}, label {{%.*}}, label {{%.*}} + + +define spir_kernel void @barney() { +bb: + %tmp = fcmp une float 0.000000e+00, 0.000000e+00 + br label %bb1 + +bb1: ; preds = %bb + br i1 false, label %bb3, label %bb2 + +bb2: ; preds = %bb1 + br label %bb3 + +bb3: ; preds = %bb2, %bb1 + %tmp4 = phi float [ 0.000000e+00, %bb1 ], [ 0.000000e+00, %bb2 ] + br i1 %tmp, label %bb5, label %bb6 + +bb5: ; preds = %bb3 + br label %bb6 + +bb6: ; preds = %bb5, %bb3 + %tmp7 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp8 = fmul reassoc nsz arcp contract float 0.000000e+00, 
%tmp4 + %tmp9 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp10 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp11 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp12 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp13 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp14 = fmul reassoc nsz arcp contract float 0.000000e+00, %tmp4 + %tmp15 = insertelement <8 x float> zeroinitializer, float %tmp7, i64 0 + %tmp16 = insertelement <8 x float> %tmp15, float %tmp8, i64 1 + %tmp17 = insertelement <8 x float> %tmp16, float %tmp9, i64 2 + %tmp18 = insertelement <8 x float> %tmp17, float %tmp10, i64 3 + %tmp19 = insertelement <8 x float> %tmp18, float %tmp11, i64 4 + %tmp20 = insertelement <8 x float> %tmp19, float %tmp12, i64 5 + %tmp21 = insertelement <8 x float> %tmp20, float %tmp13, i64 6 + %tmp22 = insertelement <8 x float> %tmp21, float %tmp14, i64 7 + %tmp23 = bitcast <8 x float> %tmp22 to <8 x i32> + call void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false, i32 0, <8 x i32> %tmp23) + ret void +} + +declare void @llvm.genx.GenISA.LSC2DBlockWrite.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, <8 x i32>) + +!igc.functions = !{!0} + +!0 = distinct !{void ()* @barney, !1} +!1 = distinct !{!2} +!2 = distinct !{!"sub_group_size", i32 16} From bd67908e0b06013e83ee02c9f3ff10cf976ed96a Mon Sep 17 00:00:00 2001 From: Anastasia Bodrova Date: Mon, 1 Sep 2025 10:03:34 +0000 Subject: [PATCH 5/5] Changes in code. 
(cherry picked from commit cf2dc92ae5d8c0fc0fb70c00079690fbb35cdbcf) --- IGC/Compiler/CISACodeGen/DeSSA.cpp | 2 +- .../CISACodeGen/VariableReuseAnalysis.cpp | 72 +++++++-------- .../CISACodeGen/VariableReuseAnalysis.hpp | 4 +- .../EmitVISAPass/inline_asm_vectoralias.ll | 89 ------------------- 4 files changed, 36 insertions(+), 131 deletions(-) delete mode 100644 IGC/Compiler/tests/EmitVISAPass/inline_asm_vectoralias.ll diff --git a/IGC/Compiler/CISACodeGen/DeSSA.cpp b/IGC/Compiler/CISACodeGen/DeSSA.cpp index 1f386f0cab84..263e21aea84d 100644 --- a/IGC/Compiler/CISACodeGen/DeSSA.cpp +++ b/IGC/Compiler/CISACodeGen/DeSSA.cpp @@ -1542,7 +1542,7 @@ bool DeSSA::isAliasee(Value *V) const { // c = 2 // ... // L: = a -// = c +// = b // // In this case, if a is aliased to b, a would get 2 at L, but the correct // value should be 1. In order to find out if a can be aliased to b, it diff --git a/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.cpp b/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.cpp index 72ca0b44fe28..5165a6bb85a7 100644 --- a/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.cpp +++ b/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.cpp @@ -887,16 +887,16 @@ bool VariableReuseAnalysis::getAllInsEltsIfAvailable(InsertElementInst *FirstIEI IGC_ASSERT_MESSAGE(IEI_ix < nelts, "ICE: IEI's index out of bound!"); SVecInsEltInfo &InsEltInfo = AllIEIs[IEI_ix]; if (InsEltInfo.IEI) { - // This element is inserted more than once, skip. + // One element is inserted more than once, skip. 
return false; } InsEltInfo.IEI = I; InsEltInfo.Elt = E; InsEltInfo.FromVec = V; InsEltInfo.FromVec_eltIx = V_ix; - - // So far, E is never nullptr (could be in the future) - InsEltInfo.EEI = dyn_cast_or_null(E); + if (E) { + InsEltInfo.EEI = dyn_cast(E); + } if (!I->hasOneUse()) { break; @@ -923,24 +923,19 @@ bool VariableReuseAnalysis::getAllInsEltsIfAvailable(InsertElementInst *FirstIEI if (tV == nullptr) return false; - // Expect all IEIs are in the same DeSSA CC (DeSSA special-handles IEIs) + // Expect node values for all IEIs are identical. In general, if they + // are in the same DeSSA CC, that would be fine. Value *tV_nv = m_DeSSA->getNodeValue(tV); if (V_root != getRootValue(tV_nv)) return false; Value *E = AllIEIs[i].Elt; - if (!E || isa(E)) { - // constant is okay for either non-uniform or uniform. - continue; - } Value *FromVec = AllIEIs[i].FromVec; - if (FromVec) { - Value *FromVec_nv = m_DeSSA->getNodeValue(FromVec); - // check if FromVec has been coalesced with IEI already by DeSSA. - // (Wouldn't happen under current DeSSA, but might happen in future) - if (V_root == getRootValue(FromVec_nv)) - return false; - } + Value *FromVec_nv = m_DeSSA->getNodeValue(FromVec); + // check if FromVec has been coalesced with IEI already by DeSSA. + // (Wouldn't happen under current DeSSA, but might happen in future) + if (V_root == getRootValue(FromVec_nv)) + return false; // Make sure FromVec or E have the same uniformness as V. if ((E && V_dep != m_WIA->whichDepend(E)) || (FromVec && V_dep != m_WIA->whichDepend(FromVec))) @@ -974,13 +969,17 @@ Value *VariableReuseAnalysis::traceAliasValue(Value *V) { } // -// Returns true if there is the following pattern; otherwise return false. +// Returns true if the following is true // IEI = insertElement Vec, S, -// 1. S is from another vector V. -// S = extractElement V, -// In this case, S is the element denoted by (V, V_ix) -// 2. otherwise, V=nullptr, V_ix=0. -// S is a candidate and could be alias to the vector. 
+// Return false, otherwise. +// +// When the above condition is true, V and V_ix are used for the +// following cases: +// 1. S is from another vector V. +// S = extractElement V, +// S is the element denoted by (V, V_ix) +// 2. otherwise, V=nullptr, V_ix=0. +// S is a candidate inserted and could be alias to the vector. // // Input: IEI // Output: IEI_ix, S, V, V_ix @@ -1000,9 +999,9 @@ bool VariableReuseAnalysis::getElementValue(InsertElementInst *IEI, int &IEI_ix, IEI_ix = (int)CI->getZExtValue(); Value *elem0 = IEI->getOperand(1); - if (hasBeenPayloadCoalesced(elem0) || isOrCoalescedWithArg(elem0)) { - // If elem0 has been payload-coalesced or it has been aliased to - // an argument, skip it. + if (hasBeenPayloadCoalesced(elem0) || isa(elem0) || isOrCoalescedWithArg(elem0)) { + // If elem0 has been payload-coalesced, is constant, + // or it has been aliased to an argument, skip it. return false; } @@ -1047,10 +1046,11 @@ void VariableReuseAnalysis::InsertElementAliasing(Function *F) { // IGC Key VectorAlias controls vectorAlias optimiation. // - // VectorAlias (also from m_pCtx->getVectorCoalescingControl()) - // 0x0: disable vector aliasing - // 0x1: subvec aliasing for isolated values (getRootValue()=null) - // 0x2: subvec aliasing for both isolated and non-isolated value) + // Do it if VectorAlias != 0. 
+ // VectorAlias=0x1: subvec aliasing for isolated values + // (getRootValue()=null) + // =0x2: subvec aliasing for both isolated and non-isolated + // value) const auto control = (m_pCtx->getVectorCoalescingControl() & 0x3); // To avoid increasing GRF pressure, skip if F is too large or not an entry const int32_t NumBBThreshold = IGC_GET_FLAG_VALUE(VectorAliasBBThreshold); @@ -1253,7 +1253,6 @@ bool VariableReuseAnalysis::processInsertTo(BasicBlock *BB, VecInsEltInfoTy &All isSubCandidate = false; } - // So far, Elt is never nullptr (could be in the future) if (Elt && Sub == nullptr && skipScalarAliaser(BB, Elt)) { // Skip scalar coalescing isSubCandidate = false; @@ -1434,11 +1433,8 @@ VariableReuseAnalysis::AState VariableReuseAnalysis::getCandidateStateUse(Value } } else if (StoreInst *SI = dyn_cast(Val)) { retSt = AState::TARGET; - } else if (CallInst *CallI = dyn_cast(Val)) { - if (CallI->isInlineAsm()) - retSt = AState::TARGET; - else - return AState::SKIP; + } else if (isa(Val)) { + return AState::SKIP; } } return retSt; @@ -1464,9 +1460,7 @@ VariableReuseAnalysis::AState VariableReuseAnalysis::getCandidateStateDef(Value } } else if (LoadInst *SI = dyn_cast(Val)) { return AState::TARGET; - } else if (CallInst *CallI = dyn_cast(Val)) { - if (CallI->isInlineAsm()) - return AState::TARGET; + } else if (isa(Val)) { return AState::SKIP; } return AState::OK; @@ -1474,7 +1468,7 @@ VariableReuseAnalysis::AState VariableReuseAnalysis::getCandidateStateDef(Value // Vector alias disables extractMask optimization. This function // checks if extractMask optim can be applied. And the caller -// will decide whether to favor extractMask optimization or not. +// will decide whether to favor extractMask optimization. 
bool VariableReuseAnalysis::isExtractMaskCandidate(Value *V) const { auto BIT = [](int n) { return (uint32_t)(1 << n); }; diff --git a/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.hpp b/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.hpp index 612ca1df376a..2ade6f51d087 100644 --- a/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.hpp +++ b/IGC/Compiler/CISACodeGen/VariableReuseAnalysis.hpp @@ -131,8 +131,8 @@ struct SVecInsEltInfo { llvm::InsertElementInst *IEI; llvm::Value *Elt; - // EEI, if not nullptr, is used as scalar operand of IEI and is the same as - // (FromVec, FromVec_eltIx). + // If Elt is null, EEI must not be null. EEI is used as scalar operand + // in IEI and is the same as (FromVec, FromVec_eltIx). llvm::ExtractElementInst *EEI; llvm::Value *FromVec; int FromVec_eltIx; diff --git a/IGC/Compiler/tests/EmitVISAPass/inline_asm_vectoralias.ll b/IGC/Compiler/tests/EmitVISAPass/inline_asm_vectoralias.ll deleted file mode 100644 index ea12ea04c0f3..000000000000 --- a/IGC/Compiler/tests/EmitVISAPass/inline_asm_vectoralias.ll +++ /dev/null @@ -1,89 +0,0 @@ -;=========================== begin_copyright_notice ============================ -; -; Copyright (C) 2023 Intel Corporation -; -; SPDX-License-Identifier: MIT -; -;============================ end_copyright_notice ============================= - -; To vector alias on inline asm - -; REQUIRES: llvm-14-plus, regkeys - -; RUN: igc_opt --opaque-pointers --CheckInstrTypes --igc-update-instrtypes-on-run -inputocl --neo \ -; RUN: -platformpvc -igc-emit-visa -regkey DumpVISAASMToConsole,VectorAlias=1 -simd-mode 16 %s \ -; RUN: | FileCheck %s - -; CHECK-LABEL: .function -; CHECK: lsc_load_block2d.ugm (M1, 1) [[INPUT:.*]]:d8.16x8nt flat[{{.+}}] -; CHECK: mov (M1_NM, 16) [[OUTPUT:.*]](0,0)<2> [[INPUT]](0,0)<4;1,0> -; CHECK: mov (M1_NM, 16) [[OUTPUT]](0,1)<2> [[INPUT]](0,1)<4;1,0> -; CHECK: mov (M1_NM, 16) [[OUTPUT]](1,0)<2> [[INPUT]](0,2)<4;1,0> -; CHECK: mov (M1_NM, 16) [[OUTPUT]](1,1)<2> [[INPUT]](0,3)<4;1,0> -; 
CHECK: mov (M1, 16) [[OUTPUT]](2,0)<1> 0x0:w -; CHECK: mov (M1, 16) [[OUTPUT]](2,16)<1> 0x0:w -; CHECK: mov (M1, 16) [[OUTPUT]](3,0)<1> 0x0:w -; CHECK: mov (M1, 16) [[OUTPUT]](3,16)<1> 0x0:w -; CHECK: lsc_store_block2d.ugm (M1, 1) flat[{{.+}}] [[OUTPUT]]:d16.16x8nn -; CHECK: ret (M1, 1) - -; Function Attrs: convergent nounwind null_pointer_is_valid -define spir_kernel void @test(i8 addrspace(1)* align 1 %a, i16 addrspace(1)* align 2 %b, <8 x i32> %r0, <8 x i32> %payloadHeader, i8 addrspace(2)* %constBase, i32 %bufferOffset, i32 %bufferOffset1) { -entry: - %0 = call <8 x i8> asm "lsc_load_block2d.ugm (M1, 1) $0:d8.16x8nt flat[$1,15,15,15,0,0]", "=rw,rw.u"(i8 addrspace(1)* %a) - %1 = extractelement <8 x i8> %0, i32 0 - %2 = insertelement <4 x i8> undef, i8 %1, i32 0 - %3 = extractelement <8 x i8> %0, i32 1 - %4 = insertelement <4 x i8> %2, i8 %3, i32 1 - %5 = extractelement <8 x i8> %0, i32 2 - %6 = insertelement <4 x i8> %4, i8 %5, i32 2 - %7 = extractelement <8 x i8> %0, i32 3 - %8 = insertelement <4 x i8> %6, i8 %7, i32 3 - %9 = call <4 x i16> asm "mov (M1_NM, 16) $0(0,0)<2> $1(0,0)<4;1,0>\0Amov (M1_NM, 16) $0(0,1)<2> $1(0,1)<4;1,0>\0Amov (M1_NM, 16) $0(1,0)<2> $1(0,2)<4;1,0>\0Amov (M1_NM, 16) $0(1,1)<2> $1(0,3)<4;1,0>\0A", "=rw,rw"(<4 x i8> %8) - %10 = extractelement <4 x i16> %9, i32 0 - %11 = extractelement <4 x i16> %9, i32 1 - %12 = extractelement <4 x i16> %9, i32 2 - %13 = extractelement <4 x i16> %9, i32 3 - %14 = insertelement <8 x i16> undef, i16 %10, i32 0 - %15 = insertelement <8 x i16> %14, i16 %11, i32 1 - %16 = insertelement <8 x i16> %15, i16 %12, i32 2 - %17 = insertelement <8 x i16> %16, i16 %13, i32 3 - %18 = insertelement <8 x i16> %17, i16 0, i32 4 - %19 = insertelement <8 x i16> %18, i16 0, i32 5 - %20 = insertelement <8 x i16> %19, i16 0, i32 6 - %21 = insertelement <8 x i16> %20, i16 0, i32 7 - call void asm sideeffect "lsc_store_block2d.ugm (M1, 1) flat[$1,15,15,15,0,0] $0:d16.16x8nn", "rw,rw.u"(<8 x i16> %21, i16 addrspace(1)* %b) - ret 
void -} - - -!igc.functions = !{!0} -!IGCMetadata = !{!13} - -!0 = !{void (i8 addrspace(1)*, i16 addrspace(1)*, <8 x i32>, <8 x i32>, i8 addrspace(2)*, i32, i32)* @test, !1} -!1 = !{!2, !3} -!2 = !{!"function_type", i32 0} -!3 = !{!"sub_group_size", i32 16} -!13 = !{!"ModuleMD", !14} -!14 = !{!"FuncMD", !15, !16} -!15 = !{!"FuncMDMap[0]", void (i8 addrspace(1)*, i16 addrspace(1)*, <8 x i32>, <8 x i32>, i8 addrspace(2)*, i32, i32)* @test} -!16 = !{!"FuncMDValue[0]", !100, !226} -!100 = !{!"resAllocMD", !183, !184, !185, !186} -!183 = !{!"uavsNumType", i32 0} -!184 = !{!"srvsNumType", i32 0} -!185 = !{!"samplersNumType", i32 0} -!186 = !{!"argAllocMDList", !187, !191, !192, !193, !194, !195, !196} -!187 = !{!"argAllocMDListVec[0]", !188, !189, !190} -!188 = !{!"type", i32 0} -!189 = !{!"extensionType", i32 -1} -!190 = !{!"indexType", i32 -1} -!191 = !{!"argAllocMDListVec[1]", !188, !189, !190} -!192 = !{!"argAllocMDListVec[2]", !188, !189, !190} -!193 = !{!"argAllocMDListVec[3]", !188, !189, !190} -!194 = !{!"argAllocMDListVec[4]", !188, !189, !190} -!195 = !{!"argAllocMDListVec[5]", !188, !189, !190} -!196 = !{!"argAllocMDListVec[6]", !188, !189, !190} -!226 = !{!"m_OpenCLArgTypeQualifiers", !227, !228} -!227 = !{!"m_OpenCLArgTypeQualifiersVec[0]", !""} -!228 = !{!"m_OpenCLArgTypeQualifiersVec[1]", !""} -