fix sigmoid implementation stability in pim-simulator

fix weightAlways attribute in spatial
multiple-output spat computes
2026-04-23 10:34:29 +02:00 · 2026-04-23 10:04:47 +02:00 · 2026-04-23 09:28:57 +02:00
9 changed files with 123 additions and 79 deletions
@@ -55,15 +55,23 @@ pub trait HasSigm {

 impl HasSigm for f32 {
    fn sigm(self) -> Self {
-        let ex = self.exp();
-        ex / (1.0 + ex)
+        if self >= 0.0 { // non-negative input: exp(-self) <= 1, so the denominator stays within [1, 2]
+            1.0 / (1.0 + (-self).exp())
+        } else { // negative input; NaN also lands here because `NaN >= 0.0` is false
+            let ex = self.exp(); // ex <= 1 on this branch, so ex / (1 + ex) never evaluates inf / inf
+            ex / (1.0 + ex)
+        }
    }
 }

 impl HasSigm for f64 {
    fn sigm(self) -> Self {
-        let ex = self.exp();
-        ex / (1.0 + ex)
+        if self >= 0.0 { // non-negative input: exp(-self) <= 1, so the denominator stays within [1, 2]
+            1.0 / (1.0 + (-self).exp())
+        } else { // negative input; NaN also lands here because `NaN >= 0.0` is false
+            let ex = self.exp(); // ex <= 1 on this branch, so ex / (1 + ex) never evaluates inf / inf
+            ex / (1.0 + ex)
+        }
    }
 }

@@ -4,6 +4,7 @@
 #include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/Interfaces/DestinationStyleOpInterface.h"

+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Support/raw_os_ostream.h"

 #include <filesystem>
@@ -96,6 +97,53 @@ void markWeightAlways(Operation* op) {
  op->setAttr(PimWeightAlwaysAttrName, UnitAttr::get(op->getContext()));
 }

+namespace { // file-local helpers, templated over the MVM/VMM op types and their parent op type
+
+template <typename MVMOpTy, typename VMMOpTy, typename ParentOpTy>
+bool hasMvmVmmWeightUse(ParentOpTy parentOp, unsigned weightIndex) { // true if any nested MVM/VMM op reports `weightIndex`
+  bool found = false;
+  parentOp.walk([&](Operation* op) { // visits every nested op; no early exit, matches are OR-ed into `found`
+    if (auto mvmOp = dyn_cast<MVMOpTy>(op))
+      found |= mvmOp.getWeightIndex() == weightIndex;
+    else if (auto vmmOp = dyn_cast<VMMOpTy>(op))
+      found |= vmmOp.getWeightIndex() == weightIndex;
+  });
+  return found;
+}
+
+template <typename MVMOpTy, typename VMMOpTy, typename ParentOpTy>
+void walkMvmVmmWeightUses(ParentOpTy parentOp, function_ref<void(OpOperand&)> callback) { // one callback per distinct weight index used
+  auto weights = parentOp.getWeights();
+  llvm::SmallSet<unsigned, 8> visited; // weight indices already passed to `callback`
+  auto walkWeightIndex = [&](unsigned weightIndex) {
+    if (weightIndex < weights.size() && visited.insert(weightIndex).second) // skip out-of-range and already-reported indices
+      callback(parentOp->getOpOperand(weightIndex)); // NOTE(review): weight index used as operand number; assumes weights are the leading operands - confirm
+  };
+
+  parentOp.walk([&](MVMOpTy op) { walkWeightIndex(op.getWeightIndex()); }); // all MVM ops are walked first...
+  parentOp.walk([&](VMMOpTy op) { walkWeightIndex(op.getWeightIndex()); }); // ...then all VMM ops, so callbacks fire in that order
+}
+
+} // namespace
+
+bool isSpatialMvmVmmWeightUse(OpOperand& use) { // is `use` a SpatCompute weight operand referenced by a nested weighted MVM/VMM op?
+  Operation* user = use.getOwner();
+  unsigned operandIndex = use.getOperandNumber();
+
+  auto computeOp = dyn_cast<spatial::SpatCompute>(user);
+  if (!computeOp || operandIndex >= computeOp.getWeights().size()) // owner is not a SpatCompute, or operand number is past the weight count
+    return false;
+
+  return hasMvmVmmWeightUse<spatial::SpatWeightedMVMOp, spatial::SpatWeightedVMMOp>(computeOp, operandIndex); // operand number is passed as the weight index
+}
+
+void walkPimMvmVmmWeightUses(Operation* root, function_ref<void(OpOperand&)> callback) { // reports weight operands of PimCoreOps found by walking `root`
+  assert(root && "expected valid root op");
+  root->walk([&](pim::PimCoreOp coreOp) { // per core op: each weight index used by a PimMVMOp/PimVMMOp is reported once
+    walkMvmVmmWeightUses<pim::PimMVMOp, pim::PimVMMOp>(coreOp, callback);
+  });
+}
+
 memref::GlobalOp lookupGlobalForGetGlobal(ModuleOp moduleOp, memref::GetGlobalOp getGlobalOp) {
  if (!moduleOp || !getGlobalOp)
    return {};
@@ -7,6 +7,7 @@
 #include "mlir/IR/Value.h"

 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"

@@ -40,6 +41,10 @@ bool hasWeightAlways(mlir::Operation* op);

 void markWeightAlways(mlir::Operation* op);

+bool isSpatialMvmVmmWeightUse(mlir::OpOperand& use);
+
+void walkPimMvmVmmWeightUses(mlir::Operation* root, llvm::function_ref<void(mlir::OpOperand&)> callback);
+
 mlir::memref::GlobalOp lookupGlobalForGetGlobal(mlir::ModuleOp moduleOp, mlir::memref::GetGlobalOp getGlobalOp);

 llvm::FailureOr<mlir::Operation*>
@@ -392,7 +392,9 @@ void ONNXToSpatialPass::mergeTriviallyConnectedComputes(func::FuncOp funcOp) {
 void ONNXToSpatialPass::annotateWeightsConstants(func::FuncOp funcOp) const {
  funcOp.walk([&](arith::ConstantOp constantOp) {
    bool isAlwaysWeight =
-      llvm::all_of(constantOp->getUsers(), [](auto user) -> bool { return isa<spatial::SpatCompute>(user); });
+      !constantOp->use_empty() && llvm::all_of(constantOp->getUses(), [](OpOperand& use) -> bool {
+        return isSpatialMvmVmmWeightUse(use);
+      });
    if (isAlwaysWeight)
      markWeightAlways(constantOp);
  });
@@ -289,8 +289,7 @@ static SmallVector<Value> createIm2colRowComputes(Value x,
    rowResults.reserve(packedNumRows);
    for (int64_t rowIdx = 0; rowIdx < packedNumRows; rowIdx++) {
      SmallVector<OpFoldResult> offsets = {rewriter.getIndexAttr(rowIdx), rewriter.getIndexAttr(0)};
-      SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(1),
-                                         rewriter.getIndexAttr(packFactor * patchSize)};
+      SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(packFactor * patchSize)};
      SmallVector<OpFoldResult> strides = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
      rowResults.push_back(
        tensor::ExtractSliceOp::create(rewriter, loc, gemmInputRowType, gemmInputRows, offsets, sizes, strides));
@@ -326,10 +325,9 @@ static Value createCollectedConvOutput(ValueRange gemmRows,
    else {
      auto expandedType = RankedTensorType::get({packedNumRows, packFactor, numChannelsOut}, outType.getElementType());
      auto paddedType = RankedTensorType::get({paddedNumPatches, numChannelsOut}, outType.getElementType());
-      Value packedOutput =
-        gemmRowArgs.size() == 1
-          ? gemmRowArgs.front()
-          : tensor::ConcatOp::create(rewriter, loc, /*axis=*/0, gemmRowArgs).getResult();
+      Value packedOutput = gemmRowArgs.size() == 1
+                           ? gemmRowArgs.front()
+                           : tensor::ConcatOp::create(rewriter, loc, /*axis=*/0, gemmRowArgs).getResult();
      Value expandedOutput = tensor::ExpandShapeOp::create(rewriter,
                                                           loc,
                                                           expandedType,
@@ -505,38 +503,41 @@ LogicalResult ConvToGemm::matchAndRewrite(ONNXConvOp convOp,
  //   B (weights): [patchSize, cOut]        -- W^T, stored in crossbar columns
  // and optionally repack several old rows into one GEMM row to use the available crossbar size better.
  //
-  // The im2col compute yields each GEMM input row as a separate result so every GEMM consumes only
-  // the row it needs instead of receiving a full packed tensor and slicing it locally.
-  auto gemmInputRowType =
-    RankedTensorType::get({1, effectiveMaxParallelPixels * patchSize}, elemType);
+  // We want to process N pixels at the same time. Instead of doing N separate operations
+  // of (1 x patchSize) x (patchSize x cOut), we construct a block-diagonal weight matrix
+  // containing N copies of W^T and concatenate N im2col rows into one longer row:
+  //   A_packed: [ceil(numPatches / N), N * patchSize]
+  //   B_packed: [N * patchSize, N * cOut]
+  //   Y_packed: [ceil(numPatches / N), N * cOut]
+  auto gemmInputRowType = RankedTensorType::get({1, effectiveMaxParallelPixels * patchSize}, elemType);
  auto gemmOutputRowType =
    RankedTensorType::get({1, effectiveMaxParallelPixels * numChannelsOut}, outType.getElementType());
  SmallVector<Value> gemmInputRows = createIm2colRowComputes(x,
-                                                            xType,
-                                                            im2colType,
-                                                            rowType,
-                                                            gemmInputRowType,
-                                                            batchSize,
-                                                            numChannelsIn,
-                                                            xHeight,
-                                                            xWidth,
-                                                            wHeight,
-                                                            wWidth,
-                                                            padHeightBegin,
-                                                            padHeightEnd,
-                                                            padWidthBegin,
-                                                            padWidthEnd,
-                                                            strideHeight,
-                                                            strideWidth,
-                                                            dilationHeight,
-                                                            dilationWidth,
-                                                            outWidth,
-                                                            patchSize,
-                                                            numPatches,
-                                                            numPatchesPerBatch,
-                                                            effectiveMaxParallelPixels,
-                                                            rewriter,
-                                                            loc);
+                                                             xType,
+                                                             im2colType,
+                                                             rowType,
+                                                             gemmInputRowType,
+                                                             batchSize,
+                                                             numChannelsIn,
+                                                             xHeight,
+                                                             xWidth,
+                                                             wHeight,
+                                                             wWidth,
+                                                             padHeightBegin,
+                                                             padHeightEnd,
+                                                             padWidthBegin,
+                                                             padWidthEnd,
+                                                             strideHeight,
+                                                             strideWidth,
+                                                             dilationHeight,
+                                                             dilationWidth,
+                                                             outWidth,
+                                                             patchSize,
+                                                             numPatches,
+                                                             numPatchesPerBatch,
+                                                             effectiveMaxParallelPixels,
+                                                             rewriter,
+                                                             loc);

  Value gemmB = buildPackedWeight(wDenseAttr,
                                  wTrans,
@@ -94,10 +94,8 @@ void PimBufferizationPass::runOnOperation() {

 void PimBufferizationPass::annotateWeightsMemrefs(ModuleOp moduleOp, func::FuncOp funcOp) const {
  funcOp.walk([&](PimCoreOp coreOp) {
-    auto annotateWeight = [&](unsigned weightIndex) {
-      if (weightIndex >= coreOp.getWeights().size())
-        return;
-      Value weight = coreOp.getWeights()[weightIndex];
+    walkPimMvmVmmWeightUses(coreOp, [&](OpOperand& weightUse) {
+      Value weight = weightUse.get();
      auto getGlobalOp = weight.getDefiningOp<memref::GetGlobalOp>();
      if (!getGlobalOp)
        return;
@@ -105,10 +103,7 @@ void PimBufferizationPass::annotateWeightsMemrefs(ModuleOp moduleOp, func::FuncO
      assert("Weights must be constants" && globalMemrefOp.getConstant());
      markWeightAlways(getGlobalOp);
      markWeightAlways(globalMemrefOp);
-    };
-
-    coreOp.walk([&](PimMVMOp mvmOp) { annotateWeight(mvmOp.getWeightIndex()); });
-    coreOp.walk([&](PimVMMOp vmmOp) { annotateWeight(vmmOp.getWeightIndex()); });
+    });
  });
 }

@@ -1,5 +1,4 @@
 #include "mlir/Dialect/Shape/IR/Shape.h"
-#include "mlir/Dialect/Traits.h"
 #include "mlir/IR/Block.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -14,10 +13,7 @@
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Support/LLVM.h"
-#include "mlir/Support/LogicalResult.h"

-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/LogicalResult.h"

@@ -119,13 +115,10 @@ inline LogicalResult mvmOpVerifySize4(SpatWeightedMVMOp* emitter,
 }

 llvm::FailureOr<ArrayRef<int64_t>> getWeightShapeForWeightedOp(Operation* weigthedOp, size_t weightIndex) {
-  auto wcomputeOp = dyn_cast<SpatCompute>(weigthedOp->getParentOp());
-  if (wcomputeOp)
-    return cast<ShapedType>(wcomputeOp.getWeights()[weightIndex].getType()).getShape();
+  if (auto computeOp = dyn_cast<SpatCompute>(weigthedOp->getParentOp()))
+    return cast<ShapedType>(computeOp.getWeights()[weightIndex].getType()).getShape();

-  auto coreOp = dyn_cast<pim::PimCoreOp>(weigthedOp->getParentOp());
-
-  if (coreOp)
+  if (auto coreOp = dyn_cast<pim::PimCoreOp>(weigthedOp->getParentOp()))
    return cast<ShapedType>(coreOp.getWeights()[weightIndex].getType()).getShape();

  return failure();
@@ -28,7 +28,7 @@ using namespace mlir;
 namespace {

 struct VirtualNode {
-  llvm::SmallVector<size_t, 4> originalComputeIndices;
+  SmallVector<size_t, 4> originalComputeIndices; // presumably the indices of the original computes grouped into this node - TODO confirm
  Weight weight = 0;
  CrossbarUsage crossbarUsage = 0;
 };
@@ -50,7 +50,7 @@ struct WindowScheduleResult {
  bool usedAllAvailableCpus = false;
 };

-std::vector<IndexedEdge> aggregateEdges(llvm::ArrayRef<IndexedEdge> edges) {
+std::vector<IndexedEdge> aggregateEdges(ArrayRef<IndexedEdge> edges) {
  std::map<std::pair<size_t, size_t>, Weight> edgeWeights;
  for (auto [start, end, weight] : edges) {
    size_t startIndex = static_cast<size_t>(start);
@@ -74,8 +74,7 @@ std::vector<IndexedEdge> aggregateEdges(llvm::ArrayRef<IndexedEdge> edges) {
  return aggregatedEdges;
 }

-VirtualGraph buildInitialVirtualGraph(llvm::ArrayRef<SpatCompute> spatComputes,
-                                      llvm::ArrayRef<IndexedEdge> edges) {
+VirtualGraph buildInitialVirtualGraph(ArrayRef<SpatCompute> spatComputes, ArrayRef<IndexedEdge> edges) {
  VirtualGraph graph;
  graph.nodes.reserve(spatComputes.size());
  for (auto [index, spatCompute] : llvm::enumerate(spatComputes)) {
@@ -174,7 +173,7 @@ std::vector<size_t> selectCriticalWindow(const TimingInfo& timing, size_t window
  return selected;
 }

-std::vector<size_t> getOriginalSignature(const VirtualGraph& graph, llvm::ArrayRef<size_t> selectedNodes) {
+std::vector<size_t> getOriginalSignature(const VirtualGraph& graph, ArrayRef<size_t> selectedNodes) {
  std::vector<size_t> signature;
  for (size_t nodeIndex : selectedNodes) {
    const VirtualNode& node = graph.nodes[nodeIndex];
@@ -197,8 +196,7 @@ std::vector<IndexedEdge> buildWindowEdges(const VirtualGraph& graph, const std::
  return aggregateEdges(windowEdges);
 }

-WindowScheduleResult
-scheduleWindow(const VirtualGraph& graph, llvm::ArrayRef<size_t> selectedNodes, MLIRContext* context) {
+WindowScheduleResult scheduleWindow(const VirtualGraph& graph, ArrayRef<size_t> selectedNodes, MLIRContext* context) {
  std::vector<Weight> windowWeights;
  std::vector<CrossbarUsage> windowCrossbarUsage;
  std::vector<int64_t> nodeToWindowIndex(graph.nodes.size(), -1);
@@ -234,9 +232,7 @@ scheduleWindow(const VirtualGraph& graph, llvm::ArrayRef<size_t> selectedNodes,
  return result;
 }

-bool coarsenGraph(const VirtualGraph& graph,
-                  llvm::ArrayRef<std::vector<size_t>> mergeGroups,
-                  VirtualGraph& coarsenedGraph) {
+bool coarsenGraph(const VirtualGraph& graph, ArrayRef<std::vector<size_t>> mergeGroups, VirtualGraph& coarsenedGraph) {
  std::vector<int64_t> nodeToMergeGroup(graph.nodes.size(), -1);
  for (auto [groupIndex, mergeGroup] : llvm::enumerate(mergeGroups)) {
    if (mergeGroup.size() < 2)
@@ -303,7 +299,7 @@ bool coarsenGraph(const VirtualGraph& graph,
 }

 bool coarsenGraphWithFallback(const VirtualGraph& graph,
-                              llvm::ArrayRef<std::vector<size_t>> mergeGroups,
+                              ArrayRef<std::vector<size_t>> mergeGroups,
                              VirtualGraph& coarsenedGraph) {
  if (coarsenGraph(graph, mergeGroups, coarsenedGraph))
    return true;
@@ -330,7 +326,7 @@ bool coarsenGraphWithFallback(const VirtualGraph& graph,
  return !acceptedMergeGroups.empty();
 }

-std::vector<size_t> computeOriginalTopologicalOrder(size_t computeCount, llvm::ArrayRef<IndexedEdge> edges) {
+std::vector<size_t> computeOriginalTopologicalOrder(size_t computeCount, ArrayRef<IndexedEdge> edges) {
  VirtualGraph graph;
  graph.nodes.resize(computeCount);
  graph.edges = aggregateEdges(edges);
@@ -344,8 +340,8 @@ std::vector<size_t> computeOriginalTopologicalOrder(size_t computeCount, llvm::A
 }

 DCPAnalysisResult buildResultFromVirtualGraph(const VirtualGraph& graph,
-                                              llvm::ArrayRef<SpatCompute> spatComputes,
-                                              llvm::ArrayRef<IndexedEdge> originalEdges) {
+                                              ArrayRef<SpatCompute> spatComputes,
+                                              ArrayRef<IndexedEdge> originalEdges) {
  DCPAnalysisResult result;
  std::vector<size_t> originalToVirtualNode(spatComputes.size(), 0);
  for (auto [virtualNodeIndex, virtualNode] : llvm::enumerate(graph.nodes))
@@ -367,9 +363,7 @@ DCPAnalysisResult buildResultFromVirtualGraph(const VirtualGraph& graph,
  return result;
 }

-DCPAnalysisResult runLegacyDcp(llvm::ArrayRef<SpatCompute> spatComputes,
-                               llvm::ArrayRef<IndexedEdge> edges,
-                               MLIRContext* context) {
+DCPAnalysisResult runLegacyDcp(ArrayRef<SpatCompute> spatComputes, ArrayRef<IndexedEdge> edges, MLIRContext* context) {
  GraphDCP graphDCP(spatComputes, edges);
  if (coresCount.getValue() > 0)
    graphDCP.setMaxCpuCount(static_cast<int>(coresCount.getValue()));
@@ -383,12 +377,12 @@ DCPAnalysisResult runLegacyDcp(llvm::ArrayRef<SpatCompute> spatComputes,
 SpatCompute getOriginalSpatCompute(Operation* op) {
  if (!op)
    return {};
-  while (auto extract = llvm::dyn_cast<tensor::ExtractSliceOp>(op)) {
+  while (auto extract = dyn_cast<tensor::ExtractSliceOp>(op)) { // peel off ExtractSliceOps, following each source to its defining op
    op = extract.getSource().getDefiningOp();
    if (!op)
      return {};
  }
-  if (auto res = llvm::dyn_cast<SpatCompute>(op))
+  if (auto res = dyn_cast<SpatCompute>(op)) // a SpatCompute at the end of the chain is returned; anything else yields a default-constructed one
    return res;
  return {};
 }
@@ -1,5 +1,3 @@
-# SPDX-License-Identifier: Apache-2.0
-
 add_custom_target(pim-unittest)
 set_target_properties(pim-unittest PROPERTIES FOLDER "Tests")
Author	SHA1	Message	Date
NiccoloN	cff929a083	fix sigmoid implementation stability in pim-simulator Validate Operations / validate-operations (push) Successful in 23m4s Details	2026-04-23 10:34:29 +02:00
NiccoloN	89b3501aa8	fix weightAlways attribute in spatial	2026-04-23 10:04:47 +02:00
NiccoloN	412ca957f6	multiple-output spat computes Validate Operations / validate-operations (push) Successful in 22m38s Details	2026-04-23 09:28:57 +02:00