From e263e05f56e64d28c01c74329721f900b2ada699 Mon Sep 17 00:00:00 2001 From: NiccoloN Date: Mon, 18 May 2026 18:32:40 +0200 Subject: [PATCH] remove dead logic --- .../SpatialToPim/SpatialToPimPass.cpp | 2 +- src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp | 4 +- .../MaterializeMergeSchedule.cpp | 107 +++++------------- 3 files changed, 29 insertions(+), 84 deletions(-) diff --git a/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp b/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp index 5d79acb..56176d7 100644 --- a/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp +++ b/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp @@ -138,7 +138,7 @@ static Value padHVectorInputToCrossbarSize(IRRewriter& rewriter, Location loc, V } void SpatialToPimPass::runOnOperation() { - coreId = 1; + coreId = 0; ModuleOp moduleOp = getOperation(); MLIRContext* ctx = moduleOp.getContext(); diff --git a/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp b/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp index b7622d3..183c0cd 100644 --- a/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp +++ b/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp @@ -480,8 +480,8 @@ LogicalResult SpatComputeBatch::verify() { return emitError("compute_batch coreIds attribute must be a dense i32 array"); if (coreIdsAttr.size() != static_cast(laneCountSz)) return emitError("compute_batch coreIds array length must match laneCount"); - if (llvm::any_of(coreIdsAttr.asArrayRef(), [](int32_t coreId) { return coreId <= 0; })) - return emitError("compute_batch coreIds values must be positive"); + if (llvm::any_of(coreIdsAttr.asArrayRef(), [](int32_t coreId) { return coreId < 0; })) + return emitError("compute_batch coreIds values must be non-negative"); llvm::SmallDenseSet seenCoreIds; for (int32_t coreId : coreIdsAttr.asArrayRef()) if (!seenCoreIds.insert(coreId).second) diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MaterializeMergeSchedule.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MaterializeMergeSchedule.cpp index 8dabf3f..9c6fd46 100644 --- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MaterializeMergeSchedule.cpp +++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MaterializeMergeSchedule.cpp @@ -1,5 +1,4 @@ #include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/PatternMatch.h" @@ -35,8 +34,6 @@ using spatial::getComputeInstanceTemplateBlock; using spatial::getComputeInstanceWeights; using spatial::getProducerValueRef; -static int32_t getPhysicalCoreId(size_t schedulerCpu) { return static_cast(schedulerCpu + 1); } - class MergeScheduleMaterializerImpl { public: explicit MergeScheduleMaterializerImpl(func::FuncOp funcOp) @@ -64,10 +61,8 @@ public: private: struct ScheduledTask { ComputeInstance computeInstance; - Operation* sourceOp = nullptr; size_t cpu = 0; - size_t order = 0; - size_t executionOrder = 0; + size_t orderWithinCpu = 0; }; struct ChannelInfo { @@ -78,7 +73,6 @@ private: struct CpuProgram { SpatCompute op; - Block* block = nullptr; DenseMap externalInputMap; DenseMap weightToIndex; }; @@ -103,43 +97,6 @@ private: | static_cast(channelInfo.targetCoreId); } - static void appendUniqueValue(SmallVectorImpl& values, DenseSet& seen, Value value) { - if (seen.insert(value).second) - values.push_back(value); - } - - bool isOldComputeResult(Operation* op) { - auto it = isInternalInputOpCache.find(op); - if (it != isInternalInputOpCache.end()) - return it->second; - - auto extract = dyn_cast_or_null(op); - if (!extract) - return isInternalInputOpCache[op] = false; - - for (Value result : extract->getResults()) { - for (Operation* user : result.getUsers()) { - if (oldComputeOps.contains(user)) - continue; - if (isOldComputeResult(user)) - continue; - return isInternalInputOpCache[op] = false; - } - } - return isInternalInputOpCache[op] = true; - } - - void collectInternalInputOps(Value value) { - Operation* op = value.getDefiningOp(); - //TODO ExtractSliceOp is not the only legal host op to traverse! dio - while (auto extract = dyn_cast_if_present(op)) { - if (isOldComputeResult(extract.getOperation())) - internalInputOpsToErase.insert(extract.getOperation()); - value = extract.getSource(); - op = value.getDefiningOp(); - } - } - void collectExternalUsers(Operation* op) { if (!externalUsersToMove.insert(op).second) return; @@ -153,14 +110,11 @@ private: } void collectScheduledTasks() { - size_t nextOrder = 0; for (ComputeInstance scheduledInstance : schedule->dominanceOrderCompute) { oldComputeOps.insert(scheduledInstance.op); scheduledTasks.push_back({scheduledInstance, - scheduledInstance.op, schedule->computeToCpuMap.lookup(scheduledInstance), - schedule->computeToCpuSlotMap.lookup(scheduledInstance), - nextOrder++}); + schedule->computeToCpuSlotMap.lookup(scheduledInstance)}); } } @@ -177,14 +131,10 @@ private: } llvm::sort(orderedCpus); - for (size_t cpu : orderedCpus) { - llvm::stable_sort(tasksByCpu[cpu], - [&](const ScheduledTask& lhs, const ScheduledTask& rhs) { return lhs.order < rhs.order; }); - for (auto [executionOrder, task] : llvm::enumerate(tasksByCpu[cpu])) { - task.executionOrder = executionOrder; - taskByComputeInstance[task.computeInstance].executionOrder = executionOrder; - } - } + for (size_t cpu : orderedCpus) + llvm::stable_sort(tasksByCpu[cpu], [&](const ScheduledTask& lhs, const ScheduledTask& rhs) { + return lhs.orderWithinCpu < rhs.orderWithinCpu; + }); } void collectExternalInputsAndWeights() { @@ -203,26 +153,26 @@ private: for (auto [inputIndex, input] : llvm::enumerate(taskInputs)) { auto producerRef = getProducerValueRef(input); if (producerRef) { - collectInternalInputOps(input); auto producerIt = taskByComputeInstance.find(producerRef->instance); if (producerIt != taskByComputeInstance.end()) { if (producerIt->second.cpu != cpu) { ChannelInfo info { (*nextChannelId)++, - getPhysicalCoreId(producerIt->second.cpu), - getPhysicalCoreId(cpu), + static_cast(producerIt->second.cpu), + static_cast(cpu), }; remoteInputs[inputIndex] = info; auto& perResultChannels = remoteSendsByTask[producerRef->instance]; if (perResultChannels.empty()) perResultChannels.resize(getComputeInstanceOutputTypes(producerIt->second.computeInstance).size()); perResultChannels[producerRef->resultIndex].push_back( - {info, task.computeInstance, inputIndex, task.executionOrder, 0}); + {info, task.computeInstance, inputIndex, task.orderWithinCpu, 0}); } continue; } } - appendUniqueValue(cpuExternalInputs[cpu], seenExternalInputsByCpu[cpu], input); + if (seenExternalInputsByCpu[cpu].insert(input).second) + cpuExternalInputs[cpu].push_back(input); } auto taskOutputs = getComputeInstanceOutputValues(task.computeInstance); @@ -314,7 +264,7 @@ private: uint64_t pairKey = getRemoteSendPairKey(sendInfo.channelInfo); if (!pairsNeedingReceiveReorder.contains(pairKey)) continue; - size_t targetCpu = static_cast(sendInfo.channelInfo.targetCoreId - 1); + size_t targetCpu = static_cast(sendInfo.channelInfo.targetCoreId); receiveQueuesByCpu[targetCpu][pairKey].push_back( {sendInfo.channelInfo, sendInfo.consumer, sendInfo.inputIndex, sendInfo.sourceOrder}); } @@ -351,7 +301,7 @@ private: auto newCompute = SpatCompute::create(rewriter, loc, TypeRange(resultTypes), ValueRange(operands)); newCompute.getProperties().setOperandSegmentSizes( {static_cast(cpuWeights[cpu].size()), static_cast(cpuExternalInputs[cpu].size())}); - newCompute->setAttr(onnx_mlir::kCoreIdAttrName, rewriter.getI32IntegerAttr(getPhysicalCoreId(cpu))); + newCompute->setAttr(onnx_mlir::kCoreIdAttrName, rewriter.getI32IntegerAttr(static_cast(cpu))); SmallVector blockArgTypes; SmallVector blockArgLocs; @@ -366,7 +316,6 @@ private: CpuProgram program; program.op = newCompute; - program.block = newBlock; for (auto [weightIndex, weight] : llvm::enumerate(cpuWeights[cpu])) program.weightToIndex[weight] = weightIndex; for (auto [inputIndex, input] : llvm::enumerate(cpuExternalInputs[cpu])) @@ -428,7 +377,7 @@ private: for (size_t cpu : orderedCpus) { CpuProgram& program = cpuPrograms[cpu]; IRRewriter rewriter(func.getContext()); - rewriter.setInsertionPointToEnd(program.block); + rewriter.setInsertionPointToEnd(&program.op.getBody().front()); DenseMap receiveQueueIndices; DenseMap> preReceivedInputsByTask; @@ -458,7 +407,7 @@ private: if (producerIt->second.cpu == cpu) { auto producedIt = producedValuesByTask.find(producerRef->instance); if (producedIt == producedValuesByTask.end() || producedIt->second.size() <= producerRef->resultIndex) { - task.sourceOp->emitOpError("missing local producer value during per-cpu merge materialization") + task.computeInstance.op->emitOpError("missing local producer value during per-cpu merge materialization") << " consumerCpu=" << cpu << " producerCpu=" << producerIt->second.cpu << " producerLaneStart=" << producerRef->instance.laneStart << " producerLaneCount=" << producerRef->instance.laneCount; @@ -482,7 +431,7 @@ private: task.computeInstance, inputIndex); if (failed(received)) { - task.sourceOp->emitOpError("failed to materialize reordered remote receive") + task.computeInstance.op->emitOpError("failed to materialize reordered remote receive") << " consumerCpu=" << cpu << " sourceCoreId=" << channelInfo.sourceCoreId << " targetCoreId=" << channelInfo.targetCoreId << " channelId=" << channelInfo.channelId; return failure(); @@ -505,8 +454,8 @@ private: } SmallVector taskYieldValues; - rewriter.setInsertionPointToEnd(program.block); - if (isa(task.sourceOp)) { + rewriter.setInsertionPointToEnd(&program.op.getBody().front()); + if (isa(task.computeInstance.op)) { IRMapping mapper; for (auto [argIndex, oldArg] : llvm::enumerate(templateBlock.getArguments())) mapper.map(oldArg, resolvedInputs[argIndex]); @@ -547,7 +496,8 @@ private: Operation* clonedOp = rewriter.clone(op, mapper); if (auto oldWeightedMvmOp = dyn_cast(&op)) { if (oldWeightedMvmOp.getWeightIndex() != 0) { - task.sourceOp->emitOpError("batched per-cpu merge materialization expects lane-local weight index 0"); + task.computeInstance.op->emitOpError( + "batched per-cpu merge materialization expects lane-local weight index 0"); return failure(); } auto newWeightedMvmOp = cast(clonedOp); @@ -555,7 +505,8 @@ private: } if (auto oldWeightedVmmOp = dyn_cast(&op)) { if (oldWeightedVmmOp.getWeightIndex() != 0) { - task.sourceOp->emitOpError("batched per-cpu merge materialization expects lane-local weight index 0"); + task.computeInstance.op->emitOpError( + "batched per-cpu merge materialization expects lane-local weight index 0"); return failure(); } auto newWeightedVmmOp = cast(clonedOp); @@ -589,7 +540,7 @@ private: auto producedIt = producedValuesByTask.find(outputRef.instance); if (producedIt == producedValuesByTask.end() || producedIt->second.size() <= outputRef.resultIndex) { ScheduledTask task = taskByComputeInstance.at(outputRef.instance); - task.sourceOp->emitOpError("missing yielded external value during per-cpu merge materialization") + task.computeInstance.op->emitOpError("missing yielded external value during per-cpu merge materialization") << " cpu=" << cpu << " laneStart=" << outputRef.instance.laneStart; return failure(); } @@ -610,13 +561,9 @@ private: } LogicalResult eraseOldScheduledOps() { - DenseSet allOpsToErase = oldComputeOps; - for (Operation* op : internalInputOpsToErase) - allOpsToErase.insert(op); - SmallVector orderedOpsToErase; for (Operation& op : func.getBody().front()) - if (allOpsToErase.contains(&op)) + if (oldComputeOps.contains(&op)) orderedOpsToErase.push_back(&op); for (Operation* op : llvm::reverse(orderedOpsToErase)) { @@ -626,10 +573,10 @@ private: remainingUsers.push_back(user); if (!remainingUsers.empty()) { InFlightDiagnostic diagnostic = op->emitOpError("still has uses during per-cpu merge cleanup") - << "; erase-set=" << (allOpsToErase.contains(op) ? "yes" : "no"); + << "; erase-set=" << (oldComputeOps.contains(op) ? "yes" : "no"); for (Operation* user : remainingUsers) { diagnostic.attachNote(user->getLoc()) - << "remaining user " << user->getName() << "; erase-set=" << (allOpsToErase.contains(user) ? "yes" : "no"); + << "remaining user " << user->getName() << "; erase-set=" << (oldComputeOps.contains(user) ? "yes" : "no"); } return failure(); } @@ -663,8 +610,6 @@ private: DenseMap> tasksByCpu; SmallVector orderedCpus; DenseSet seenCpus; - DenseSet internalInputOpsToErase; - DenseMap isInternalInputOpCache; DenseSet externalUsersToMove; DenseMap>> remoteSendsByTask; DenseMap>> remoteInputsByTask;