From 0f240af271fbf64bcbb91142cf6c0a6a35faf5c6 Mon Sep 17 00:00:00 2001 From: NiccoloN Date: Mon, 25 May 2026 20:58:51 +0200 Subject: [PATCH] cleanup unused channel operations and related logic --- src/PIM/Compiler/PimCodeGen.cpp | 101 -------- src/PIM/Compiler/PimCodeGen.hpp | 10 - .../BatchCoreLoweringPatterns.cpp | 103 -------- .../SpatialToPim/ChannelLoweringPatterns.cpp | 59 +---- .../SpatialToPim/CoreLoweringPatterns.cpp | 33 --- .../SpatialToPim/ReturnPathNormalization.cpp | 2 - .../SpatialToPim/SpatialToPimPass.cpp | 10 - src/PIM/Dialect/Pim/Pim.td | 102 -------- src/PIM/Dialect/Pim/PimOpsAsm.cpp | 226 ------------------ src/PIM/Dialect/Pim/PimOpsVerify.cpp | 75 ------ .../OpBufferizationInterfaces.cpp | 148 ------------ src/PIM/Dialect/Spatial/Spatial.td | 105 -------- src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp | 115 --------- .../MergeComputeNodesPass.cpp | 11 +- src/PIM/Pass/PimCodegen/VerificationPass.cpp | 85 ------- 15 files changed, 3 insertions(+), 1182 deletions(-) diff --git a/src/PIM/Compiler/PimCodeGen.cpp b/src/PIM/Compiler/PimCodeGen.cpp index e576b78..1cfd637 100644 --- a/src/PIM/Compiler/PimCodeGen.cpp +++ b/src/PIM/Compiler/PimCodeGen.cpp @@ -519,61 +519,12 @@ void PimCodeGen::codeGenReceiveOp(pim::PimReceiveOp receiveOp, const StaticValue emitCommunicationOp("recv", addressOf(receiveOp.getOutputBuffer(), knowledge), *sourceCoreId, receiveOp.getSize()); } -void PimCodeGen::codeGenReceiveTensorOp(pim::PimReceiveTensorOp receiveTensorOp, - const StaticValueKnowledge& knowledge) const { - size_t outputAddr = addressOf(receiveTensorOp.getOutputBuffer(), knowledge); - size_t chunkSize = getShapedTypeSizeInBytes(cast(receiveTensorOp.getOutputBuffer().getType())) - / receiveTensorOp.getSourceCoreIds().size(); - for (auto [chunkIndex, sourceCoreId] : llvm::enumerate(receiveTensorOp.getSourceCoreIds())) - emitCommunicationOp("recv", outputAddr + chunkIndex * chunkSize, sourceCoreId, chunkSize); -} - -void PimCodeGen::codeGenReceiveBatchOp(pim::PimReceiveBatchOp receiveOp, - unsigned lane, - const StaticValueKnowledge& knowledge) const { - emitCommunicationOp( - "recv", addressOf(receiveOp.getOutputBuffer(), knowledge), receiveOp.getSourceCoreIds()[lane], receiveOp.getSize()); -} - -void PimCodeGen::codeGenReceiveTensorBatchOp(pim::PimReceiveTensorBatchOp receiveOp, - ArrayRef laneCoreIds, - const StaticValueKnowledge& knowledge) const { - size_t outputAddr = addressOf(receiveOp.getOutputBuffer(), knowledge); - size_t chunkSize = getShapedTypeSizeInBytes(cast(receiveOp.getOutputBuffer().getType())) - / laneCoreIds.size(); - for (auto [chunkIndex, sourceCoreId] : llvm::enumerate(laneCoreIds)) - emitCommunicationOp("recv", outputAddr + chunkIndex * chunkSize, sourceCoreId, chunkSize); -} - void PimCodeGen::codeGenSendOp(pim::PimSendOp sendOp, const StaticValueKnowledge& knowledge) const { auto targetCoreId = indexOf(sendOp.getTargetCoreId(), knowledge); assert(succeeded(targetCoreId) && "pim.send target core id must be statically resolvable during codegen"); emitCommunicationOp("send", addressOf(sendOp.getInput(), knowledge), *targetCoreId, sendOp.getSize()); } -void PimCodeGen::codeGenSendTensorOp(pim::PimSendTensorOp sendTensorOp, const StaticValueKnowledge& knowledge) const { - size_t inputAddr = addressOf(sendTensorOp.getInput(), knowledge); - size_t chunkSize = getShapedTypeSizeInBytes(cast(sendTensorOp.getInput().getType())) - / sendTensorOp.getTargetCoreIds().size(); - for (auto [chunkIndex, targetCoreId] : llvm::enumerate(sendTensorOp.getTargetCoreIds())) - emitCommunicationOp("send", inputAddr + chunkIndex * chunkSize, targetCoreId, chunkSize); -} - -void PimCodeGen::codeGenSendBatchOp(pim::PimSendBatchOp sendOp, - unsigned lane, - const StaticValueKnowledge& knowledge) const { - emitCommunicationOp("send", addressOf(sendOp.getInput(), knowledge), sendOp.getTargetCoreIds()[lane], sendOp.getSize()); -} - -void PimCodeGen::codeGenSendTensorBatchOp(pim::PimSendTensorBatchOp sendOp, - ArrayRef laneCoreIds, - const StaticValueKnowledge& knowledge) const { - size_t inputAddr = addressOf(sendOp.getInput(), knowledge); - size_t chunkSize = getShapedTypeSizeInBytes(cast(sendOp.getInput().getType())) / laneCoreIds.size(); - for (auto [chunkIndex, targetCoreId] : llvm::enumerate(laneCoreIds)) - emitCommunicationOp("send", inputAddr + chunkIndex * chunkSize, targetCoreId, chunkSize); -} - void PimCodeGen::codeGenConcatOp(pim::PimConcatOp concatOp, const StaticValueKnowledge& knowledge) const { auto outputType = cast(concatOp.getOutputBuffer().getType()); assert(outputType.hasStaticShape() && "concat codegen requires static output shape"); @@ -902,13 +853,7 @@ enum class CompiledCoreOpKind : uint8_t { Store, Lmv, Receive, - ReceiveBatch, - ReceiveTensor, - ReceiveTensorBatch, Send, - SendBatch, - SendTensor, - SendTensorBatch, Concat, Vmm, Transpose, @@ -952,20 +897,8 @@ static FailureOr classifyCompiledCoreOpKind(Operation& op) { return CompiledCoreOpKind::Lmv; if (isa(op)) return CompiledCoreOpKind::Receive; - if (isa(op)) - return CompiledCoreOpKind::ReceiveBatch; - if (isa(op)) - return CompiledCoreOpKind::ReceiveTensor; - if (isa(op)) - return CompiledCoreOpKind::ReceiveTensorBatch; if (isa(op)) return CompiledCoreOpKind::Send; - if (isa(op)) - return CompiledCoreOpKind::SendBatch; - if (isa(op)) - return CompiledCoreOpKind::SendTensor; - if (isa(op)) - return CompiledCoreOpKind::SendTensorBatch; if (isa(op)) return CompiledCoreOpKind::Concat; if (isa(op)) @@ -1108,43 +1041,9 @@ static LogicalResult executeCompiledCorePlan(const llvm::SmallVectorImpl(node.op), knowledge); break; - case CompiledCoreOpKind::ReceiveBatch: - if (!batchLane) - return failure(); - coreCodeGen.codeGenReceiveBatchOp(cast(node.op), *batchLane, knowledge); - break; - case CompiledCoreOpKind::ReceiveTensor: - coreCodeGen.codeGenReceiveTensorOp(cast(node.op), knowledge); - break; - case CompiledCoreOpKind::ReceiveTensorBatch: - if (!batchLane || !batchLaneCount) - return failure(); - coreCodeGen.codeGenReceiveTensorBatchOp(cast(node.op), - getLaneChunkCoreIds(cast(node.op).getSourceCoreIds(), - *batchLaneCount, - *batchLane), - knowledge); - break; case CompiledCoreOpKind::Send: coreCodeGen.codeGenSendOp(cast(node.op), knowledge); break; - case CompiledCoreOpKind::SendBatch: - if (!batchLane) - return failure(); - coreCodeGen.codeGenSendBatchOp(cast(node.op), *batchLane, knowledge); - break; - case CompiledCoreOpKind::SendTensor: - coreCodeGen.codeGenSendTensorOp(cast(node.op), knowledge); - break; - case CompiledCoreOpKind::SendTensorBatch: - if (!batchLane || !batchLaneCount) - return failure(); - coreCodeGen.codeGenSendTensorBatchOp(cast(node.op), - getLaneChunkCoreIds(cast(node.op).getTargetCoreIds(), - *batchLaneCount, - *batchLane), - knowledge); - break; case CompiledCoreOpKind::Concat: coreCodeGen.codeGenConcatOp(cast(node.op), knowledge); break; diff --git a/src/PIM/Compiler/PimCodeGen.hpp b/src/PIM/Compiler/PimCodeGen.hpp index 23db991..a9d2dac 100644 --- a/src/PIM/Compiler/PimCodeGen.hpp +++ b/src/PIM/Compiler/PimCodeGen.hpp @@ -178,17 +178,7 @@ public: void codeGenLmvOp(pim::PimMemCopyOp lmvOp, const StaticValueKnowledge& knowledge) const; void codeGenReceiveOp(pim::PimReceiveOp receiveOp, const StaticValueKnowledge& knowledge) const; - void codeGenReceiveTensorOp(pim::PimReceiveTensorOp receiveTensorOp, const StaticValueKnowledge& knowledge) const; void codeGenSendOp(pim::PimSendOp sendOp, const StaticValueKnowledge& knowledge) const; - void codeGenSendTensorOp(pim::PimSendTensorOp sendTensorOp, const StaticValueKnowledge& knowledge) const; - void codeGenReceiveBatchOp(pim::PimReceiveBatchOp receiveOp, unsigned lane, const StaticValueKnowledge& knowledge) const; - void codeGenReceiveTensorBatchOp(pim::PimReceiveTensorBatchOp receiveOp, - llvm::ArrayRef laneCoreIds, - const StaticValueKnowledge& knowledge) const; - void codeGenSendBatchOp(pim::PimSendBatchOp sendOp, unsigned lane, const StaticValueKnowledge& knowledge) const; - void codeGenSendTensorBatchOp(pim::PimSendTensorBatchOp sendOp, - llvm::ArrayRef laneCoreIds, - const StaticValueKnowledge& knowledge) const; void codeGenConcatOp(pim::PimConcatOp concatOp, const StaticValueKnowledge& knowledge) const; template diff --git a/src/PIM/Conversion/SpatialToPim/BatchCoreLoweringPatterns.cpp b/src/PIM/Conversion/SpatialToPim/BatchCoreLoweringPatterns.cpp index f863957..97d1d81 100644 --- a/src/PIM/Conversion/SpatialToPim/BatchCoreLoweringPatterns.cpp +++ b/src/PIM/Conversion/SpatialToPim/BatchCoreLoweringPatterns.cpp @@ -18,27 +18,6 @@ using namespace onnx_mlir::pim; namespace onnx_mlir { namespace { -static int32_t translateSpatialCoreIdToPimCoreId(size_t spatialCoreId) { return static_cast(spatialCoreId); } - -static FailureOr getConstantI32Value(Value value) { - APInt constantValue; - if (!matchPattern(value, m_ConstantInt(&constantValue))) - return failure(); - return static_cast(constantValue.getSExtValue()); -} - -static FailureOr> getConstantI32Values(ValueRange values) { - SmallVector constants; - constants.reserve(values.size()); - for (Value value : values) { - FailureOr constantValue = getConstantI32Value(value); - if (failed(constantValue)) - return failure(); - constants.push_back(*constantValue); - } - return constants; -} - static bool isExplicitHostOperand(Operation* op, unsigned operandIndex) { if (isa(op)) return operandIndex == 2; @@ -62,43 +41,6 @@ static SmallVector getPimCoreIdsForBatchOp(spatial::SpatComputeBatch co return coreIds; } -static LogicalResult lowerChannelSendTensorBatch(spatial::SpatChannelSendTensorBatchOp sendTensorBatchOp, - IRMapping& mapper, - IRRewriter& rewriter) { - FailureOr> targetCoreIds = getConstantI32Values(sendTensorBatchOp.getTargetCoreIds()); - if (failed(targetCoreIds)) - return sendTensorBatchOp.emitOpError("expected constant targetCoreIds"); - for (int32_t& targetCoreId : *targetCoreIds) - targetCoreId = translateSpatialCoreIdToPimCoreId(targetCoreId); - - pim::PimSendTensorBatchOp::create(rewriter, - sendTensorBatchOp.getLoc(), - mapper.lookup(sendTensorBatchOp.getInput()), - rewriter.getDenseI32ArrayAttr(*targetCoreIds)); - return success(); -} - -static LogicalResult lowerChannelReceiveTensorBatch(spatial::SpatChannelReceiveTensorBatchOp receiveTensorBatchOp, - IRMapping& mapper, - IRRewriter& rewriter) { - FailureOr> sourceCoreIds = getConstantI32Values(receiveTensorBatchOp.getSourceCoreIds()); - if (failed(sourceCoreIds)) - return receiveTensorBatchOp.emitOpError("expected constant sourceCoreIds"); - for (int32_t& sourceCoreId : *sourceCoreIds) - sourceCoreId = translateSpatialCoreIdToPimCoreId(sourceCoreId); - - auto outputType = cast(receiveTensorBatchOp.getOutput().getType()); - auto outputBuffer = createEmptyTensorFromShaped(rewriter, receiveTensorBatchOp.getLoc(), outputType); - Value received = pim::PimReceiveTensorBatchOp::create(rewriter, - receiveTensorBatchOp.getLoc(), - outputBuffer.getType(), - outputBuffer, - rewriter.getDenseI32ArrayAttr(*sourceCoreIds)) - .getOutput(); - mapper.map(receiveTensorBatchOp.getOutput(), received); - return success(); -} - static FailureOr getDirectReturnOperandIndex(OpResult result) { if (!result.hasOneUse()) return failure(); @@ -304,51 +246,6 @@ LogicalResult raptor::SpatialToPimPass::lowerComputeBatchOp(spatial::SpatCompute continue; } - if (auto sendBatchOp = dyn_cast(op)) { - FailureOr> targetCoreIds = getConstantI32Values(sendBatchOp.getTargetCoreIds()); - if (failed(targetCoreIds)) - return sendBatchOp.emitOpError("expected constant targetCoreIds"); - for (int32_t& targetCoreId : *targetCoreIds) - targetCoreId = translateSpatialCoreIdToPimCoreId(targetCoreId); - pim::PimSendBatchOp::create(rewriter, - loc, - mapper.lookup(sendBatchOp.getInput()), - getTensorSizeInBytesAttr(rewriter, mapper.lookup(sendBatchOp.getInput())), - rewriter.getDenseI32ArrayAttr(*targetCoreIds)); - continue; - } - - if (auto sendTensorBatchOp = dyn_cast(op)) { - if (failed(lowerChannelSendTensorBatch(sendTensorBatchOp, mapper, rewriter))) - return failure(); - continue; - } - - if (auto receiveBatchOp = dyn_cast(op)) { - FailureOr> sourceCoreIds = getConstantI32Values(receiveBatchOp.getSourceCoreIds()); - if (failed(sourceCoreIds)) - return receiveBatchOp.emitOpError("expected constant sourceCoreIds"); - for (int32_t& sourceCoreId : *sourceCoreIds) - sourceCoreId = translateSpatialCoreIdToPimCoreId(sourceCoreId); - auto outputType = cast(receiveBatchOp.getOutput().getType()); - auto outputBuffer = createEmptyTensorFromShaped(rewriter, loc, outputType); - auto received = pim::PimReceiveBatchOp::create(rewriter, - loc, - outputBuffer.getType(), - outputBuffer, - getTensorSizeInBytesAttr(rewriter, receiveBatchOp.getOutput()), - rewriter.getDenseI32ArrayAttr(*sourceCoreIds)) - .getOutput(); - mapper.map(receiveBatchOp.getOutput(), received); - continue; - } - - if (auto receiveTensorBatchOp = dyn_cast(op)) { - if (failed(lowerChannelReceiveTensorBatch(receiveTensorBatchOp, mapper, rewriter))) - return failure(); - continue; - } - if (auto toTensorOp = dyn_cast(op)) { if (isa_and_present(toTensorOp.getBuffer().getDefiningOp())) { Operation* cloned = rewriter.clone(op, mapper); diff --git a/src/PIM/Conversion/SpatialToPim/ChannelLoweringPatterns.cpp b/src/PIM/Conversion/SpatialToPim/ChannelLoweringPatterns.cpp index 88de311..33cdc24 100644 --- a/src/PIM/Conversion/SpatialToPim/ChannelLoweringPatterns.cpp +++ b/src/PIM/Conversion/SpatialToPim/ChannelLoweringPatterns.cpp @@ -1,6 +1,4 @@ #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/Matchers.h" - #include "src/Accelerators/PIM/Conversion/SpatialToPim/ChannelLoweringPatterns.hpp" #include "src/Accelerators/PIM/Conversion/SpatialToPim/Common.hpp" #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp" @@ -11,20 +9,6 @@ using namespace mlir; namespace onnx_mlir { namespace { -static int32_t toPimCoreId(int32_t spatialCoreId) { return spatialCoreId; } - -static FailureOr> getConstantI32Values(ValueRange values) { - SmallVector constants; - constants.reserve(values.size()); - for (Value value : values) { - APInt constantValue; - if (!matchPattern(value, m_ConstantInt(&constantValue))) - return failure(); - constants.push_back(static_cast(constantValue.getSExtValue())); - } - return constants; -} - struct ChannelSendLowering : OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -59,42 +43,6 @@ struct ChannelReceiveLowering : OpRewritePattern } }; -struct ChannelSendTensorLowering : OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(spatial::SpatChannelSendTensorOp op, PatternRewriter& rewriter) const override { - FailureOr> targetCoreIds = getConstantI32Values(op.getTargetCoreIds()); - if (failed(targetCoreIds)) - return rewriter.notifyMatchFailure(op, "expected constant targetCoreIds"); - for (int32_t& targetCoreId : *targetCoreIds) - targetCoreId = toPimCoreId(targetCoreId); - pim::PimSendTensorOp::create(rewriter, op.getLoc(), op.getInput(), rewriter.getDenseI32ArrayAttr(*targetCoreIds)); - rewriter.eraseOp(op); - return success(); - } -}; - -struct ChannelReceiveTensorLowering : OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(spatial::SpatChannelReceiveTensorOp op, PatternRewriter& rewriter) const override { - FailureOr> sourceCoreIds = getConstantI32Values(op.getSourceCoreIds()); - if (failed(sourceCoreIds)) - return rewriter.notifyMatchFailure(op, "expected constant sourceCoreIds"); - for (int32_t& sourceCoreId : *sourceCoreIds) - sourceCoreId = toPimCoreId(sourceCoreId); - auto outputType = cast(op.getOutput().getType()); - Value outputBuffer = - tensor::EmptyOp::create(rewriter, op.getLoc(), outputType.getShape(), outputType.getElementType()).getResult(); - Value received = - pim::PimReceiveTensorOp::create( - rewriter, op.getLoc(), op.getOutput().getType(), outputBuffer, rewriter.getDenseI32ArrayAttr(*sourceCoreIds)) - .getOutput(); - rewriter.replaceOp(op, received); - return success(); - } -}; - struct ExtractRowsLowering : OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -137,12 +85,7 @@ struct ConcatLowering : OpRewritePattern { } // namespace void populateChannelLoweringPatterns(RewritePatternSet& patterns) { - patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); } } // namespace onnx_mlir diff --git a/src/PIM/Conversion/SpatialToPim/CoreLoweringPatterns.cpp b/src/PIM/Conversion/SpatialToPim/CoreLoweringPatterns.cpp index ce204ca..345a459 100644 --- a/src/PIM/Conversion/SpatialToPim/CoreLoweringPatterns.cpp +++ b/src/PIM/Conversion/SpatialToPim/CoreLoweringPatterns.cpp @@ -53,20 +53,6 @@ cloneMappedHelperOperands(Operation* op, IRMapping& mapping, IRRewriter& rewrite } } -static int32_t translateSpatialCoreIdToPimCoreId(size_t spatialCoreId) { return static_cast(spatialCoreId); } - -static FailureOr> getConstantI32Values(ValueRange values) { - SmallVector constants; - constants.reserve(values.size()); - for (Value value : values) { - APInt constantValue; - if (!matchPattern(value, m_ConstantInt(&constantValue))) - return failure(); - constants.push_back(static_cast(constantValue.getSExtValue())); - } - return constants; -} - static int32_t getPimCoreIdForComputeOp(spatial::SpatCompute computeOp, size_t& fallbackCoreId) { if (auto spatialCoreIdAttr = computeOp->getAttrOfType(onnx_mlir::kCoreIdAttrName)) return static_cast(spatialCoreIdAttr.getInt()); @@ -186,25 +172,6 @@ LogicalResult raptor::SpatialToPimPass::lowerComputeOp(spatial::SpatCompute comp continue; } - auto receiveTensorOp = dyn_cast_or_null(input.getDefiningOp()); - if (receiveTensorOp && !blockArg->use_empty()) { - FailureOr> sourceCoreIds = getConstantI32Values(receiveTensorOp.getSourceCoreIds()); - if (failed(sourceCoreIds)) - return receiveTensorOp.emitOpError("expected constant sourceCoreIds"); - for (int32_t& sourceCoreId : *sourceCoreIds) - sourceCoreId = translateSpatialCoreIdToPimCoreId(sourceCoreId); - rewriter.setInsertionPoint(getEarliestUserWithinBlock(*blockArg)); - auto outputType = cast(blockArg->getType()); - auto outputBuffer = createEmptyTensorFromShaped(rewriter, receiveTensorOp.getLoc(), outputType); - Value received = PimReceiveTensorOp::create(rewriter, - receiveTensorOp.getLoc(), - outputBuffer.getType(), - outputBuffer, - rewriter.getDenseI32ArrayAttr(*sourceCoreIds)) - .getOutput(); - blockArg->replaceAllUsesWith(received); - markOpToRemove(receiveTensorOp); - } } if (computeOp.getNumResults() != yieldOp.getNumOperands()) diff --git a/src/PIM/Conversion/SpatialToPim/ReturnPathNormalization.cpp b/src/PIM/Conversion/SpatialToPim/ReturnPathNormalization.cpp index b258253..6efa6b5 100644 --- a/src/PIM/Conversion/SpatialToPim/ReturnPathNormalization.cpp +++ b/src/PIM/Conversion/SpatialToPim/ReturnPathNormalization.cpp @@ -607,8 +607,6 @@ void raptor::SpatialToPimPass::replaceReturnWithOutputBuffers(func::ReturnOp ret return; } - if (auto receiveTensorOp = dyn_cast(op)) - markOpToRemove(receiveTensorOp); }; SmallVector originalOperands(returnOp.getOperands().begin(), returnOp.getOperands().end()); diff --git a/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp b/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp index 7e58697..1b06339 100644 --- a/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp +++ b/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp @@ -156,11 +156,7 @@ void onnx_mlir::raptor::SpatialToPimPass::runOnOperation() { BuiltinDialect>(); target.addLegalOp(); RewritePatternSet initialPatterns(ctx); @@ -234,11 +230,7 @@ void onnx_mlir::raptor::SpatialToPimPass::runOnOperation() { BuiltinDialect>(); coreBodyTarget.addLegalOp(); SmallVector coreOps; @@ -282,9 +274,7 @@ void onnx_mlir::raptor::SpatialToPimPass::runOnOperation() { communicationTarget.addLegalOp(); communicationTarget.addIllegalOp(); RewritePatternSet communicationPatterns(ctx); diff --git a/src/PIM/Dialect/Pim/Pim.td b/src/PIM/Dialect/Pim/Pim.td index 2ade4fd..343ca9e 100644 --- a/src/PIM/Dialect/Pim/Pim.td +++ b/src/PIM/Dialect/Pim/Pim.td @@ -102,42 +102,6 @@ def PimSendOp : PimOp<"send", []> { }]; } -def PimSendTensorOp : PimOp<"send_tensor", []> { - let summary = "Send equal contiguous chunks of one tensor to target cores"; - - let arguments = (ins - PimTensor:$input, - DenseI32ArrayAttr:$targetCoreIds - ); - - let hasVerifier = 1; - let hasCustomAssemblyFormat = 1; -} - -def PimSendBatchOp : PimOp<"send_batch", []> { - let summary = "Send a per-lane tensor to target cores from a batched core"; - - let arguments = (ins - PimTensor:$input, - I32Attr:$size, - DenseI32ArrayAttr:$targetCoreIds - ); - - let hasCustomAssemblyFormat = 1; -} - -def PimSendTensorBatchOp : PimOp<"send_tensor_batch", []> { - let summary = "Send equal contiguous chunks of one per-lane tensor from a batched core"; - - let arguments = (ins - PimTensor:$input, - DenseI32ArrayAttr:$targetCoreIds - ); - - let hasVerifier = 1; - let hasCustomAssemblyFormat = 1; -} - def PimReceiveOp : PimOp<"receive", [DestinationStyleOpInterface]> { let summary = "Receive a tensor from another core"; @@ -162,72 +126,6 @@ def PimReceiveOp : PimOp<"receive", [DestinationStyleOpInterface]> { }]; } -def PimReceiveTensorOp : PimOp<"receive_tensor", [DestinationStyleOpInterface]> { - let summary = "Receive equal contiguous chunks from source cores into one tensor"; - - let arguments = (ins - PimTensor:$outputBuffer, - DenseI32ArrayAttr:$sourceCoreIds - ); - - let results = (outs - PimTensor:$output - ); - - let extraClassDeclaration = [{ - mlir::MutableOperandRange getDpsInitsMutable() { - return getOutputBufferMutable(); - } - }]; - - let hasVerifier = 1; - let hasCustomAssemblyFormat = 1; -} - -def PimReceiveBatchOp : PimOp<"receive_batch", [DestinationStyleOpInterface]> { - let summary = "Receive per-lane tensors from source cores into a batched core"; - - let arguments = (ins - PimTensor:$outputBuffer, - I32Attr:$size, - DenseI32ArrayAttr:$sourceCoreIds - ); - - let results = (outs - PimTensor:$output - ); - - let extraClassDeclaration = [{ - mlir::MutableOperandRange getDpsInitsMutable() { - return getOutputBufferMutable(); - } - }]; - - let hasCustomAssemblyFormat = 1; -} - -def PimReceiveTensorBatchOp : PimOp<"receive_tensor_batch", [DestinationStyleOpInterface]> { - let summary = "Receive equal contiguous chunks into one per-lane tensor inside a batched core"; - - let arguments = (ins - PimTensor:$outputBuffer, - DenseI32ArrayAttr:$sourceCoreIds - ); - - let results = (outs - PimTensor:$output - ); - - let extraClassDeclaration = [{ - mlir::MutableOperandRange getDpsInitsMutable() { - return getOutputBufferMutable(); - } - }]; - - let hasVerifier = 1; - let hasCustomAssemblyFormat = 1; -} - def PimMemCopyHostToDevOp : PimOp<"memcp_hd", [DestinationStyleOpInterface]> { let summary = "Copy a memory region from host memory into device memory"; diff --git a/src/PIM/Dialect/Pim/PimOpsAsm.cpp b/src/PIM/Dialect/Pim/PimOpsAsm.cpp index 2bd4df5..eedd8ef 100644 --- a/src/PIM/Dialect/Pim/PimOpsAsm.cpp +++ b/src/PIM/Dialect/Pim/PimOpsAsm.cpp @@ -28,34 +28,6 @@ static bool parseOptionalKeywordAlias(OpAsmParser& parser, StringRef preferred, return succeeded(parser.parseOptionalKeyword(preferred)) || succeeded(parser.parseOptionalKeyword(legacy)); } -static void printBlockArgumentList(OpAsmPrinter& printer, ArrayRef arguments) { - printer << "("; - for (auto [index, argument] : llvm::enumerate(arguments)) { - if (index != 0) - printer << ", "; - printer.printOperand(argument); - } - printer << ")"; -} - -static ParseResult parseBlockArgumentList(OpAsmParser& parser, SmallVectorImpl& arguments) { - if (parser.parseLParen()) - return failure(); - if (succeeded(parser.parseOptionalRParen())) - return success(); - - OpAsmParser::Argument argument; - if (parser.parseArgument(argument)) - return failure(); - arguments.push_back(argument); - while (succeeded(parser.parseOptionalComma())) { - if (parser.parseArgument(argument)) - return failure(); - arguments.push_back(argument); - } - return parser.parseRParen(); -} - static void printBoundValueList(OpAsmPrinter& printer, ValueRange arguments, ValueRange operands, ListDelimiter delimiter) { printCompressedValueList(printer, arguments, delimiter); @@ -98,12 +70,6 @@ static void printCoreIdList(OpAsmPrinter& printer, StringRef keyword, ArrayRef& coreIds) { - if (failed(parser.parseOptionalKeyword(keyword))) - return success(); - return parseCompressedIntegerList(parser, coreIds); -} - } // namespace void PimCoreOp::print(OpAsmPrinter& printer) { @@ -295,198 +261,6 @@ ParseResult PimYieldOp::parse(OpAsmParser& parser, OperationState& result) { return parser.resolveOperands(outputs, outputTypes, parser.getCurrentLocation(), result.operands); } -void PimSendBatchOp::print(OpAsmPrinter& printer) { - printer << " "; - printer.printOperand(getInput()); - printCoreIdList(printer, "to", getTargetCoreIds()); - printer.printOptionalAttrDict((*this)->getAttrs(), {getTargetCoreIdsAttrName().getValue()}); - printer << " : "; - printer.printType(getInput().getType()); -} - -ParseResult PimSendBatchOp::parse(OpAsmParser& parser, OperationState& result) { - OpAsmParser::UnresolvedOperand input; - Type inputType; - SmallVector targetCoreIds; - - if (parser.parseOperand(input) || parseOptionalCoreIdList(parser, "to", targetCoreIds) - || parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || parser.parseType(inputType)) - return failure(); - - if (!targetCoreIds.empty() && result.attributes.get("targetCoreIds")) - return parser.emitError(parser.getCurrentLocation(), - "targetCoreIds cannot be specified both positionally and in attr-dict"); - if (!targetCoreIds.empty()) - result.addAttribute("targetCoreIds", getDenseI32ArrayAttr(parser, targetCoreIds)); - - return parser.resolveOperand(input, inputType, result.operands); -} - -void PimSendTensorBatchOp::print(OpAsmPrinter& printer) { - printer << " "; - printer.printOperand(getInput()); - printCoreIdList(printer, "to", getTargetCoreIds()); - printer.printOptionalAttrDict((*this)->getAttrs(), {getTargetCoreIdsAttrName().getValue()}); - printer << " : "; - printer.printType(getInput().getType()); -} - -ParseResult PimSendTensorBatchOp::parse(OpAsmParser& parser, OperationState& result) { - OpAsmParser::UnresolvedOperand input; - Type inputType; - SmallVector targetCoreIds; - - if (parser.parseOperand(input) || parseOptionalCoreIdList(parser, "to", targetCoreIds) - || parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || parser.parseType(inputType)) - return failure(); - - if (!targetCoreIds.empty() && result.attributes.get("targetCoreIds")) - return parser.emitError(parser.getCurrentLocation(), - "targetCoreIds cannot be specified both positionally and in attr-dict"); - if (!targetCoreIds.empty()) - result.addAttribute("targetCoreIds", getDenseI32ArrayAttr(parser, targetCoreIds)); - - return parser.resolveOperand(input, inputType, result.operands); -} - -void PimSendTensorOp::print(OpAsmPrinter& printer) { - printer << " "; - printer.printOperand(getInput()); - printCoreIdList(printer, "to", getTargetCoreIds()); - printer.printOptionalAttrDict((*this)->getAttrs(), {getTargetCoreIdsAttrName().getValue()}); - printer << " : "; - printer.printType(getInput().getType()); -} - -ParseResult PimSendTensorOp::parse(OpAsmParser& parser, OperationState& result) { - OpAsmParser::UnresolvedOperand input; - Type inputType; - SmallVector targetCoreIds; - - if (parser.parseOperand(input) || parseOptionalCoreIdList(parser, "to", targetCoreIds) - || parser.parseOptionalAttrDict(result.attributes) || parser.parseColon() || parser.parseType(inputType)) - return failure(); - - if (!targetCoreIds.empty() && result.attributes.get("targetCoreIds")) - return parser.emitError(parser.getCurrentLocation(), - "targetCoreIds cannot be specified both positionally and in attr-dict"); - if (!targetCoreIds.empty()) - result.addAttribute("targetCoreIds", getDenseI32ArrayAttr(parser, targetCoreIds)); - - return parser.resolveOperand(input, inputType, result.operands); -} - -void PimReceiveTensorOp::print(OpAsmPrinter& printer) { - printCoreIdList(printer, "from", getSourceCoreIds()); - printer << " into "; - printOpenDelimiter(printer, ListDelimiter::Paren); - printer.printOperand(getOutputBuffer()); - printCloseDelimiter(printer, ListDelimiter::Paren); - printer.printOptionalAttrDict((*this)->getAttrs(), {getSourceCoreIdsAttrName().getValue()}); - printer << " : "; - printer.printType(getOutputBuffer().getType()); - printer << " -> "; - printer.printType(getOutput().getType()); -} - -ParseResult PimReceiveTensorOp::parse(OpAsmParser& parser, OperationState& result) { - OpAsmParser::UnresolvedOperand outputBuffer; - Type outputBufferType; - Type outputType; - SmallVector sourceCoreIds; - - if (parseOptionalCoreIdList(parser, "from", sourceCoreIds) || parser.parseKeyword("into") || parser.parseLParen() - || parser.parseOperand(outputBuffer) || parser.parseRParen() || parser.parseOptionalAttrDict(result.attributes) - || parser.parseColon() || parser.parseType(outputBufferType) || parser.parseArrow() - || parser.parseType(outputType)) - return failure(); - - if (!sourceCoreIds.empty() && result.attributes.get("sourceCoreIds")) - return parser.emitError(parser.getCurrentLocation(), - "sourceCoreIds cannot be specified both positionally and in attr-dict"); - if (!sourceCoreIds.empty()) - result.addAttribute("sourceCoreIds", getDenseI32ArrayAttr(parser, sourceCoreIds)); - - if (parser.resolveOperand(outputBuffer, outputBufferType, result.operands)) - return failure(); - result.addTypes(outputType); - return success(); -} - -void PimReceiveBatchOp::print(OpAsmPrinter& printer) { - printCoreIdList(printer, "from", getSourceCoreIds()); - printer << " into "; - printOpenDelimiter(printer, ListDelimiter::Paren); - printer.printOperand(getOutputBuffer()); - printCloseDelimiter(printer, ListDelimiter::Paren); - printer.printOptionalAttrDict((*this)->getAttrs(), {getSourceCoreIdsAttrName().getValue()}); - printer << " : "; - printer.printType(getOutputBuffer().getType()); - printer << " -> "; - printer.printType(getOutput().getType()); -} - -ParseResult PimReceiveBatchOp::parse(OpAsmParser& parser, OperationState& result) { - OpAsmParser::UnresolvedOperand outputBuffer; - Type outputBufferType; - Type outputType; - SmallVector sourceCoreIds; - - if (parseOptionalCoreIdList(parser, "from", sourceCoreIds) || parser.parseKeyword("into") || parser.parseLParen() - || parser.parseOperand(outputBuffer) || parser.parseRParen() || parser.parseOptionalAttrDict(result.attributes) - || parser.parseColon() || parser.parseType(outputBufferType) || parser.parseArrow() - || parser.parseType(outputType)) - return failure(); - - if (!sourceCoreIds.empty() && result.attributes.get("sourceCoreIds")) - return parser.emitError(parser.getCurrentLocation(), - "sourceCoreIds cannot be specified both positionally and in attr-dict"); - if (!sourceCoreIds.empty()) - result.addAttribute("sourceCoreIds", getDenseI32ArrayAttr(parser, sourceCoreIds)); - - if (parser.resolveOperand(outputBuffer, outputBufferType, result.operands)) - return failure(); - result.addTypes(outputType); - return success(); -} - -void PimReceiveTensorBatchOp::print(OpAsmPrinter& printer) { - printCoreIdList(printer, "from", getSourceCoreIds()); - printer << " into "; - printOpenDelimiter(printer, ListDelimiter::Paren); - printer.printOperand(getOutputBuffer()); - printCloseDelimiter(printer, ListDelimiter::Paren); - printer.printOptionalAttrDict((*this)->getAttrs(), {getSourceCoreIdsAttrName().getValue()}); - printer << " : "; - printer.printType(getOutputBuffer().getType()); - printer << " -> "; - printer.printType(getOutput().getType()); -} - -ParseResult PimReceiveTensorBatchOp::parse(OpAsmParser& parser, OperationState& result) { - OpAsmParser::UnresolvedOperand outputBuffer; - Type outputBufferType; - Type outputType; - SmallVector sourceCoreIds; - - if (parseOptionalCoreIdList(parser, "from", sourceCoreIds) || parser.parseKeyword("into") || parser.parseLParen() - || parser.parseOperand(outputBuffer) || parser.parseRParen() || parser.parseOptionalAttrDict(result.attributes) - || parser.parseColon() || parser.parseType(outputBufferType) || parser.parseArrow() - || parser.parseType(outputType)) - return failure(); - - if (!sourceCoreIds.empty() && result.attributes.get("sourceCoreIds")) - return parser.emitError(parser.getCurrentLocation(), - "sourceCoreIds cannot be specified both positionally and in attr-dict"); - if (!sourceCoreIds.empty()) - result.addAttribute("sourceCoreIds", getDenseI32ArrayAttr(parser, sourceCoreIds)); - - if (parser.resolveOperand(outputBuffer, outputBufferType, result.operands)) - return failure(); - result.addTypes(outputType); - return success(); -} - void PimConcatOp::print(OpAsmPrinter& printer) { printer << " axis " << getAxis() << " "; printCompressedValueSequence(printer, getInputs()); diff --git a/src/PIM/Dialect/Pim/PimOpsVerify.cpp b/src/PIM/Dialect/Pim/PimOpsVerify.cpp index 3b9f25d..168f0d8 100644 --- a/src/PIM/Dialect/Pim/PimOpsVerify.cpp +++ b/src/PIM/Dialect/Pim/PimOpsVerify.cpp @@ -90,56 +90,6 @@ static LogicalResult verifyCompatibleShapedTypes(Operation* op, Type lhs, Type r return success(); } -static LogicalResult verifyTensorCommunication(Operation* op, Type type, ArrayRef coreIds, StringRef kind) { - if (coreIds.empty()) - return op->emitError() << kind << " must carry at least one chunk"; - - auto shapedType = dyn_cast(type); - if (!shapedType || !shapedType.hasStaticShape()) - return op->emitError() << kind << " requires a static shaped tensor or memref"; - - int64_t elementBits = shapedType.getElementTypeBitWidth(); - if (elementBits <= 0 || elementBits % 8 != 0) - return op->emitError() << kind << " requires byte-sized elements"; - - int64_t totalBytes = shapedType.getNumElements() * elementBits / 8; - if (totalBytes % static_cast(coreIds.size()) != 0) - return op->emitError() << kind << " tensor byte size must be divisible by the number of core ids"; - - return success(); -} - -static LogicalResult -verifyTensorBatchCommunication(Operation* op, Type type, ArrayRef coreIds, StringRef kind) { - if (coreIds.empty()) - return op->emitError() << kind << " must carry at least one chunk"; - - auto coreBatchOp = op->getParentOfType(); - if (!coreBatchOp) - return op->emitError() << kind << " must be nested inside pim.core_batch"; - - int32_t laneCount = coreBatchOp.getLaneCount(); - if (laneCount <= 0) - return op->emitError() << kind << " requires a positive parent laneCount"; - if (coreIds.size() % static_cast(laneCount) != 0) - return op->emitError() << kind << " core id count must be divisible by the parent laneCount"; - - auto shapedType = dyn_cast(type); - if (!shapedType || !shapedType.hasStaticShape()) - return op->emitError() << kind << " requires a static shaped tensor or memref"; - - int64_t elementBits = shapedType.getElementTypeBitWidth(); - if (elementBits <= 0 || elementBits % 8 != 0) - return op->emitError() << kind << " requires byte-sized elements"; - - int64_t chunkCount = static_cast(coreIds.size()) / laneCount; - int64_t totalBytes = shapedType.getNumElements() * elementBits / 8; - if (totalBytes % chunkCount != 0) - return op->emitError() << kind << " tensor byte size must be divisible by the chunk count per lane"; - - return success(); -} - static FailureOr> getWeightShapeForVMM(Value weight) { auto shapedType = dyn_cast(weight.getType()); if (!shapedType) @@ -177,31 +127,6 @@ LogicalResult PimCoreBatchOp::verify() { return verifyOnlyConstantExternalValues(getOperation(), getBody(), "pim.core_batch"); } -LogicalResult PimSendTensorOp::verify() { - return verifyTensorCommunication(getOperation(), getInput().getType(), getTargetCoreIds(), "send_tensor"); -} - -LogicalResult PimSendTensorBatchOp::verify() { - return verifyTensorBatchCommunication(getOperation(), getInput().getType(), getTargetCoreIds(), "send_tensor_batch"); -} - -LogicalResult PimReceiveTensorOp::verify() { - if (failed(verifyCompatibleShapedTypes( - getOperation(), getOutputBuffer().getType(), getOutput().getType(), "output buffer and output must match"))) - return failure(); - - return verifyTensorCommunication(getOperation(), getOutput().getType(), getSourceCoreIds(), "receive_tensor"); -} - -LogicalResult PimReceiveTensorBatchOp::verify() { - if (failed(verifyCompatibleShapedTypes( - getOperation(), getOutputBuffer().getType(), getOutput().getType(), "output buffer and output must match"))) - return failure(); - - return verifyTensorBatchCommunication( - getOperation(), getOutput().getType(), getSourceCoreIds(), "receive_tensor_batch"); -} - LogicalResult PimVMMOp::verify() { if (failed(verifyCompatibleShapedTypes( getOperation(), getOutputBuffer().getType(), getOutput().getType(), "output buffer and output must match"))) diff --git a/src/PIM/Dialect/Pim/Transforms/Bufferization/OpBufferizationInterfaces.cpp b/src/PIM/Dialect/Pim/Transforms/Bufferization/OpBufferizationInterfaces.cpp index 1bf7c48..61ce65c 100644 --- a/src/PIM/Dialect/Pim/Transforms/Bufferization/OpBufferizationInterfaces.cpp +++ b/src/PIM/Dialect/Pim/Transforms/Bufferization/OpBufferizationInterfaces.cpp @@ -157,72 +157,6 @@ struct ReceiveOpInterface : DstBufferizableOpInterfaceExternalModel { - bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { - return !cast(op).isDpsInit(&opOperand); - } - - LogicalResult bufferize(Operation* op, - RewriterBase& rewriter, - const BufferizationOptions& options, - BufferizationState& state) const { - auto receiveOp = cast(op); - auto outputBufferOpt = getBufferOrValue(rewriter, receiveOp.getOutputBuffer(), options, state); - if (failed(outputBufferOpt)) - return failure(); - - replaceOpWithNewBufferizedOp(rewriter, - op, - outputBufferOpt->getType(), - *outputBufferOpt, - receiveOp.getSizeAttr(), - receiveOp.getSourceCoreIdsAttr()); - return success(); - } -}; - -struct ReceiveTensorOpInterface -: DstBufferizableOpInterfaceExternalModel { - bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { - return !cast(op).isDpsInit(&opOperand); - } - - LogicalResult bufferize(Operation* op, - RewriterBase& rewriter, - const BufferizationOptions& options, - BufferizationState& state) const { - auto receiveOp = cast(op); - auto outputBufferOpt = getBufferOrValue(rewriter, receiveOp.getOutputBuffer(), options, state); - if (failed(outputBufferOpt)) - return failure(); - - replaceOpWithNewBufferizedOp( - rewriter, op, outputBufferOpt->getType(), *outputBufferOpt, receiveOp.getSourceCoreIdsAttr()); - return success(); - } -}; - -struct ReceiveTensorBatchOpInterface -: DstBufferizableOpInterfaceExternalModel { - bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { - return !cast(op).isDpsInit(&opOperand); - } - - LogicalResult bufferize(Operation* op, - RewriterBase& rewriter, - const BufferizationOptions& options, - BufferizationState& state) const { - auto receiveOp = cast(op); - auto outputBufferOpt = getBufferOrValue(rewriter, receiveOp.getOutputBuffer(), options, state); - if (failed(outputBufferOpt)) - return failure(); - - replaceOpWithNewBufferizedOp( - rewriter, op, outputBufferOpt->getType(), *outputBufferOpt, receiveOp.getSourceCoreIdsAttr()); - return success(); - } -}; - struct ConcatOpInterface : DstBufferizableOpInterfaceExternalModel { bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return !cast(op).isDpsInit(&opOperand); @@ -252,30 +186,6 @@ struct ConcatOpInterface : DstBufferizableOpInterfaceExternalModel { - bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return true; } - - bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; } - - AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { - return {}; - } - - LogicalResult bufferize(Operation* op, - RewriterBase& rewriter, - const BufferizationOptions& options, - BufferizationState& state) const { - auto sendOp = cast(op); - auto inputOpt = getBufferOrValue(rewriter, sendOp.getInput(), options, state); - if (failed(inputOpt)) - return failure(); - - replaceOpWithNewBufferizedOp( - rewriter, op, materializeContiguousMemRef(*inputOpt, op->getLoc(), rewriter), sendOp.getTargetCoreIdsAttr()); - return success(); - } -}; - struct SendOpInterface : BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return true; } @@ -303,58 +213,6 @@ struct SendOpInterface : BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return true; } - - bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; } - - AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { - return {}; - } - - LogicalResult bufferize(Operation* op, - RewriterBase& rewriter, - const BufferizationOptions& options, - BufferizationState& state) const { - auto sendOp = cast(op); - auto inputOpt = getBufferOrValue(rewriter, sendOp.getInput(), options, state); - if (failed(inputOpt)) - return failure(); - - replaceOpWithNewBufferizedOp(rewriter, - op, - materializeContiguousMemRef(*inputOpt, op->getLoc(), rewriter), - sendOp.getSizeAttr(), - sendOp.getTargetCoreIdsAttr()); - return success(); - } -}; - -struct SendTensorBatchOpInterface -: BufferizableOpInterface::ExternalModel { - bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return true; } - - bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; } - - AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { - return {}; - } - - LogicalResult bufferize(Operation* op, - RewriterBase& rewriter, - const BufferizationOptions& options, - BufferizationState& state) const { - auto sendOp = cast(op); - auto inputOpt = getBufferOrValue(rewriter, sendOp.getInput(), options, state); - if (failed(inputOpt)) - return failure(); - - replaceOpWithNewBufferizedOp( - rewriter, op, materializeContiguousMemRef(*inputOpt, op->getLoc(), rewriter), sendOp.getTargetCoreIdsAttr()); - return success(); - } -}; - struct CoreOpInterface : BufferizableOpInterface::ExternalModel { bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return true; } @@ -699,13 +557,7 @@ void registerOpBufferizationInterfaces(DialectRegistry& registry) { PimCoreOp::attachInterface(*ctx); PimCoreBatchOp::attachInterface(*ctx); PimReceiveOp::attachInterface(*ctx); - PimReceiveTensorOp::attachInterface(*ctx); - PimReceiveTensorBatchOp::attachInterface(*ctx); - PimReceiveBatchOp::attachInterface(*ctx); PimSendOp::attachInterface(*ctx); - PimSendBatchOp::attachInterface(*ctx); - PimSendTensorBatchOp::attachInterface(*ctx); - PimSendTensorOp::attachInterface(*ctx); PimConcatOp::attachInterface(*ctx); PimMemCopyHostToDevOp::attachInterface(*ctx); PimMemCopyHostToDevBatchOp::attachInterface(*ctx); diff --git a/src/PIM/Dialect/Spatial/Spatial.td b/src/PIM/Dialect/Spatial/Spatial.td index fff7641..38d0a9f 100644 --- a/src/PIM/Dialect/Spatial/Spatial.td +++ b/src/PIM/Dialect/Spatial/Spatial.td @@ -194,111 +194,6 @@ def SpatChannelReceiveOp : SpatOp<"channel_receive", []> { }]; } -def SpatChannelSendTensorOp : SpatOp<"channel_send_tensor", [AttrSizedOperandSegments]> { - let summary = "Send equal contiguous chunks of one tensor through logical channels"; - - let arguments = (ins - Variadic:$channelIds, - Variadic:$sourceCoreIds, - Variadic:$targetCoreIds, - SpatTensor:$input - ); - - let hasVerifier = 1; - let assemblyFormat = [{ - $input `channels` `(` $channelIds `)` `from` `(` $sourceCoreIds `)` `to` `(` $targetCoreIds `)` attr-dict `:` type($input) - }]; -} - -def SpatChannelReceiveTensorOp : SpatOp<"channel_receive_tensor", [AttrSizedOperandSegments]> { - let summary = "Receive equal contiguous chunks of one tensor from logical channels"; - - let arguments = (ins - Variadic:$channelIds, - Variadic:$sourceCoreIds, - Variadic:$targetCoreIds - ); - - let results = (outs - SpatTensor:$output - ); - - let hasVerifier = 1; - let assemblyFormat = [{ - `channels` `(` $channelIds `)` `from` `(` $sourceCoreIds `)` `to` `(` $targetCoreIds `)` attr-dict `:` type($output) - }]; -} - -def SpatChannelSendBatchOp : SpatOp<"channel_send_batch", [AttrSizedOperandSegments]> { - let summary = "Send per-lane tensors through logical channels in a batch body"; - - let arguments = (ins - Variadic:$channelIds, - Variadic:$sourceCoreIds, - Variadic:$targetCoreIds, - SpatTensor:$input - ); - - let hasVerifier = 1; - let assemblyFormat = [{ - $input `channels` `(` $channelIds `)` `from` `(` $sourceCoreIds `)` `to` `(` $targetCoreIds `)` attr-dict `:` type($input) - }]; -} - -def SpatChannelSendTensorBatchOp : SpatOp<"channel_send_tensor_batch", [AttrSizedOperandSegments]> { - let summary = "Send equal contiguous chunks of one per-lane tensor through logical channels in a batch body"; - - let arguments = (ins - Variadic:$channelIds, - Variadic:$sourceCoreIds, - Variadic:$targetCoreIds, - SpatTensor:$input - ); - - let hasVerifier = 1; - let assemblyFormat = [{ - $input `channels` `(` $channelIds `)` `from` `(` $sourceCoreIds `)` `to` `(` $targetCoreIds `)` attr-dict `:` type($input) - }]; -} - -def SpatChannelReceiveBatchOp : SpatOp<"channel_receive_batch", [AttrSizedOperandSegments]> { - let summary = "Receive a per-lane tensor through logical channels in a batch body"; - - let arguments = (ins - Variadic:$channelIds, - Variadic:$sourceCoreIds, - Variadic:$targetCoreIds - ); - - let results = (outs - SpatTensor:$output - ); - - let hasVerifier = 1; - let assemblyFormat = [{ - `channels` `(` $channelIds `)` `from` `(` $sourceCoreIds `)` `to` `(` $targetCoreIds `)` attr-dict `:` type($output) - }]; -} - -def SpatChannelReceiveTensorBatchOp : SpatOp<"channel_receive_tensor_batch", [AttrSizedOperandSegments]> { - let summary = "Receive equal contiguous chunks of one per-lane tensor through logical channels in a batch body"; - - let arguments = (ins - Variadic:$channelIds, - Variadic:$sourceCoreIds, - Variadic:$targetCoreIds - ); - - let results = (outs - SpatTensor:$output - ); - - let hasVerifier = 1; - let assemblyFormat = [{ - `channels` `(` $channelIds `)` `from` `(` $sourceCoreIds `)` `to` `(` $targetCoreIds `)` attr-dict `:` type($output) - }]; -} - //===----------------------------------------------------------------------===// // Math //===----------------------------------------------------------------------===// diff --git a/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp b/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp index 38646c4..9b315fa 100644 --- a/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp +++ b/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp @@ -95,13 +95,6 @@ static FailureOr> getWeightShapeForWeightedOp(Value weight) { return shapedType.getShape(); } -static FailureOr getParentBatchLaneCount(Operation* op) { - auto batchOp = op->getParentOfType(); - if (!batchOp) - return failure(); - return batchOp.getLaneCount(); -} - static bool isBatchOutputArgument(SpatComputeBatch batchOp, Value value) { if (batchOp.getNumResults() == 0) return false; @@ -233,68 +226,6 @@ static LogicalResult verifyStaticUnitStrideParallelInsertSliceOp(tensor::Paralle return success(); } -static LogicalResult verifyTensorChannelSizes( - Operation* op, Type type, size_t channelCount, size_t sourceCoreCount, size_t targetCoreCount, StringRef kind) { - if (channelCount != sourceCoreCount || channelCount != targetCoreCount) - return op->emitError("channelIds, sourceCoreIds, and targetCoreIds must have the same length"); - if (channelCount == 0) - return op->emitError() << kind << " must carry at least one chunk"; - - auto shapedType = dyn_cast(type); - if (!shapedType || !shapedType.hasStaticShape()) - return op->emitError() << kind << " requires a static shaped tensor"; - - int64_t elementBits = shapedType.getElementTypeBitWidth(); - if (elementBits <= 0 || elementBits % 8 != 0) - return op->emitError() << kind << " requires byte-sized elements"; - - int64_t totalBytes = shapedType.getNumElements() * elementBits / 8; - if (totalBytes % static_cast(channelCount) != 0) - return op->emitError() << kind << " tensor byte size must be divisible by the number of channel ids"; - return success(); -} - -static LogicalResult -verifyBatchChannelSizes(Operation* op, size_t channelCount, size_t sourceCoreCount, size_t targetCoreCount) { - if (channelCount != sourceCoreCount || channelCount != targetCoreCount) - return op->emitError("channelIds, sourceCoreIds, and targetCoreIds must have the same length"); - - auto laneCount = getParentBatchLaneCount(op); - if (failed(laneCount)) - return op->emitError("must be nested inside spat.compute_batch"); - if (channelCount != static_cast(*laneCount)) - return op->emitError("channel metadata length must match parent laneCount"); - - return success(); -} - -static LogicalResult verifyTensorBatchChannelSizes( - Operation* op, Type type, size_t channelCount, size_t sourceCoreCount, size_t targetCoreCount, StringRef kind) { - if (channelCount != sourceCoreCount || channelCount != targetCoreCount) - return op->emitError("channelIds, sourceCoreIds, and targetCoreIds must have the same length"); - - auto laneCount = getParentBatchLaneCount(op); - if (failed(laneCount)) - return op->emitError("must be nested inside spat.compute_batch"); - if (channelCount == 0 || channelCount % static_cast(*laneCount) != 0) - return op->emitError() << kind << " channel metadata length must be a positive multiple of parent laneCount"; - - auto shapedType = dyn_cast(type); - if (!shapedType || !shapedType.hasStaticShape()) - return op->emitError() << kind << " requires a static shaped tensor"; - - int64_t elementBits = shapedType.getElementTypeBitWidth(); - if (elementBits <= 0 || elementBits % 8 != 0) - return op->emitError() << kind << " requires byte-sized elements"; - - int64_t chunkCount = static_cast(channelCount) / *laneCount; - int64_t totalBytes = shapedType.getNumElements() * elementBits / 8; - if (totalBytes % chunkCount != 0) - return op->emitError() << kind << " tensor byte size must be divisible by the chunk count per lane"; - - return success(); -} - static Region* getParentRegion(Value value) { if (auto blockArg = dyn_cast(value)) return blockArg.getOwner()->getParent(); @@ -564,52 +495,6 @@ LogicalResult SpatCompute::verify() { return success(); } -LogicalResult SpatChannelSendTensorOp::verify() { - return verifyTensorChannelSizes(getOperation(), - getInput().getType(), - getChannelIds().size(), - getSourceCoreIds().size(), - getTargetCoreIds().size(), - "channel_send_tensor"); -} - -LogicalResult SpatChannelReceiveTensorOp::verify() { - return verifyTensorChannelSizes(getOperation(), - getOutput().getType(), - getChannelIds().size(), - getSourceCoreIds().size(), - getTargetCoreIds().size(), - "channel_receive_tensor"); -} - -LogicalResult SpatChannelSendBatchOp::verify() { - return verifyBatchChannelSizes( - getOperation(), getChannelIds().size(), getSourceCoreIds().size(), getTargetCoreIds().size()); -} - -LogicalResult SpatChannelSendTensorBatchOp::verify() { - return verifyTensorBatchChannelSizes(getOperation(), - getInput().getType(), - getChannelIds().size(), - getSourceCoreIds().size(), - getTargetCoreIds().size(), - "channel_send_tensor_batch"); -} - -LogicalResult SpatChannelReceiveBatchOp::verify() { - return verifyBatchChannelSizes( - getOperation(), getChannelIds().size(), getSourceCoreIds().size(), getTargetCoreIds().size()); -} - -LogicalResult SpatChannelReceiveTensorBatchOp::verify() { - return verifyTensorBatchChannelSizes(getOperation(), - getOutput().getType(), - getChannelIds().size(), - getSourceCoreIds().size(), - getTargetCoreIds().size(), - "channel_receive_tensor_batch"); -} - LogicalResult SpatComputeBatch::verify() { int32_t count = getLaneCount(); if (count <= 0) diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp index e5bd394..4552762 100644 --- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp +++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp @@ -79,8 +79,6 @@ struct MergeIrCounts { uint64_t topLevelComputeBatchCount = 0; uint64_t scalarChannelSendCount = 0; uint64_t scalarChannelReceiveCount = 0; - uint64_t tensorChannelSendCount = 0; - uint64_t tensorChannelReceiveCount = 0; uint64_t wvmmCount = 0; uint64_t vaddCount = 0; uint64_t scfForCount = 0; @@ -95,10 +93,6 @@ MergeIrCounts collectMergeIrCounts(func::FuncOp funcOp) { ++counts.scalarChannelSendCount; else if (isa(nestedOp)) ++counts.scalarChannelReceiveCount; - else if (isa(nestedOp)) - ++counts.tensorChannelSendCount; - else if (isa(nestedOp)) - ++counts.tensorChannelReceiveCount; else if (isa(nestedOp)) ++counts.wvmmCount; else if (isa(nestedOp)) @@ -130,9 +124,8 @@ void emitMergeIrCounts(StringRef phaseName, func::FuncOp funcOp) { << " compute=" << counts.topLevelComputeCount << " compute_batch=" << counts.topLevelComputeBatchCount << " scalar_send=" << counts.scalarChannelSendCount << " scalar_recv=" << counts.scalarChannelReceiveCount - << " tensor_send=" << counts.tensorChannelSendCount - << " tensor_recv=" << counts.tensorChannelReceiveCount << " wvmm=" << counts.wvmmCount - << " vadd=" << counts.vaddCount << " scf_for=" << counts.scfForCount << "\n"; + << " wvmm=" << counts.wvmmCount << " vadd=" << counts.vaddCount + << " scf_for=" << counts.scfForCount << "\n"; } static std::optional getComputeCoreId(SpatCompute compute) { diff --git a/src/PIM/Pass/PimCodegen/VerificationPass.cpp b/src/PIM/Pass/PimCodegen/VerificationPass.cpp index 2b44771..8e1dcfc 100644 --- a/src/PIM/Pass/PimCodegen/VerificationPass.cpp +++ b/src/PIM/Pass/PimCodegen/VerificationPass.cpp @@ -150,13 +150,7 @@ static bool isSupportedCoreInstructionOp(Operation* op) { pim::PimMemCopyDevToHostOp, pim::PimMemCopyOp, pim::PimReceiveOp, - pim::PimReceiveBatchOp, - pim::PimReceiveTensorOp, - pim::PimReceiveTensorBatchOp, pim::PimSendOp, - pim::PimSendBatchOp, - pim::PimSendTensorOp, - pim::PimSendTensorBatchOp, pim::PimConcatOp, pim::PimVMMOp, pim::PimTransposeOp, @@ -173,18 +167,6 @@ static bool isSupportedCoreInstructionOp(Operation* op) { memref::GetGlobalOp>(op); } -static FailureOr getStaticByteSizedShapedType(Type type) { - auto shapedType = dyn_cast(type); - if (!shapedType || !shapedType.hasStaticShape()) - return failure(); - - int64_t elementBits = shapedType.getElementTypeBitWidth(); - if (elementBits <= 0 || elementBits % 8 != 0) - return failure(); - - return shapedType; -} - static LogicalResult verifyBatchOpSemantics(Operation& op, const StaticValueKnowledge& knowledge, pim::CappedDiagnosticReporter& diagnostics) { @@ -203,73 +185,6 @@ static LogicalResult verifyBatchOpSemantics(Operation& op, return success(!hasFailure); } - if (auto sendBatchOp = dyn_cast(op)) { - if (sendBatchOp.getTargetCoreIds().size() != static_cast(sendBatchOp->getParentOfType() - .getLaneCount())) { - reportFailure([](Operation* illegalOp) { - illegalOp->emitOpError("targetCoreIds size must match parent laneCount"); - }); - } - return success(!hasFailure); - } - - if (auto receiveBatchOp = dyn_cast(op)) { - if (receiveBatchOp.getSourceCoreIds().size() - != static_cast(receiveBatchOp->getParentOfType().getLaneCount())) { - reportFailure([](Operation* illegalOp) { - illegalOp->emitOpError("sourceCoreIds size must match parent laneCount"); - }); - } - return success(!hasFailure); - } - - auto verifyTensorBatchCommunication = [&](Value tensorValue, ArrayRef coreIds, StringRef kind) { - if (coreIds.empty()) { - reportFailure([&](Operation* illegalOp) { illegalOp->emitOpError() << kind << " must carry at least one chunk"; }); - return; - } - - auto parentBatchOp = op.getParentOfType(); - int32_t laneCount = parentBatchOp.getLaneCount(); - if (laneCount <= 0) { - reportFailure([&](Operation* illegalOp) { - illegalOp->emitOpError() << kind << " requires a positive parent laneCount"; - }); - return; - } - if (coreIds.size() % static_cast(laneCount) != 0) { - reportFailure([&](Operation* illegalOp) { - illegalOp->emitOpError() << kind << " core id count must be divisible by the parent laneCount"; - }); - return; - } - - auto shapedType = getStaticByteSizedShapedType(tensorValue.getType()); - if (failed(shapedType)) { - reportFailure([&](Operation* illegalOp) { - illegalOp->emitOpError() << kind << " requires a static shaped tensor or memref with byte-sized elements"; - }); - return; - } - - int64_t chunkCount = static_cast(coreIds.size()) / laneCount; - int64_t totalBytes = (*shapedType).getNumElements() * (*shapedType).getElementTypeBitWidth() / 8; - if (totalBytes % chunkCount != 0) { - reportFailure([&](Operation* illegalOp) { - illegalOp->emitOpError() << kind << " tensor byte size must be divisible by the chunk count per lane"; - }); - } - }; - - if (auto sendTensorBatchOp = dyn_cast(op)) - verifyTensorBatchCommunication(sendTensorBatchOp.getInput(), - sendTensorBatchOp.getTargetCoreIds(), - "send_tensor_batch"); - else if (auto receiveTensorBatchOp = dyn_cast(op)) - verifyTensorBatchCommunication(receiveTensorBatchOp.getOutput(), - receiveTensorBatchOp.getSourceCoreIds(), - "receive_tensor_batch"); - return success(!hasFailure); }