Compare commits
3 Commits
ea61540e08
...
8d95c604a6
| Author | SHA1 | Date | |
|---|---|---|---|
| 8d95c604a6 | |||
| 55eda487dc | |||
| 061139aefb |
@@ -1,7 +1,6 @@
|
|||||||
#include "src/Accelerators/PIM/Common/IR/SubviewUtils.hpp"
|
|
||||||
|
|
||||||
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
||||||
|
|
||||||
|
#include "src/Accelerators/PIM/Common/IR/SubviewUtils.hpp"
|
||||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||||
|
|
||||||
using namespace mlir;
|
using namespace mlir;
|
||||||
|
|||||||
@@ -1,8 +1,7 @@
|
|||||||
#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"
|
|
||||||
|
|
||||||
#include "llvm/Support/Format.h"
|
#include "llvm/Support/Format.h"
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Common/Support/FileSystemUtils.hpp"
|
#include "src/Accelerators/PIM/Common/Support/FileSystemUtils.hpp"
|
||||||
|
#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"
|
||||||
|
|
||||||
namespace onnx_mlir {
|
namespace onnx_mlir {
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,9 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "llvm/ADT/STLExtras.h"
|
|
||||||
#include "llvm/ADT/ArrayRef.h"
|
#include "llvm/ADT/ArrayRef.h"
|
||||||
|
#include "llvm/ADT/STLExtras.h"
|
||||||
#include "llvm/Support/raw_ostream.h"
|
#include "llvm/Support/raw_ostream.h"
|
||||||
|
|
||||||
#include <cstdint>
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|||||||
@@ -70,9 +70,7 @@ inline void writeUint32LE(llvm::raw_ostream& os, uint32_t value) {
|
|||||||
os.write(bytes.data(), bytes.size());
|
os.write(bytes.data(), bytes.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void writeInt32LE(llvm::raw_ostream& os, int32_t value) {
|
inline void writeInt32LE(llvm::raw_ostream& os, int32_t value) { writeUint32LE(os, static_cast<uint32_t>(value)); }
|
||||||
writeUint32LE(os, static_cast<uint32_t>(value));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void writeHeader(llvm::raw_ostream& os) {
|
inline void writeHeader(llvm::raw_ostream& os) {
|
||||||
os.write(kMagic, sizeof(kMagic));
|
os.write(kMagic, sizeof(kMagic));
|
||||||
@@ -235,9 +233,7 @@ inline InstructionRecord makeInstructionRecord(const llvm::json::Object& instruc
|
|||||||
case Opcode::sldi:
|
case Opcode::sldi:
|
||||||
case Opcode::saddi:
|
case Opcode::saddi:
|
||||||
case Opcode::smuli:
|
case Opcode::smuli:
|
||||||
case Opcode::lldi:
|
case Opcode::lldi: record.r2OrImm = getOptionalInt(instruction, "imm"); break;
|
||||||
record.r2OrImm = getOptionalInt(instruction, "imm");
|
|
||||||
break;
|
|
||||||
case Opcode::mvmul:
|
case Opcode::mvmul:
|
||||||
record.r2OrImm = getOptionalInt(instruction, "mbiw");
|
record.r2OrImm = getOptionalInt(instruction, "mbiw");
|
||||||
record.generic1 = getOptionalInt(instruction, "relu");
|
record.generic1 = getOptionalInt(instruction, "relu");
|
||||||
@@ -252,9 +248,7 @@ inline InstructionRecord makeInstructionRecord(const llvm::json::Object& instruc
|
|||||||
record.r2OrImm = getOptionalInt(instruction, "core");
|
record.r2OrImm = getOptionalInt(instruction, "core");
|
||||||
record.generic3 = getOptionalInt(instruction, "size");
|
record.generic3 = getOptionalInt(instruction, "size");
|
||||||
break;
|
break;
|
||||||
default:
|
default: record.r2OrImm = getOptionalInt(instruction, "rs2"); break;
|
||||||
record.r2OrImm = getOptionalInt(instruction, "rs2");
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (record.opcode != Opcode::mvmul && record.opcode != Opcode::setbw) {
|
if (record.opcode != Opcode::mvmul && record.opcode != Opcode::setbw) {
|
||||||
@@ -371,8 +365,7 @@ inline llvm::json::Object makeInstructionJson(const InstructionRecord& record) {
|
|||||||
break;
|
break;
|
||||||
case Opcode::wait:
|
case Opcode::wait:
|
||||||
case Opcode::sync:
|
case Opcode::sync:
|
||||||
case Opcode::nop:
|
case Opcode::nop: break;
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return instruction;
|
return instruction;
|
||||||
|
|||||||
@@ -367,7 +367,7 @@ void PimCodeGen::emitMemCopyOp(StringRef opName,
|
|||||||
instruction.generic1 = 0;
|
instruction.generic1 = 0;
|
||||||
instruction.generic2 = 0;
|
instruction.generic2 = 0;
|
||||||
instruction.generic3 = static_cast<int32_t>(size);
|
instruction.generic3 = static_cast<int32_t>(size);
|
||||||
(void)sizeFieldName;
|
(void) sizeFieldName;
|
||||||
emitInstruction(instruction);
|
emitInstruction(instruction);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
#include "src/Accelerators/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp"
|
#include "src/Accelerators/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp"
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
||||||
|
|
||||||
using namespace mlir;
|
using namespace mlir;
|
||||||
@@ -75,15 +74,13 @@ struct PackSpatialConcatInputsPattern final : OpRewritePattern<spatial::SpatConc
|
|||||||
return failure();
|
return failure();
|
||||||
|
|
||||||
auto outputType = cast<ShapedType>(concatOp.getOutput().getType());
|
auto outputType = cast<ShapedType>(concatOp.getOutput().getType());
|
||||||
auto newConcat = pim::PimConcatOp::create(rewriter,
|
auto newConcat = pim::PimConcatOp::create(
|
||||||
|
rewriter,
|
||||||
concatOp.getLoc(),
|
concatOp.getLoc(),
|
||||||
concatOp.getOutput().getType(),
|
concatOp.getOutput().getType(),
|
||||||
concatOp.getAxisAttr(),
|
concatOp.getAxisAttr(),
|
||||||
ValueRange(packedInputs),
|
ValueRange(packedInputs),
|
||||||
tensor::EmptyOp::create(rewriter,
|
tensor::EmptyOp::create(rewriter, concatOp.getLoc(), outputType.getShape(), outputType.getElementType())
|
||||||
concatOp.getLoc(),
|
|
||||||
outputType.getShape(),
|
|
||||||
outputType.getElementType())
|
|
||||||
.getResult());
|
.getResult());
|
||||||
rewriter.replaceOp(concatOp, newConcat.getOutput());
|
rewriter.replaceOp(concatOp, newConcat.getOutput());
|
||||||
return success();
|
return success();
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
|
||||||
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
||||||
|
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||||
#include "mlir/IR/PatternMatch.h"
|
#include "mlir/IR/PatternMatch.h"
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
|
|||||||
@@ -1,15 +1,15 @@
|
|||||||
#include "src/Accelerators/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp"
|
|
||||||
|
|
||||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||||
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
|
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
|
||||||
|
|
||||||
#include "llvm/ADT/DenseMap.h"
|
#include "llvm/ADT/DenseMap.h"
|
||||||
|
#include "llvm/ADT/STLExtras.h"
|
||||||
#include "llvm/ADT/SmallPtrSet.h"
|
#include "llvm/ADT/SmallPtrSet.h"
|
||||||
#include "llvm/ADT/SmallVector.h"
|
#include "llvm/ADT/SmallVector.h"
|
||||||
#include "llvm/ADT/STLExtras.h"
|
|
||||||
|
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
|
||||||
|
#include "src/Accelerators/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp"
|
||||||
|
|
||||||
using namespace mlir;
|
using namespace mlir;
|
||||||
|
|
||||||
namespace onnx_mlir {
|
namespace onnx_mlir {
|
||||||
@@ -29,9 +29,8 @@ static uint64_t getTypeSizeBytes(MemRefType type) {
|
|||||||
return static_cast<uint64_t>(type.getNumElements() * type.getElementTypeBitWidth() / 8);
|
return static_cast<uint64_t>(type.getNumElements() * type.getElementTypeBitWidth() / 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
static FailureOr<uint64_t> getLastUseInstruction(memref::AllocOp allocOp,
|
static FailureOr<uint64_t>
|
||||||
Block& body,
|
getLastUseInstruction(memref::AllocOp allocOp, Block& body, const DenseMap<Operation*, uint64_t>& opOrder) {
|
||||||
const DenseMap<Operation*, uint64_t>& opOrder) {
|
|
||||||
uint64_t endInstruction = opOrder.lookup(allocOp);
|
uint64_t endInstruction = opOrder.lookup(allocOp);
|
||||||
SmallPtrSet<Operation*, 16> visited;
|
SmallPtrSet<Operation*, 16> visited;
|
||||||
SmallVector<Value> pendingValues;
|
SmallVector<Value> pendingValues;
|
||||||
@@ -45,10 +44,9 @@ static FailureOr<uint64_t> getLastUseInstruction(memref::AllocOp allocOp,
|
|||||||
if (!visited.insert(user).second)
|
if (!visited.insert(user).second)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (isSupportedAliasOp(user)) {
|
if (isSupportedAliasOp(user))
|
||||||
for (Value result : user->getResults())
|
for (Value result : user->getResults())
|
||||||
pendingValues.push_back(result);
|
pendingValues.push_back(result);
|
||||||
}
|
|
||||||
|
|
||||||
if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(user)) {
|
if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(user)) {
|
||||||
for (OpResult result : user->getResults()) {
|
for (OpResult result : user->getResults()) {
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
|
|
||||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||||
#include "mlir/IR/PatternMatch.h"
|
#include "mlir/IR/PatternMatch.h"
|
||||||
#include "mlir/IR/Operation.h"
|
|
||||||
|
|
||||||
#include "llvm/ADT/SmallVector.h"
|
#include "llvm/ADT/SmallVector.h"
|
||||||
|
|
||||||
|
|||||||
+20
-19
@@ -45,9 +45,7 @@ struct CoalescingReportEntry {
|
|||||||
CoalescingReportRow row;
|
CoalescingReportRow row;
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::string formatMemory(uint64_t bytes) {
|
static std::string formatMemory(uint64_t bytes) { return formatReportMemory(bytes); }
|
||||||
return formatReportMemory(bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
static SmallVector<int32_t> getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
|
static SmallVector<int32_t> getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
|
||||||
auto coreIdsAttr = coreBatchOp->getAttrOfType<DenseI32ArrayAttr>(onnx_mlir::kCoreIdsAttrName);
|
auto coreIdsAttr = coreBatchOp->getAttrOfType<DenseI32ArrayAttr>(onnx_mlir::kCoreIdsAttrName);
|
||||||
@@ -58,9 +56,10 @@ static SmallVector<int32_t> getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
|
|||||||
static void printReportRow(raw_ostream& os, const CoalescingReportRow& row) {
|
static void printReportRow(raw_ostream& os, const CoalescingReportRow& row) {
|
||||||
llvm::SmallVector<ReportField, 4> fields = {
|
llvm::SmallVector<ReportField, 4> fields = {
|
||||||
{"Number of candidates", std::to_string(row.numCandidates)},
|
{"Number of candidates", std::to_string(row.numCandidates)},
|
||||||
{"Skipped allocations", std::to_string(row.numSkipped)},
|
{"Skipped allocations", std::to_string(row.numSkipped) },
|
||||||
{"Removed allocations", std::to_string(row.numRemoved)},
|
{"Removed allocations", std::to_string(row.numRemoved) },
|
||||||
{"Saved memory", formatMemory(row.savedBytes)}};
|
{"Saved memory", formatMemory(row.savedBytes) }
|
||||||
|
};
|
||||||
printReportFlatFields(os, fields);
|
printReportFlatFields(os, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -87,10 +86,12 @@ static void emitReport(ArrayRef<CoalescingReportEntry> entries) {
|
|||||||
totalRow.savedBytes += entryTotal.savedBytes;
|
totalRow.savedBytes += entryTotal.savedBytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
llvm::SmallVector<ReportField, 4> totalFields = {{"Number of candidates", std::to_string(totalRow.numCandidates)},
|
llvm::SmallVector<ReportField, 4> totalFields = {
|
||||||
{"Skipped allocations", std::to_string(totalRow.numSkipped)},
|
{"Number of candidates", std::to_string(totalRow.numCandidates)},
|
||||||
{"Removed allocations", std::to_string(totalRow.numRemoved)},
|
{"Skipped allocations", std::to_string(totalRow.numSkipped) },
|
||||||
{"Saved memory", formatMemory(totalRow.savedBytes)}};
|
{"Removed allocations", std::to_string(totalRow.numRemoved) },
|
||||||
|
{"Saved memory", formatMemory(totalRow.savedBytes) }
|
||||||
|
};
|
||||||
printReportTotalsBlock(os, totalFields);
|
printReportTotalsBlock(os, totalFields);
|
||||||
if (!entries.empty())
|
if (!entries.empty())
|
||||||
os << "\n";
|
os << "\n";
|
||||||
@@ -127,15 +128,17 @@ static void emitReport(ArrayRef<CoalescingReportEntry> entries) {
|
|||||||
if (sortedEntries[index].kind == CoalescingReportEntry::Kind::Batch) {
|
if (sortedEntries[index].kind == CoalescingReportEntry::Kind::Batch) {
|
||||||
llvm::SmallVector<ReportField, 4> perCoreFields = {
|
llvm::SmallVector<ReportField, 4> perCoreFields = {
|
||||||
{"Number of candidates", std::to_string(sortedEntries[index].row.numCandidates)},
|
{"Number of candidates", std::to_string(sortedEntries[index].row.numCandidates)},
|
||||||
{"Skipped allocations", std::to_string(sortedEntries[index].row.numSkipped)},
|
{"Skipped allocations", std::to_string(sortedEntries[index].row.numSkipped) },
|
||||||
{"Removed allocations", std::to_string(sortedEntries[index].row.numRemoved)},
|
{"Removed allocations", std::to_string(sortedEntries[index].row.numRemoved) },
|
||||||
{"Saved memory", formatMemory(sortedEntries[index].row.savedBytes)}};
|
{"Saved memory", formatMemory(sortedEntries[index].row.savedBytes) }
|
||||||
|
};
|
||||||
CoalescingReportRow totalRow = getTotalRow(sortedEntries[index]);
|
CoalescingReportRow totalRow = getTotalRow(sortedEntries[index]);
|
||||||
llvm::SmallVector<ReportField, 4> totalFields = {
|
llvm::SmallVector<ReportField, 4> totalFields = {
|
||||||
{"Number of candidates", std::to_string(totalRow.numCandidates)},
|
{"Number of candidates", std::to_string(totalRow.numCandidates)},
|
||||||
{"Skipped allocations", std::to_string(totalRow.numSkipped)},
|
{"Skipped allocations", std::to_string(totalRow.numSkipped) },
|
||||||
{"Removed allocations", std::to_string(totalRow.numRemoved)},
|
{"Removed allocations", std::to_string(totalRow.numRemoved) },
|
||||||
{"Saved memory", formatMemory(totalRow.savedBytes)}};
|
{"Saved memory", formatMemory(totalRow.savedBytes) }
|
||||||
|
};
|
||||||
printReportPerCoreAndTotalFields(os, perCoreFields, totalFields);
|
printReportPerCoreAndTotalFields(os, perCoreFields, totalFields);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@@ -196,8 +199,6 @@ struct StaticMemoryCoalescingPass : PassWrapper<StaticMemoryCoalescingPass, Oper
|
|||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
std::unique_ptr<Pass> createPimStaticMemoryCoalescingPass() {
|
std::unique_ptr<Pass> createPimStaticMemoryCoalescingPass() { return std::make_unique<StaticMemoryCoalescingPass>(); }
|
||||||
return std::make_unique<StaticMemoryCoalescingPass>();
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
} // namespace onnx_mlir
|
||||||
|
|||||||
@@ -818,13 +818,14 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llvm::SmallVector<ReportField, 6> totalFields = {{"Used cores", std::to_string(usedCpuCount)},
|
llvm::SmallVector<ReportField, 6> totalFields = {
|
||||||
{"Number of top-level compute ops", std::to_string(totalComputeOps)},
|
{"Used cores", std::to_string(usedCpuCount) },
|
||||||
{"Number of logical computes", std::to_string(totalLogicalComputes)},
|
{"Number of top-level compute ops", std::to_string(totalComputeOps) },
|
||||||
{"Number of top-level batch compute ops",
|
{"Number of logical computes", std::to_string(totalLogicalComputes) },
|
||||||
std::to_string(totalBatchComputeOps)},
|
{"Number of top-level batch compute ops", std::to_string(totalBatchComputeOps) },
|
||||||
{"Number of instructions", std::to_string(totalInstructionCount)},
|
{"Number of instructions", std::to_string(totalInstructionCount)},
|
||||||
{"Number of used crossbars", std::to_string(totalWeightCount)}};
|
{"Number of used crossbars", std::to_string(totalWeightCount) }
|
||||||
|
};
|
||||||
printReportTotalsBlock(os, totalFields);
|
printReportTotalsBlock(os, totalFields);
|
||||||
if (!collectedData.empty())
|
if (!collectedData.empty())
|
||||||
os << "\n";
|
os << "\n";
|
||||||
@@ -876,13 +877,15 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu
|
|||||||
|
|
||||||
llvm::SmallVector<ReportField, 3> perCoreFields = {
|
llvm::SmallVector<ReportField, 3> perCoreFields = {
|
||||||
{"Number of logical computes", std::to_string(perCoreLogicalComputeCount)},
|
{"Number of logical computes", std::to_string(perCoreLogicalComputeCount)},
|
||||||
{"Number of instructions", std::to_string(perCoreInstructionCount)},
|
{"Number of instructions", std::to_string(perCoreInstructionCount) },
|
||||||
{"Number of used crossbars", std::to_string(perCoreWeightCount)}};
|
{"Number of used crossbars", std::to_string(perCoreWeightCount) }
|
||||||
|
};
|
||||||
if (current.isRebatched) {
|
if (current.isRebatched) {
|
||||||
llvm::SmallVector<ReportField, 3> totalEntryFields = {
|
llvm::SmallVector<ReportField, 3> totalEntryFields = {
|
||||||
{"Number of logical computes", std::to_string(current.logicalComputeCount)},
|
{"Number of logical computes", std::to_string(current.logicalComputeCount)},
|
||||||
{"Number of instructions", std::to_string(totalEntryInstructionCount)},
|
{"Number of instructions", std::to_string(totalEntryInstructionCount) },
|
||||||
{"Number of used crossbars", std::to_string(current.weightCount)}};
|
{"Number of used crossbars", std::to_string(current.weightCount) }
|
||||||
|
};
|
||||||
printReportPerCoreAndTotalFields(os, perCoreFields, totalEntryFields);
|
printReportPerCoreAndTotalFields(os, perCoreFields, totalEntryFields);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@@ -1003,6 +1006,23 @@ public:
|
|||||||
DenseMap<Value, Value> externalInputMap;
|
DenseMap<Value, Value> externalInputMap;
|
||||||
DenseMap<Value, size_t> weightToIndex;
|
DenseMap<Value, size_t> weightToIndex;
|
||||||
};
|
};
|
||||||
|
struct RemoteSendInfo {
|
||||||
|
ChannelInfo channelInfo;
|
||||||
|
ComputeInstance consumer;
|
||||||
|
size_t inputIndex = 0;
|
||||||
|
size_t consumerOrder = 0;
|
||||||
|
size_t sourceOrder = 0;
|
||||||
|
};
|
||||||
|
struct RemoteReceiveEntry {
|
||||||
|
ChannelInfo channelInfo;
|
||||||
|
ComputeInstance consumer;
|
||||||
|
size_t inputIndex = 0;
|
||||||
|
size_t sourceOrder = 0;
|
||||||
|
};
|
||||||
|
auto getRemoteSendPairKey = [](const ChannelInfo& channelInfo) {
|
||||||
|
return (static_cast<uint64_t>(static_cast<uint32_t>(channelInfo.sourceCoreId)) << 32)
|
||||||
|
| static_cast<uint32_t>(channelInfo.targetCoreId);
|
||||||
|
};
|
||||||
|
|
||||||
auto getTaskInputs = [&](const ScheduledTask& task) {
|
auto getTaskInputs = [&](const ScheduledTask& task) {
|
||||||
SmallVector<Value> inputs;
|
SmallVector<Value> inputs;
|
||||||
@@ -1143,7 +1163,7 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
DenseMap<ComputeInstance, SmallVector<SmallVector<ChannelInfo>>> remoteSendsByTask;
|
DenseMap<ComputeInstance, SmallVector<SmallVector<RemoteSendInfo>>> remoteSendsByTask;
|
||||||
DenseMap<ComputeInstance, SmallVector<std::optional<ChannelInfo>>> remoteInputsByTask;
|
DenseMap<ComputeInstance, SmallVector<std::optional<ChannelInfo>>> remoteInputsByTask;
|
||||||
DenseMap<size_t, SmallVector<Value>> cpuExternalInputs;
|
DenseMap<size_t, SmallVector<Value>> cpuExternalInputs;
|
||||||
DenseMap<size_t, SmallVector<Value>> cpuWeights;
|
DenseMap<size_t, SmallVector<Value>> cpuWeights;
|
||||||
@@ -1176,7 +1196,7 @@ public:
|
|||||||
auto& perResultChannels = remoteSendsByTask[producerRef->instance];
|
auto& perResultChannels = remoteSendsByTask[producerRef->instance];
|
||||||
if (perResultChannels.empty())
|
if (perResultChannels.empty())
|
||||||
perResultChannels.resize(getTaskOutputTypes(producerIt->second).size());
|
perResultChannels.resize(getTaskOutputTypes(producerIt->second).size());
|
||||||
perResultChannels[producerRef->resultIndex].push_back(info);
|
perResultChannels[producerRef->resultIndex].push_back({info, task.key, inputIndex, task.order, 0});
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1201,6 +1221,79 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DenseSet<uint64_t> pairsNeedingReceiveReorder;
|
||||||
|
for (size_t cpu : orderedCpus) {
|
||||||
|
DenseMap<uint64_t, size_t> nextSourceOrderByPair;
|
||||||
|
DenseMap<uint64_t, size_t> lastConsumerOrderByPair;
|
||||||
|
for (const ScheduledTask& task : tasksByCpu[cpu]) {
|
||||||
|
auto sendsIt = remoteSendsByTask.find(task.key);
|
||||||
|
if (sendsIt == remoteSendsByTask.end())
|
||||||
|
continue;
|
||||||
|
for (auto& sendInfos : sendsIt->second) {
|
||||||
|
for (RemoteSendInfo& sendInfo : sendInfos) {
|
||||||
|
uint64_t pairKey = getRemoteSendPairKey(sendInfo.channelInfo);
|
||||||
|
sendInfo.sourceOrder = nextSourceOrderByPair[pairKey]++;
|
||||||
|
auto [it, inserted] = lastConsumerOrderByPair.try_emplace(pairKey, sendInfo.consumerOrder);
|
||||||
|
if (!inserted) {
|
||||||
|
if (sendInfo.consumerOrder < it->second)
|
||||||
|
pairsNeedingReceiveReorder.insert(pairKey);
|
||||||
|
it->second = sendInfo.consumerOrder;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DenseMap<uint64_t, SmallVector<RemoteSendInfo*>> reorderedSendsByPair;
|
||||||
|
for (auto& taskSends : remoteSendsByTask) {
|
||||||
|
for (auto& sendInfos : taskSends.second) {
|
||||||
|
for (RemoteSendInfo& sendInfo : sendInfos) {
|
||||||
|
uint64_t pairKey = getRemoteSendPairKey(sendInfo.channelInfo);
|
||||||
|
if (pairsNeedingReceiveReorder.contains(pairKey))
|
||||||
|
reorderedSendsByPair[pairKey].push_back(&sendInfo);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto& pairSends : reorderedSendsByPair) {
|
||||||
|
llvm::stable_sort(pairSends.second, [](const RemoteSendInfo* lhs, const RemoteSendInfo* rhs) {
|
||||||
|
if (lhs->sourceOrder != rhs->sourceOrder)
|
||||||
|
return lhs->sourceOrder < rhs->sourceOrder;
|
||||||
|
return lhs->channelInfo.channelId < rhs->channelInfo.channelId;
|
||||||
|
});
|
||||||
|
for (RemoteSendInfo* sendInfo : pairSends.second) {
|
||||||
|
int64_t channelId = nextChannelId++;
|
||||||
|
sendInfo->channelInfo.channelId = channelId;
|
||||||
|
auto remoteInputsIt = remoteInputsByTask.find(sendInfo->consumer);
|
||||||
|
assert(remoteInputsIt != remoteInputsByTask.end() && "missing remote input for reordered send");
|
||||||
|
assert(sendInfo->inputIndex < remoteInputsIt->second.size() && "remote input index out of range");
|
||||||
|
assert(remoteInputsIt->second[sendInfo->inputIndex] && "missing reordered remote input channel");
|
||||||
|
remoteInputsIt->second[sendInfo->inputIndex]->channelId = channelId;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DenseMap<size_t, DenseMap<uint64_t, SmallVector<RemoteReceiveEntry>>> receiveQueuesByCpu;
|
||||||
|
for (auto& taskSends : remoteSendsByTask) {
|
||||||
|
for (const auto& sendInfos : taskSends.second) {
|
||||||
|
for (const RemoteSendInfo& sendInfo : sendInfos) {
|
||||||
|
uint64_t pairKey = getRemoteSendPairKey(sendInfo.channelInfo);
|
||||||
|
if (!pairsNeedingReceiveReorder.contains(pairKey))
|
||||||
|
continue;
|
||||||
|
size_t targetCpu = static_cast<size_t>(sendInfo.channelInfo.targetCoreId - 1);
|
||||||
|
receiveQueuesByCpu[targetCpu][pairKey].push_back(
|
||||||
|
{sendInfo.channelInfo, sendInfo.consumer, sendInfo.inputIndex, sendInfo.sourceOrder});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto& cpuQueues : receiveQueuesByCpu) {
|
||||||
|
for (auto& pairQueue : cpuQueues.second) {
|
||||||
|
llvm::stable_sort(pairQueue.second, [](const RemoteReceiveEntry& lhs, const RemoteReceiveEntry& rhs) {
|
||||||
|
if (lhs.sourceOrder != rhs.sourceOrder)
|
||||||
|
return lhs.sourceOrder < rhs.sourceOrder;
|
||||||
|
return lhs.channelInfo.channelId < rhs.channelInfo.channelId;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
auto returnOp = cast<func::ReturnOp>(func.getBody().front().getTerminator());
|
auto returnOp = cast<func::ReturnOp>(func.getBody().front().getTerminator());
|
||||||
IRRewriter rewriter(&getContext());
|
IRRewriter rewriter(&getContext());
|
||||||
DenseMap<size_t, CpuProgram> cpuPrograms;
|
DenseMap<size_t, CpuProgram> cpuPrograms;
|
||||||
@@ -1255,6 +1348,59 @@ public:
|
|||||||
CpuProgram& program = cpuPrograms[cpu];
|
CpuProgram& program = cpuPrograms[cpu];
|
||||||
IRRewriter cpuRewriter(&getContext());
|
IRRewriter cpuRewriter(&getContext());
|
||||||
cpuRewriter.setInsertionPointToEnd(program.block);
|
cpuRewriter.setInsertionPointToEnd(program.block);
|
||||||
|
DenseMap<uint64_t, size_t> receiveQueueIndices;
|
||||||
|
DenseMap<ComputeInstance, SmallVector<Value>> preReceivedInputsByTask;
|
||||||
|
|
||||||
|
auto lookupPreReceivedInput = [&](ComputeInstance consumer, size_t inputIndex) -> std::optional<Value> {
|
||||||
|
auto inputsIt = preReceivedInputsByTask.find(consumer);
|
||||||
|
if (inputsIt == preReceivedInputsByTask.end() || inputsIt->second.size() <= inputIndex)
|
||||||
|
return std::nullopt;
|
||||||
|
Value value = inputsIt->second[inputIndex];
|
||||||
|
if (!value)
|
||||||
|
return std::nullopt;
|
||||||
|
return value;
|
||||||
|
};
|
||||||
|
|
||||||
|
auto receiveThroughInput = [&](const ChannelInfo& requestedChannelInfo,
|
||||||
|
ComputeInstance requestedConsumer,
|
||||||
|
size_t requestedInputIndex) -> std::optional<Value> {
|
||||||
|
uint64_t pairKey = getRemoteSendPairKey(requestedChannelInfo);
|
||||||
|
auto cpuQueuesIt = receiveQueuesByCpu.find(cpu);
|
||||||
|
if (cpuQueuesIt == receiveQueuesByCpu.end())
|
||||||
|
return std::nullopt;
|
||||||
|
auto queueIt = cpuQueuesIt->second.find(pairKey);
|
||||||
|
if (queueIt == cpuQueuesIt->second.end())
|
||||||
|
return std::nullopt;
|
||||||
|
|
||||||
|
auto& queue = queueIt->second;
|
||||||
|
size_t& queueIndex = receiveQueueIndices[pairKey];
|
||||||
|
while (queueIndex < queue.size()) {
|
||||||
|
const RemoteReceiveEntry& entry = queue[queueIndex++];
|
||||||
|
auto consumerTaskIt = taskByKey.find(entry.consumer);
|
||||||
|
if (consumerTaskIt == taskByKey.end())
|
||||||
|
return std::nullopt;
|
||||||
|
SmallVector<Value> consumerInputs = getTaskInputs(consumerTaskIt->second);
|
||||||
|
if (consumerInputs.size() <= entry.inputIndex)
|
||||||
|
return std::nullopt;
|
||||||
|
Type inputType = consumerInputs[entry.inputIndex].getType();
|
||||||
|
auto receive =
|
||||||
|
spatial::SpatChannelReceiveOp::create(cpuRewriter,
|
||||||
|
loc,
|
||||||
|
inputType,
|
||||||
|
cpuRewriter.getI64IntegerAttr(entry.channelInfo.channelId),
|
||||||
|
cpuRewriter.getI32IntegerAttr(entry.channelInfo.sourceCoreId),
|
||||||
|
cpuRewriter.getI32IntegerAttr(entry.channelInfo.targetCoreId));
|
||||||
|
|
||||||
|
auto& receivedInputs = preReceivedInputsByTask[entry.consumer];
|
||||||
|
if (receivedInputs.size() <= entry.inputIndex)
|
||||||
|
receivedInputs.resize(entry.inputIndex + 1);
|
||||||
|
receivedInputs[entry.inputIndex] = receive.getResult();
|
||||||
|
|
||||||
|
if (entry.consumer == requestedConsumer && entry.inputIndex == requestedInputIndex)
|
||||||
|
return receive.getResult();
|
||||||
|
}
|
||||||
|
return std::nullopt;
|
||||||
|
};
|
||||||
|
|
||||||
for (const ScheduledTask& task : tasksByCpu[cpu]) {
|
for (const ScheduledTask& task : tasksByCpu[cpu]) {
|
||||||
SmallVector<Value> taskInputs = getTaskInputs(task);
|
SmallVector<Value> taskInputs = getTaskInputs(task);
|
||||||
@@ -1284,6 +1430,24 @@ public:
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const ChannelInfo& channelInfo = *remoteInputsIt->second[inputIndex];
|
const ChannelInfo& channelInfo = *remoteInputsIt->second[inputIndex];
|
||||||
|
uint64_t pairKey = getRemoteSendPairKey(channelInfo);
|
||||||
|
if (pairsNeedingReceiveReorder.contains(pairKey)) {
|
||||||
|
if (std::optional<Value> preReceived = lookupPreReceivedInput(task.key, inputIndex)) {
|
||||||
|
resolvedInputs.push_back(*preReceived);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
std::optional<Value> received = receiveThroughInput(channelInfo, task.key, inputIndex);
|
||||||
|
if (!received) {
|
||||||
|
task.sourceOp->emitOpError("failed to materialize reordered remote receive")
|
||||||
|
<< " consumerCpu=" << cpu << " consumerSlot=" << task.slot
|
||||||
|
<< " sourceCoreId=" << channelInfo.sourceCoreId << " targetCoreId=" << channelInfo.targetCoreId
|
||||||
|
<< " channelId=" << channelInfo.channelId;
|
||||||
|
signalPassFailure();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
resolvedInputs.push_back(*received);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
auto receive =
|
auto receive =
|
||||||
spatial::SpatChannelReceiveOp::create(cpuRewriter,
|
spatial::SpatChannelReceiveOp::create(cpuRewriter,
|
||||||
loc,
|
loc,
|
||||||
@@ -1367,16 +1531,17 @@ public:
|
|||||||
if (sendInfos.empty())
|
if (sendInfos.empty())
|
||||||
continue;
|
continue;
|
||||||
Value producedValue = taskYieldValues[resultIndex];
|
Value producedValue = taskYieldValues[resultIndex];
|
||||||
for (const ChannelInfo& sendInfo : sendInfos)
|
for (const RemoteSendInfo& sendInfo : sendInfos) {
|
||||||
spatial::SpatChannelSendOp::create(cpuRewriter,
|
spatial::SpatChannelSendOp::create(cpuRewriter,
|
||||||
loc,
|
loc,
|
||||||
cpuRewriter.getI64IntegerAttr(sendInfo.channelId),
|
cpuRewriter.getI64IntegerAttr(sendInfo.channelInfo.channelId),
|
||||||
cpuRewriter.getI32IntegerAttr(sendInfo.sourceCoreId),
|
cpuRewriter.getI32IntegerAttr(sendInfo.channelInfo.sourceCoreId),
|
||||||
cpuRewriter.getI32IntegerAttr(sendInfo.targetCoreId),
|
cpuRewriter.getI32IntegerAttr(sendInfo.channelInfo.targetCoreId),
|
||||||
producedValue);
|
producedValue);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
SmallVector<Value> yieldValues;
|
SmallVector<Value> yieldValues;
|
||||||
yieldValues.reserve(cpuExternalOutputs[cpu].size());
|
yieldValues.reserve(cpuExternalOutputs[cpu].size());
|
||||||
@@ -1666,16 +1831,14 @@ private:
|
|||||||
IRRewriter rewriter(context);
|
IRRewriter rewriter(context);
|
||||||
|
|
||||||
rewriter.setInsertionPointAfter(producerOp);
|
rewriter.setInsertionPointAfter(producerOp);
|
||||||
auto savedSendInsertPoint = rewriter.saveInsertionPoint();
|
auto insertNew = [this, context, loc, computeValueResults, producerCpu](size_t resultIndex, size_t targetCpu) {
|
||||||
auto insertNew = [this, savedSendInsertPoint, context, loc, computeValueResults, producerCpu](size_t resultIndex,
|
|
||||||
size_t targetCpu) {
|
|
||||||
auto channelId = nextChannelId++;
|
auto channelId = nextChannelId++;
|
||||||
LazyInsertComputeResult::ChannelInfo channelInfo {
|
LazyInsertComputeResult::ChannelInfo channelInfo {
|
||||||
channelId, getPhysicalCoreId(producerCpu), getPhysicalCoreId(targetCpu)};
|
channelId, getPhysicalCoreId(producerCpu), getPhysicalCoreId(targetCpu)};
|
||||||
auto insertVal = [&context, loc, computeValueResults, channelInfo, resultIndex, savedSendInsertPoint](
|
auto insertVal =
|
||||||
mlir::IRRewriter::InsertPoint) {
|
[&context, loc, computeValueResults, channelInfo, resultIndex](mlir::IRRewriter::InsertPoint insertPoint) {
|
||||||
IRRewriter rewriter(context);
|
IRRewriter rewriter(context);
|
||||||
rewriter.restoreInsertionPoint(savedSendInsertPoint);
|
rewriter.restoreInsertionPoint(insertPoint);
|
||||||
spatial::SpatChannelSendOp::create(rewriter,
|
spatial::SpatChannelSendOp::create(rewriter,
|
||||||
loc,
|
loc,
|
||||||
rewriter.getI64IntegerAttr(channelInfo.channelId),
|
rewriter.getI64IntegerAttr(channelInfo.channelId),
|
||||||
|
|||||||
@@ -10,8 +10,6 @@
|
|||||||
#include "llvm/ADT/STLExtras.h"
|
#include "llvm/ADT/STLExtras.h"
|
||||||
#include "llvm/ADT/SmallVector.h"
|
#include "llvm/ADT/SmallVector.h"
|
||||||
|
|
||||||
#include <tuple>
|
|
||||||
|
|
||||||
#include "RegularOpCompaction.hpp"
|
#include "RegularOpCompaction.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp"
|
#include "src/Accelerators/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
@@ -340,7 +338,18 @@ void compactScalarChannelRuns(func::FuncOp funcOp, int64_t& nextChannelId) {
|
|||||||
++runIt;
|
++runIt;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (run.size() > 1) {
|
bool hasRepeatedEndpoint = false;
|
||||||
|
for (size_t lhs = 0; lhs < run.size() && !hasRepeatedEndpoint; ++lhs) {
|
||||||
|
for (size_t rhs = lhs + 1; rhs < run.size(); ++rhs) {
|
||||||
|
if (run[lhs].getSourceCoreId() == run[rhs].getSourceCoreId()
|
||||||
|
&& run[lhs].getTargetCoreId() == run[rhs].getTargetCoreId()) {
|
||||||
|
hasRepeatedEndpoint = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (run.size() > 1 && !hasRepeatedEndpoint) {
|
||||||
struct ReceiveEntry {
|
struct ReceiveEntry {
|
||||||
spatial::SpatChannelReceiveOp op;
|
spatial::SpatChannelReceiveOp op;
|
||||||
size_t originalIndex = 0;
|
size_t originalIndex = 0;
|
||||||
@@ -352,10 +361,6 @@ void compactScalarChannelRuns(func::FuncOp funcOp, int64_t& nextChannelId) {
|
|||||||
sortedEntries.reserve(run.size());
|
sortedEntries.reserve(run.size());
|
||||||
for (auto [originalIndex, op] : llvm::enumerate(run))
|
for (auto [originalIndex, op] : llvm::enumerate(run))
|
||||||
sortedEntries.push_back({op, originalIndex, op.getSourceCoreId(), op.getTargetCoreId(), op.getChannelId()});
|
sortedEntries.push_back({op, originalIndex, op.getSourceCoreId(), op.getTargetCoreId(), op.getChannelId()});
|
||||||
llvm::stable_sort(sortedEntries, [](const ReceiveEntry& lhs, const ReceiveEntry& rhs) {
|
|
||||||
return std::tuple(lhs.sourceCoreId, lhs.targetCoreId, lhs.channelId)
|
|
||||||
< std::tuple(rhs.sourceCoreId, rhs.targetCoreId, rhs.channelId);
|
|
||||||
});
|
|
||||||
|
|
||||||
SmallVector<int64_t> channelIds;
|
SmallVector<int64_t> channelIds;
|
||||||
SmallVector<int32_t> sourceCoreIds;
|
SmallVector<int32_t> sourceCoreIds;
|
||||||
@@ -436,10 +441,6 @@ void compactScalarChannelRuns(func::FuncOp funcOp, int64_t& nextChannelId) {
|
|||||||
sortedEntries.reserve(run.size());
|
sortedEntries.reserve(run.size());
|
||||||
for (auto op : run)
|
for (auto op : run)
|
||||||
sortedEntries.push_back({op, op.getSourceCoreId(), op.getTargetCoreId(), op.getChannelId()});
|
sortedEntries.push_back({op, op.getSourceCoreId(), op.getTargetCoreId(), op.getChannelId()});
|
||||||
llvm::stable_sort(sortedEntries, [](const SendEntry& lhs, const SendEntry& rhs) {
|
|
||||||
return std::tuple(lhs.sourceCoreId, lhs.targetCoreId, lhs.channelId)
|
|
||||||
< std::tuple(rhs.sourceCoreId, rhs.targetCoreId, rhs.channelId);
|
|
||||||
});
|
|
||||||
|
|
||||||
SmallVector<int64_t> channelIds;
|
SmallVector<int64_t> channelIds;
|
||||||
SmallVector<int32_t> sourceCoreIds;
|
SmallVector<int32_t> sourceCoreIds;
|
||||||
|
|||||||
@@ -66,8 +66,10 @@ static Value buildSubviewChunk(const StaticSubviewInfo& info,
|
|||||||
return memref::SubViewOp::create(rewriter, loc, info.source, chunkOffsets, chunkSizes, chunkStrides);
|
return memref::SubViewOp::create(rewriter, loc, info.source, chunkOffsets, chunkSizes, chunkStrides);
|
||||||
}
|
}
|
||||||
|
|
||||||
static SmallVector<Value>
|
static SmallVector<Value> delinearizeIndexValue(Value linearIndex,
|
||||||
delinearizeIndexValue(Value linearIndex, ArrayRef<int64_t> shape, ArrayRef<int64_t> strides, PatternRewriter& rewriter) {
|
ArrayRef<int64_t> shape,
|
||||||
|
ArrayRef<int64_t> strides,
|
||||||
|
PatternRewriter& rewriter) {
|
||||||
SmallVector<Value> indices;
|
SmallVector<Value> indices;
|
||||||
indices.reserve(shape.size());
|
indices.reserve(shape.size());
|
||||||
|
|
||||||
@@ -112,7 +114,8 @@ static Value buildDynamicSubviewChunk(const StaticSubviewInfo& info,
|
|||||||
assert(info.strides[dim] == 1 && "loop-based subview rewrite requires unit strides");
|
assert(info.strides[dim] == 1 && "loop-based subview rewrite requires unit strides");
|
||||||
chunkOffsets.push_back(addDynamicOffset(info.offsets[dim], outerIndices[dim], rewriter));
|
chunkOffsets.push_back(addDynamicOffset(info.offsets[dim], outerIndices[dim], rewriter));
|
||||||
chunkSizes.push_back(rewriter.getIndexAttr(1));
|
chunkSizes.push_back(rewriter.getIndexAttr(1));
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
chunkOffsets.push_back(info.offsets[dim]);
|
chunkOffsets.push_back(info.offsets[dim]);
|
||||||
chunkSizes.push_back(rewriter.getIndexAttr(info.sizes.back()));
|
chunkSizes.push_back(rewriter.getIndexAttr(info.sizes.back()));
|
||||||
}
|
}
|
||||||
@@ -122,11 +125,8 @@ static Value buildDynamicSubviewChunk(const StaticSubviewInfo& info,
|
|||||||
return memref::SubViewOp::create(rewriter, loc, info.source, chunkOffsets, chunkSizes, chunkStrides);
|
return memref::SubViewOp::create(rewriter, loc, info.source, chunkOffsets, chunkSizes, chunkStrides);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Value buildContiguousChunk(Value source,
|
static Value buildContiguousChunk(
|
||||||
ArrayRef<int64_t> copyShape,
|
Value source, ArrayRef<int64_t> copyShape, ArrayRef<Value> outerIndices, Location loc, PatternRewriter& rewriter) {
|
||||||
ArrayRef<Value> outerIndices,
|
|
||||||
Location loc,
|
|
||||||
PatternRewriter& rewriter) {
|
|
||||||
SmallVector<OpFoldResult> chunkOffsets;
|
SmallVector<OpFoldResult> chunkOffsets;
|
||||||
SmallVector<OpFoldResult> chunkSizes;
|
SmallVector<OpFoldResult> chunkSizes;
|
||||||
SmallVector<OpFoldResult> chunkStrides;
|
SmallVector<OpFoldResult> chunkStrides;
|
||||||
@@ -203,7 +203,8 @@ static LogicalResult rewriteSubviewCopyLikeOp(CopyOp copyOp,
|
|||||||
rewriter.setInsertionPointToStart(loop.getBody());
|
rewriter.setInsertionPointToStart(loop.getBody());
|
||||||
|
|
||||||
SmallVector<Value> outerIndices =
|
SmallVector<Value> outerIndices =
|
||||||
outerShape.empty() ? SmallVector<Value> {} : delinearizeIndexValue(loop.getInductionVar(), outerShape, outerStrides, rewriter);
|
outerShape.empty() ? SmallVector<Value> {}
|
||||||
|
: delinearizeIndexValue(loop.getInductionVar(), outerShape, outerStrides, rewriter);
|
||||||
Value chunkDst = splitDst ? buildDynamicSubviewChunk(*dstSubview, outerIndices, copyOp.getLoc(), rewriter)
|
Value chunkDst = splitDst ? buildDynamicSubviewChunk(*dstSubview, outerIndices, copyOp.getLoc(), rewriter)
|
||||||
: buildContiguousChunk(dst, copyShape, outerIndices, copyOp.getLoc(), rewriter);
|
: buildContiguousChunk(dst, copyShape, outerIndices, copyOp.getLoc(), rewriter);
|
||||||
Value chunkSrc = splitSrc ? buildDynamicSubviewChunk(*srcSubview, outerIndices, copyOp.getLoc(), rewriter)
|
Value chunkSrc = splitSrc ? buildDynamicSubviewChunk(*srcSubview, outerIndices, copyOp.getLoc(), rewriter)
|
||||||
|
|||||||
@@ -6,10 +6,10 @@
|
|||||||
|
|
||||||
#include "llvm/ADT/STLExtras.h"
|
#include "llvm/ADT/STLExtras.h"
|
||||||
|
|
||||||
|
#include "src/Accelerators/PIM/Common/IR/SubviewUtils.hpp"
|
||||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Accelerators/PIM/Common/IR/SubviewUtils.hpp"
|
|
||||||
|
|
||||||
using namespace mlir;
|
using namespace mlir;
|
||||||
|
|
||||||
|
|||||||
@@ -60,6 +60,7 @@ def main():
|
|||||||
ap.add_argument("--simulator-dir", default=None,
|
ap.add_argument("--simulator-dir", default=None,
|
||||||
help="Path to pim-simulator crate root (default: auto-detected relative to script).")
|
help="Path to pim-simulator crate root (default: auto-detected relative to script).")
|
||||||
ap.add_argument("--threshold", type=float, default=1e-3, help="Max allowed diff per output element.")
|
ap.add_argument("--threshold", type=float, default=1e-3, help="Max allowed diff per output element.")
|
||||||
|
ap.add_argument("--seed", type=int, default=0, help="RNG seed for generated validation inputs.")
|
||||||
ap.add_argument("--crossbar-size", type=int, default=64)
|
ap.add_argument("--crossbar-size", type=int, default=64)
|
||||||
ap.add_argument("--crossbar-count", type=int, default=8)
|
ap.add_argument("--crossbar-count", type=int, default=8)
|
||||||
ap.add_argument("--core-count", type=int, default=None,
|
ap.add_argument("--core-count", type=int, default=None,
|
||||||
@@ -117,6 +118,7 @@ def main():
|
|||||||
onnx_path, a.raptor_path, a.onnx_include_dir, simulator_dir,
|
onnx_path, a.raptor_path, a.onnx_include_dir, simulator_dir,
|
||||||
crossbar_size=a.crossbar_size, crossbar_count=a.crossbar_count, core_count=a.core_count,
|
crossbar_size=a.crossbar_size, crossbar_count=a.crossbar_count, core_count=a.core_count,
|
||||||
threshold=a.threshold,
|
threshold=a.threshold,
|
||||||
|
seed=a.seed,
|
||||||
reporter=reporter,
|
reporter=reporter,
|
||||||
model_index=index,
|
model_index=index,
|
||||||
model_total=len(onnx_files),
|
model_total=len(onnx_files),
|
||||||
|
|||||||
@@ -268,7 +268,7 @@ def validate_outputs(sim_arrays, runner_out_dir, outputs_descriptor, threshold=1
|
|||||||
|
|
||||||
def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
|
def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
|
||||||
simulator_dir, crossbar_size=64, crossbar_count=8, core_count=None, threshold=1e-3,
|
simulator_dir, crossbar_size=64, crossbar_count=8, core_count=None, threshold=1e-3,
|
||||||
reporter=None, model_index=1, model_total=1, verbose=False):
|
seed=0, reporter=None, model_index=1, model_total=1, verbose=False):
|
||||||
network_onnx_path = Path(network_onnx_path).resolve()
|
network_onnx_path = Path(network_onnx_path).resolve()
|
||||||
raptor_path = Path(raptor_path).resolve()
|
raptor_path = Path(raptor_path).resolve()
|
||||||
onnx_include_dir = Path(onnx_include_dir).resolve()
|
onnx_include_dir = Path(onnx_include_dir).resolve()
|
||||||
@@ -306,7 +306,7 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
|
|||||||
|
|
||||||
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Generate Inputs")
|
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Generate Inputs")
|
||||||
inputs_descriptor, outputs_descriptor = onnx_io(network_onnx_path)
|
inputs_descriptor, outputs_descriptor = onnx_io(network_onnx_path)
|
||||||
inputs_list, _inputs_dict = gen_random_inputs(inputs_descriptor)
|
inputs_list, _inputs_dict = gen_random_inputs(inputs_descriptor, seed=seed)
|
||||||
flags, _files = save_inputs_to_files(network_onnx_path, inputs_list, out_dir=workspace_dir / "inputs")
|
flags, _files = save_inputs_to_files(network_onnx_path, inputs_list, out_dir=workspace_dir / "inputs")
|
||||||
print_info(reporter, f"Saved {len(inputs_list)} input file(s) to {workspace_dir / 'inputs'}")
|
print_info(reporter, f"Saved {len(inputs_list)} input file(s) to {workspace_dir / 'inputs'}")
|
||||||
reporter.advance()
|
reporter.advance()
|
||||||
|
|||||||
Reference in New Issue
Block a user