Merge branch 'main' of chef.heaplab.deib.polimi.it:nnicolosi/Raptor into main
All checks were successful
Validate Operations / validate-operations (push) Successful in 16m23s

This commit is contained in:
ilgeco
2026-04-09 19:27:05 +02:00
43 changed files with 890 additions and 94 deletions

View File

@@ -52,6 +52,7 @@ static NAMES: LazyLock<HashMap<usize, &'static str>> = LazyLock::new(|| {
add_name_simd!(hash, vrelu); add_name_simd!(hash, vrelu);
add_name_simd!(hash, vtanh); add_name_simd!(hash, vtanh);
add_name_simd!(hash, vsigm); add_name_simd!(hash, vsigm);
add_name_simd!(hash, vsoftmax);
add_name!(hash, vmv); add_name!(hash, vmv);
add_name!(hash, vrsu); add_name!(hash, vrsu);
add_name!(hash, vrsl); add_name!(hash, vrsl);
@@ -177,6 +178,7 @@ static SIMD: LazyLock<HashMap<usize, HashMap<(usize, usize), InstructionType>>>
add_simd_to_map!(storage, vrelu); add_simd_to_map!(storage, vrelu);
add_simd_to_map!(storage, vtanh); add_simd_to_map!(storage, vtanh);
add_simd_to_map!(storage, vsigm); add_simd_to_map!(storage, vsigm);
add_simd_to_map!(storage, vsoftmax);
add_simd_to_map!(storage, mvmul); add_simd_to_map!(storage, mvmul);
storage storage
}); });
@@ -626,6 +628,46 @@ where
Ok(InstructionStatus::Completed) Ok(InstructionStatus::Completed)
} }
/// Placeholder entry registered in the instruction tables for `vsoftmax`.
///
/// Never meant to execute: the dispatcher is expected to substitute the
/// monomorphized generic version (`vsoftmax_impl::<F, T>`) instead, so
/// reaching this body indicates a wiring bug and aborts loudly.
pub fn vsoftmax(cores: &mut CPU, data: InstructionData) -> Result<InstructionStatus> {
    panic!("You are calling a placeholder, the real call is the generic version");
}
/// Generic implementation of the `vsoftmax` SIMD instruction.
///
/// Reads `imm_len` elements of compute type `F` from the address held in
/// `r1`, computes a numerically stable softmax over them, upcasts the result
/// to the memory element type `T`, and stores it at the address held in `rd`.
/// Both addresses are adjusted by the instruction's optional offset field.
///
/// # Errors
/// Propagates load/store reservation errors and fails on empty input or a
/// non-positive normalization sum.
pub(super) fn vsoftmax_impl<F, T>(cores: &mut CPU, data: InstructionData) -> Result<InstructionStatus>
where
    [F]: UpcastSlice<T>,
    T: UpcastDestTraits<T> + MemoryStorable,
    F: UpcastDestTraits<F> + MemoryStorable + From<f32>,
{
    // Tracing hooks bracket the whole instruction.
    TRACER.lock().unwrap().pre_vsoftmax::<F,T>(cores, data);
    // NOTE(review): `r2` is decoded but unused by vsoftmax (single-source op).
    let (core_indx, rd, r1, r2, imm_len, offset_select, offset_value) =
        data.get_core_rd_r1_r2_immlen_offset();
    let core = cores.core(core_indx);
    // Source (r1) and destination (rd) base addresses, then apply the offset.
    let r1_val = core.register(r1);
    let rd_val = core.register(rd);
    let r1_val = add_offset_r1(r1_val, offset_select, offset_value);
    let rd_val = add_offset_rd(rd_val, offset_select, offset_value);
    // Load `imm_len` input elements as `F`.
    let loads = core.reserve_load(r1_val, imm_len)?.execute_load::<F>()?;
    let load1 = loads[0];
    ensure!(!load1.is_empty(), "vsoftmax does not support empty vectors");
    // Standard numerically-stable softmax: subtract the maximum before
    // exponentiating so large inputs cannot overflow `exp`.
    // `reduce(...).unwrap()` is safe: emptiness was rejected above.
    let max_val = load1
        .iter()
        .copied()
        .reduce(|a, b| if a > b { a } else { b })
        .unwrap();
    let exp_values: Vec<F> = load1.iter().map(|&a| (a - max_val).exp()).collect();
    let sum = exp_values
        .iter()
        .copied()
        .reduce(|a, b| a + b)
        .unwrap();
    // `exp` is strictly positive for finite inputs, so a non-positive sum
    // signals NaN or a degenerate input; refuse to divide by it.
    ensure!(sum > 0.0.into(), "vsoftmax normalization sum must be positive");
    let res: Vec<F> = exp_values.iter().map(|&a| a / sum).collect();
    // Upcast F -> T (the storage element type) and write the result back.
    let res_up: Cow<[T]> = res.as_slice().up();
    core.execute_store(rd_val, res_up.as_ref());
    TRACER.lock().unwrap().post_vsoftmax::<F,T>(cores, data);
    Ok(InstructionStatus::Completed)
}
pub fn vmv(cores: &mut CPU, data: InstructionData) -> Result<InstructionStatus> { pub fn vmv(cores: &mut CPU, data: InstructionData) -> Result<InstructionStatus> {
todo!() todo!()
} }

View File

@@ -40,6 +40,7 @@ static SIMD: LazyLock<HashMap<String, FunctorType>> = LazyLock::new(|| {
add_to_json_map!(storage, vrelu); add_to_json_map!(storage, vrelu);
add_to_json_map!(storage, vtanh); add_to_json_map!(storage, vtanh);
add_to_json_map!(storage, vsigm); add_to_json_map!(storage, vsigm);
add_to_json_map!(storage, vsoftmax);
add_to_json_map!(storage, vmv); add_to_json_map!(storage, vmv);
add_to_json_map!(storage, vrsu); add_to_json_map!(storage, vrsu);
add_to_json_map!(storage, vrsl); add_to_json_map!(storage, vrsl);
@@ -417,6 +418,27 @@ fn json_to_vsigm(
Ok(()) Ok(())
} }
/// Decodes a `vsoftmax` JSON instruction object and appends the resulting
/// instruction to `inst_builder`.
///
/// Expected object fields: `op` (must equal `"vsoftmax"`), `rd`, `rs1`,
/// `len`, and `offset`. Like the sibling `json_to_*` decoders, malformed
/// input panics (via `expect`/`assert_eq`/`unwrap`) rather than returning
/// an error.
fn json_to_vsoftmax(
    inst_builder: &mut InstructionsBuilder,
    inst_data_builder: &mut InstructionDataBuilder,
    json: &Value,
) -> Result<()> {
    let json = json.as_object().expect("Not an object");
    assert_eq!("vsoftmax", json_str!(json, "op"));
    let rd = json_i64!(json, "rd") as i32;
    let rs1 = json_i64!(json, "rs1") as i32;
    let len = json_i64!(json, "len") as i32;
    // `offset` is mandatory: a missing key panics here.
    let (offset_select, offset_value) = json_to_offset(json.get("offset").unwrap());
    inst_data_builder
        .set_rd(rd)
        .set_r1(rs1)
        .set_imm_len(len)
        .set_offset_select(offset_select)
        .set_offset_value(offset_value);
    inst_builder.make_inst(vsoftmax, inst_data_builder.build());
    Ok(())
}
fn json_to_vmv( fn json_to_vmv(
inst_builder: &mut InstructionsBuilder, inst_builder: &mut InstructionsBuilder,
inst_data_builder: &mut InstructionDataBuilder, inst_data_builder: &mut InstructionDataBuilder,

View File

@@ -67,6 +67,22 @@ impl HasSigm for f64 {
} }
} }
/// Exponential function `e^x`, abstracted over the floating-point width so
/// generic SIMD instruction implementations can call `.exp()` on either
/// `f32` or `f64` values.
pub trait HasExp {
    /// Returns `e` raised to the power of `self`.
    fn exp(self) -> Self;
}

impl HasExp for f32 {
    fn exp(self) -> Self {
        // Explicitly delegate to the inherent method; inherent methods take
        // precedence over trait methods, so this cannot recurse.
        f32::exp(self)
    }
}

impl HasExp for f64 {
    fn exp(self) -> Self {
        f64::exp(self)
    }
}
pub trait TryToUsize: TryInto<usize, Error = Self::TryError> pub trait TryToUsize: TryInto<usize, Error = Self::TryError>
@@ -112,6 +128,7 @@ pub trait UpcastDestTraits<T>:
+ PartialOrd<T> + PartialOrd<T>
+ HasTanh + HasTanh
+ HasSigm + HasSigm
+ HasExp
+ FromUsize + FromUsize
{ {
} }

View File

@@ -248,6 +248,22 @@ impl Trace {
{ {
} }
/// No-op tracing hook invoked before a `vsoftmax` instruction executes.
///
/// This `Trace` variant records nothing; the signature mirrors the
/// file-writing tracer so instruction implementations can call it
/// unconditionally.
pub fn pre_vsoftmax<F, T>(&mut self, cores: &mut CPU, data: InstructionData)
where
    [F]: UpcastSlice<T>,
    T: UpcastDestTraits<T> + MemoryStorable,
    F: UpcastDestTraits<F> + MemoryStorable + From<f32>,
{
}
/// No-op tracing hook invoked after a `vsoftmax` instruction executes.
/// See `pre_vsoftmax` on this same impl: this tracer variant records nothing.
pub fn post_vsoftmax<F, T>(&mut self, cores: &mut CPU, data: InstructionData)
where
    [F]: UpcastSlice<T>,
    T: UpcastDestTraits<T> + MemoryStorable,
    F: UpcastDestTraits<F> + MemoryStorable + From<f32>,
{
}
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
/////Communication/synchronization Instructions///////////////// /////Communication/synchronization Instructions/////////////////
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////

View File

@@ -956,6 +956,35 @@ impl Trace {
// Ok(InstructionStatus::Completed) // Ok(InstructionStatus::Completed)
} }
/// Tracing hook invoked before a `vsoftmax` instruction executes: writes a
/// `VSOFTMAX` marker line to the trace file of the instruction's core.
///
/// # Panics
/// Panics if no trace file exists for the core index, or if the write fails.
pub fn pre_vsoftmax<F, T>(&mut self, _cores: &mut CPU, data: InstructionData)
where
    [F]: UpcastSlice<T>,
    T: UpcastDestTraits<T> + MemoryStorable,
    F: UpcastDestTraits<F> + MemoryStorable + From<f32>,
{
    // Only the core index is needed to select the output file; the other
    // decoded fields are intentionally unused here.
    let (core_indx, _rd, _r1, _r2, _imm_len, _offset_select, _offset_value) =
        data.get_core_rd_r1_r2_immlen_offset();
    let file: &mut File = self
        .out_files
        .get_mut(core_indx as usize)
        .expect("File at index not found");
    // `writeln!` returns a must_use io::Result; a silently dropped trace
    // write would hide data loss, so fail hard instead of ignoring it.
    writeln!(file, "\t\tVSOFTMAX\t\t").expect("failed to write vsoftmax trace");
}
/// Tracing hook invoked after a `vsoftmax` instruction executes.
///
/// Currently emits nothing; the file lookup is kept so that a missing trace
/// file for the core panics at the same point as in `pre_vsoftmax`.
///
/// # Panics
/// Panics if no trace file exists for the core index.
pub fn post_vsoftmax<F, T>(&mut self, _cores: &mut CPU, data: InstructionData)
where
    [F]: UpcastSlice<T>,
    T: UpcastDestTraits<T> + MemoryStorable,
    F: UpcastDestTraits<F> + MemoryStorable + From<f32>,
{
    // Unused decoded fields are underscored to document that only the core
    // index matters for this hook.
    let (core_indx, _rd, _r1, _r2, _imm_len, _offset_select, _offset_value) =
        data.get_core_rd_r1_r2_immlen_offset();
    let _file: &mut File = self
        .out_files
        .get_mut(core_indx as usize)
        .expect("File at index not found");
}
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
/////Communication/synchronization Instructions///////////////// /////Communication/synchronization Instructions/////////////////
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////

View File

@@ -386,6 +386,20 @@ void PimCodeGen::codeGenVSigmOp(pim::PimVSigmOp vsigmOp) const {
emitInstruction(std::move(json)); emitInstruction(std::move(json));
} }
// Emits the JSON encoding of a PIM vsoftmax instruction.
//
// setupRdRs1 presumably materializes the destination and source addresses in
// registers 0 and 1 (same pattern as the other codeGenV* ops), which is why
// the emitted instruction hard-codes "rd": 0 and "rs1": 1 — confirm against
// setupRdRs1's definition. "len" is the input size in bytes; the offset field
// is left empty.
void PimCodeGen::codeGenVSoftmaxOp(pim::PimVSoftmaxOp vsoftmaxOp) const {
  auto outputBufferAddr = memory.getValueAddress(vsoftmaxOp.getOutputBuffer());
  auto inputAddr = memory.getValueAddress(vsoftmaxOp.getInput());
  setupRdRs1(outputBufferAddr, 0, inputAddr, 0);
  json::Object json;
  json["op"] = "vsoftmax";
  json["rd"] = 0;
  json["rs1"] = 1;
  json["offset"] = createEmptyOffset();
  json["len"] = getValueSizeInBytes(vsoftmaxOp.getInput());
  emitInstruction(std::move(json));
}
void PimCodeGen::codeGenTransposeOp(pim::PimTransposeOp transposeOp) const { void PimCodeGen::codeGenTransposeOp(pim::PimTransposeOp transposeOp) const {
auto srcAddr = memory.getValueAddress(transposeOp.getInput()); auto srcAddr = memory.getValueAddress(transposeOp.getInput());
auto dstAddr = memory.getValueAddress(transposeOp.getOutputBuffer()); auto dstAddr = memory.getValueAddress(transposeOp.getOutputBuffer());
@@ -537,6 +551,8 @@ static int64_t codeGenCoreOps(pim::PimCoreOp coreOp, PimCodeGen& coreCodeGen) {
coreCodeGen.codeGenVTanhOp(vtanhOp); coreCodeGen.codeGenVTanhOp(vtanhOp);
else if (auto vsigmOp = dyn_cast<pim::PimVSigmOp>(op)) else if (auto vsigmOp = dyn_cast<pim::PimVSigmOp>(op))
coreCodeGen.codeGenVSigmOp(vsigmOp); coreCodeGen.codeGenVSigmOp(vsigmOp);
else if (auto vsoftmaxOp = dyn_cast<pim::PimVSoftmaxOp>(op))
coreCodeGen.codeGenVSoftmaxOp(vsoftmaxOp);
else { else {
op.emitError("Unsupported codegen for this operation"); op.emitError("Unsupported codegen for this operation");
op.dump(); op.dump();

View File

@@ -99,6 +99,7 @@ public:
void codeGenVReluOp(pim::PimVReluOp vreluOp) const; void codeGenVReluOp(pim::PimVReluOp vreluOp) const;
void codeGenVTanhOp(pim::PimVTanhOp vtanhOp) const; void codeGenVTanhOp(pim::PimVTanhOp vtanhOp) const;
void codeGenVSigmOp(pim::PimVSigmOp vsigmOp) const; void codeGenVSigmOp(pim::PimVSigmOp vsigmOp) const;
void codeGenVSoftmaxOp(pim::PimVSoftmaxOp vsoftmaxOp) const;
void codeGenTransposeOp(pim::PimTransposeOp transposeOp) const; void codeGenTransposeOp(pim::PimTransposeOp transposeOp) const;
}; };

View File

@@ -11,8 +11,12 @@ add_pim_library(OMONNXToSpatial
Patterns/NN/Pool.cpp Patterns/NN/Pool.cpp
Patterns/NN/Relu.cpp Patterns/NN/Relu.cpp
Patterns/NN/Sigmoid.cpp Patterns/NN/Sigmoid.cpp
Patterns/NN/Softmax.cpp
Patterns/Tensor/Concat.cpp Patterns/Tensor/Concat.cpp
Patterns/Tensor/Gather.cpp
Patterns/Tensor/Resize.cpp
Patterns/Tensor/Reshape.cpp Patterns/Tensor/Reshape.cpp
Patterns/Tensor/Split.cpp
ONNXToSpatialPass.cpp ONNXToSpatialPass.cpp
Common.cpp Common.cpp

View File

@@ -89,9 +89,12 @@ void ONNXToSpatialPass::runOnOperation() {
target.addIllegalOp<ONNXSigmoidOp>(); target.addIllegalOp<ONNXSigmoidOp>();
target.addIllegalOp<ONNXSoftmaxOp>(); target.addIllegalOp<ONNXSoftmaxOp>();
target.addIllegalOp<ONNXConcatOp>(); target.addIllegalOp<ONNXConcatOp>();
target.addIllegalOp<ONNXGatherOp>();
target.addIllegalOp<ONNXReshapeOp>(); target.addIllegalOp<ONNXReshapeOp>();
target.addIllegalOp<ONNXResizeOp>();
target.addIllegalOp<ONNXLRNOp>(); target.addIllegalOp<ONNXLRNOp>();
target.addIllegalOp<ONNXReduceMeanV13Op>(); target.addIllegalOp<ONNXReduceMeanV13Op>();
target.addIllegalOp<ONNXSplitOp>();
RewritePatternSet patterns(ctx); RewritePatternSet patterns(ctx);
patterns.add<removeLRN>(ctx); patterns.add<removeLRN>(ctx);
@@ -103,8 +106,12 @@ void ONNXToSpatialPass::runOnOperation() {
populateReduceMeanPatterns(patterns, ctx); populateReduceMeanPatterns(patterns, ctx);
populateReluPatterns(patterns, ctx); populateReluPatterns(patterns, ctx);
populateSigmoidPatterns(patterns, ctx); populateSigmoidPatterns(patterns, ctx);
populateSoftmaxPatterns(patterns, ctx);
populateConcatPatterns(patterns, ctx); populateConcatPatterns(patterns, ctx);
populateGatherPatterns(patterns, ctx);
populateResizePatterns(patterns, ctx);
populateReshapePatterns(patterns, ctx); populateReshapePatterns(patterns, ctx);
populateSplitPatterns(patterns, ctx);
if (failed(applyPartialConversion(moduleOp, target, std::move(patterns)))) { if (failed(applyPartialConversion(moduleOp, target, std::move(patterns)))) {
signalPassFailure(); signalPassFailure();
@@ -168,7 +175,7 @@ bool encapsulateConcat(IRRewriter& rewriter, Location loc, Operation* inst) {
auto newCompute = spatial::SpatWeightedCompute::create(rewriter, loc, inst->getResultTypes().front(), sources); auto newCompute = spatial::SpatWeightedCompute::create(rewriter, loc, inst->getResultTypes().front(), sources);
llvm::SmallVector<Type> sourceTypes; llvm::SmallVector<Type> sourceTypes;
llvm::SmallVector<Location> sourceLoc; llvm::SmallVector<Location> sourceLoc;
for (auto source : sources){ for (auto source : sources) {
sourceTypes.push_back(source.getType()); sourceTypes.push_back(source.getType());
sourceLoc.push_back(loc); sourceLoc.push_back(loc);
} }
@@ -176,7 +183,7 @@ bool encapsulateConcat(IRRewriter& rewriter, Location loc, Operation* inst) {
newCompute.getProperties().setOperandSegmentSizes({(int) 0, (int) sources.size()}); newCompute.getProperties().setOperandSegmentSizes({(int) 0, (int) sources.size()});
rewriter.setInsertionPointToEnd(BB); rewriter.setInsertionPointToEnd(BB);
IRMapping mapper; IRMapping mapper;
for(auto [source,bbArg] : llvm::zip(sources, BB->getArguments())) for (auto [source, bbArg] : llvm::zip(sources, BB->getArguments()))
mapper.map(source, bbArg); mapper.map(source, bbArg);
auto newConcat = rewriter.clone(*inst, mapper); auto newConcat = rewriter.clone(*inst, mapper);
spatial::SpatYieldOp::create(rewriter, loc, newConcat->getResult(0)); spatial::SpatYieldOp::create(rewriter, loc, newConcat->getResult(0));

View File

@@ -21,8 +21,16 @@ void populateReluPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext*
void populateSigmoidPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx); void populateSigmoidPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
void populateSoftmaxPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
void populateConcatPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx); void populateConcatPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
void populateGatherPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
void populateResizePatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
void populateReshapePatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx); void populateReshapePatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
void populateSplitPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
} // namespace onnx_mlir } // namespace onnx_mlir

View File

@@ -92,10 +92,8 @@ static FailureOr<Value> materializeBroadcastedConstantTensor(Value value,
return arith::ConstantOp::create(rewriter, loc, resultType, broadcastedAttr).getResult(); return arith::ConstantOp::create(rewriter, loc, resultType, broadcastedAttr).getResult();
} }
static FailureOr<Value> prepareElementwiseOperand(Value value, static FailureOr<Value>
RankedTensorType resultType, prepareElementwiseOperand(Value value, RankedTensorType resultType, ConversionPatternRewriter& rewriter, Location loc) {
ConversionPatternRewriter& rewriter,
Location loc) {
auto valueType = dyn_cast<RankedTensorType>(value.getType()); auto valueType = dyn_cast<RankedTensorType>(value.getType());
if (!valueType || !valueType.hasStaticShape()) if (!valueType || !valueType.hasStaticShape())
return failure(); return failure();

View File

@@ -280,8 +280,8 @@ LogicalResult GemvToSpatialCompute::matchAndRewrite(ONNXGemmOp gemmOp,
for (size_t aSliceId = 0; aSliceId < aHSlices[coreId].size(); aSliceId++) for (size_t aSliceId = 0; aSliceId < aHSlices[coreId].size(); aSliceId++)
weights.push_back(bTiles[outSliceId][coreId][aSliceId]); weights.push_back(bTiles[outSliceId][coreId][aSliceId]);
auto computeOp = auto computeOp = createSpatCompute(
createSpatCompute(rewriter, gemmLoc, currOutHSliceType, weights, aHSlices[coreId], [&](ValueRange aHSlicesArgs) { rewriter, gemmLoc, currOutHSliceType, weights, aHSlices[coreId], [&](ValueRange aHSlicesArgs) {
SmallVector<Value> vmmOutputs; SmallVector<Value> vmmOutputs;
vmmOutputs.reserve(aHSlicesArgs.size()); vmmOutputs.reserve(aHSlicesArgs.size());
for (auto [aHSliceId, computeArg] : llvm::enumerate(aHSlicesArgs)) for (auto [aHSliceId, computeArg] : llvm::enumerate(aHSlicesArgs))

View File

@@ -71,10 +71,8 @@ static SmallVector<ReassociationIndices> buildCollapseReassociation(ArrayRef<boo
return reassociation; return reassociation;
} }
static Value createAverageCompute(Value input, static Value
RankedTensorType resultType, createAverageCompute(Value input, RankedTensorType resultType, ConversionPatternRewriter& rewriter, Location loc) {
ConversionPatternRewriter& rewriter,
Location loc) {
constexpr size_t numInputs = 1; constexpr size_t numInputs = 1;
auto computeOp = createSpatCompute<numInputs>(rewriter, loc, resultType, {}, ValueRange {input}, [&](Value x) { auto computeOp = createSpatCompute<numInputs>(rewriter, loc, resultType, {}, ValueRange {input}, [&](Value x) {
auto avgOp = spatial::SpatVAvgOp::create(rewriter, loc, resultType, x); auto avgOp = spatial::SpatVAvgOp::create(rewriter, loc, resultType, x);
@@ -141,7 +139,8 @@ struct ReduceMeanToSpatialCompute : OpConversionPattern<ONNXReduceMeanV13Op> {
Location loc = reduceMeanOp.getLoc(); Location loc = reduceMeanOp.getLoc();
RankedTensorType leafType = getAllOnesType(inputType, resultType.getElementType()); RankedTensorType leafType = getAllOnesType(inputType, resultType.getElementType());
Value reducedKeepdims = buildReduceMeanKeepdims(adaptor.getData(), reducedAxes, /*axis=*/0, leafType, rewriter, loc); Value reducedKeepdims =
buildReduceMeanKeepdims(adaptor.getData(), reducedAxes, /*axis=*/0, leafType, rewriter, loc);
if (reduceMeanOp.getKeepdims() != 0) { if (reduceMeanOp.getKeepdims() != 0) {
rewriter.replaceOp(reduceMeanOp, reducedKeepdims); rewriter.replaceOp(reduceMeanOp, reducedKeepdims);

View File

@@ -0,0 +1,111 @@
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Transforms/DialectConversion.h"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"
using namespace mlir;
namespace onnx_mlir {
namespace {
// Maps a possibly-negative ONNX axis into [0, rank): negative axes count
// from the end, as in `tensor[-1]` indexing.
static int64_t normalizeAxis(int64_t axis, int64_t rank) {
  if (axis < 0)
    return rank + axis;
  return axis;
}
// Applies `permutation` to `shape`: element i of the result is
// shape[permutation[i]]. `permutation` must only contain valid indices
// into `shape`.
static SmallVector<int64_t> permuteShape(ArrayRef<int64_t> shape, ArrayRef<int64_t> permutation) {
  SmallVector<int64_t> permutedShape;
  permutedShape.reserve(permutation.size());
  for (size_t i = 0; i < permutation.size(); ++i)
    permutedShape.push_back(shape[permutation[i]]);
  return permutedShape;
}
// Wraps `input` in a single-input spatial compute region whose body applies
// SpatSoftmaxOp over the whole tensor and yields the result. The result type
// equals the input type.
static Value createSoftmaxCompute(Value input, ConversionPatternRewriter& rewriter, Location loc) {
  auto inputType = cast<RankedTensorType>(input.getType());
  constexpr size_t numInputs = 1;
  auto computeOp =
      createSpatCompute<numInputs>(rewriter, loc, TypeRange {inputType}, {}, ValueRange {input}, [&](Value x) {
        auto softmaxOp = spatial::SpatSoftmaxOp::create(rewriter, loc, inputType, x);
        spatial::SpatYieldOp::create(rewriter, loc, softmaxOp.getResult());
      });
  return computeOp.getResult(0);
}
// Recursively decomposes `input` so that softmax is applied along
// `softmaxAxis`.
//
// `axis` scans dimensions left to right: the softmax axis itself is skipped
// (kept whole); every other axis is cut into size-1 slices via sliceTensor,
// each slice is recursed on, and the pieces are concatenated back along the
// same axis. Once every dimension has been visited (`axis == rank`), the
// remaining tensor is handed to createSoftmaxCompute.
// NOTE(review): both call sites pass softmaxAxis == rank - 1 (innermost
// axis); other axes go through the transpose path in SoftmaxToSpatialCompute.
static Value
buildSoftmax(Value input, int64_t softmaxAxis, int64_t axis, ConversionPatternRewriter& rewriter, Location loc) {
  auto inputType = cast<RankedTensorType>(input.getType());
  // All axes processed: compute softmax on what remains.
  if (axis == inputType.getRank())
    return createSoftmaxCompute(input, rewriter, loc);
  // Never slice along the softmax axis itself.
  if (axis == softmaxAxis)
    return buildSoftmax(input, softmaxAxis, axis + 1, rewriter, loc);
  SmallVector<Value> slices = sliceTensor(input, axis, /*sliceSize=*/1, rewriter, loc);
  SmallVector<Value> rebuiltSlices;
  rebuiltSlices.reserve(slices.size());
  for (Value slice : slices)
    rebuiltSlices.push_back(buildSoftmax(slice, softmaxAxis, axis + 1, rewriter, loc));
  // A single slice needs no concat; otherwise stitch the results back.
  return rebuiltSlices.size() == 1 ? rebuiltSlices.front()
                                   : tensor::ConcatOp::create(rewriter, loc, axis, rebuiltSlices).getResult();
}
// Converts ONNXSoftmaxOp into spatial compute ops.
//
// Last-axis softmax is decomposed directly via buildSoftmax; for any other
// axis the input is first transposed so the softmax axis becomes innermost,
// softmax is applied, and the inverse permutation restores the original
// layout. Only statically shaped ranked tensors are supported.
struct SoftmaxToSpatialCompute : OpConversionPattern<ONNXSoftmaxOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult matchAndRewrite(ONNXSoftmaxOp softmaxOp,
      ONNXSoftmaxOpAdaptor adaptor,
      ConversionPatternRewriter& rewriter) const override {
    auto inputType = dyn_cast<RankedTensorType>(adaptor.getInput().getType());
    if (!inputType || !inputType.hasStaticShape())
      return failure();
    int64_t axis = normalizeAxis(softmaxOp.getAxis(), inputType.getRank());
    if (axis < 0 || axis >= inputType.getRank())
      return failure();
    Value input = adaptor.getInput();
    Value result;
    if (axis == inputType.getRank() - 1) {
      // Softmax already runs along the innermost axis: decompose in place.
      result = buildSoftmax(input, axis, /*axis=*/0, rewriter, softmaxOp.getLoc());
    }
    else {
      // Build the permutation that moves `axis` to the last position, plus
      // its inverse to undo the move afterwards.
      SmallVector<int64_t> permutation;
      permutation.reserve(inputType.getRank());
      for (int64_t dim = 0; dim < inputType.getRank(); ++dim)
        if (dim != axis)
          permutation.push_back(dim);
      permutation.push_back(axis);
      SmallVector<int64_t> inversePermutation(inputType.getRank());
      for (auto [newIndex, oldIndex] : llvm::enumerate(permutation))
        inversePermutation[oldIndex] = static_cast<int64_t>(newIndex);
      auto transposedType = RankedTensorType::get(
          permuteShape(inputType.getShape(), permutation), inputType.getElementType(), inputType.getEncoding());
      // Forward transpose, wrapped in a spatial compute region.
      auto preTransposeCompute =
          createSpatCompute<1>(rewriter, softmaxOp.getLoc(), TypeRange {transposedType}, {}, input, [&](Value x) {
            Value transposed = ONNXTransposeOp::create(
                rewriter, softmaxOp.getLoc(), transposedType, x, rewriter.getI64ArrayAttr(permutation));
            spatial::SpatYieldOp::create(rewriter, softmaxOp.getLoc(), transposed);
          });
      Value transposedInput = preTransposeCompute.getResult(0);
      Value transposedResult = buildSoftmax(
          transposedInput, /*softmaxAxis=*/inputType.getRank() - 1, /*axis=*/0, rewriter, softmaxOp.getLoc());
      // NOTE(review): unlike the forward transpose, the inverse transpose is
      // emitted as a bare ONNX op outside any compute region — confirm it is
      // legal/handled downstream in this conversion target.
      result = ONNXTransposeOp::create(
          rewriter, softmaxOp.getLoc(), inputType, transposedResult, rewriter.getI64ArrayAttr(inversePermutation));
    }
    rewriter.replaceOp(softmaxOp, result);
    return success();
  }
};
} // namespace
// Registers the ONNXSoftmaxOp -> spatial conversion pattern.
void populateSoftmaxPatterns(RewritePatternSet& patterns, MLIRContext* ctx) {
  patterns.add<SoftmaxToSpatialCompute>(ctx);
}
} // namespace onnx_mlir

View File

@@ -23,8 +23,6 @@ struct Concat : public OpConversionPattern<ONNXConcatOp> {
} }
}; };
void populateConcatPatterns(RewritePatternSet& patterns, MLIRContext* ctx) { void populateConcatPatterns(RewritePatternSet& patterns, MLIRContext* ctx) { patterns.insert<Concat>(ctx); }
patterns.insert<Concat>(ctx);
}
} // namespace onnx_mlir } // namespace onnx_mlir

View File

@@ -0,0 +1,157 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/SmallVector.h"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"
using namespace mlir;
namespace onnx_mlir {
namespace {
// Maps a possibly-negative ONNX axis into [0, rank): negative axes count
// from the end of the shape.
static int64_t normalizeAxis(int64_t axis, int64_t rank) {
  if (axis < 0)
    return rank + axis;
  return axis;
}
// Maps a possibly-negative gather index into [0, dimSize): negative indices
// count from the end of the axis, as in ONNX/NumPy indexing.
static int64_t normalizeIndex(int64_t index, int64_t dimSize) {
  if (index < 0)
    return dimSize + index;
  return index;
}
// Extracts the rank-preserving slice of `input` at position `offset` along
// `axis`: size 1 on that axis, full extent on every other axis, unit strides.
static Value
extractSliceAt(Value input, int64_t axis, int64_t offset, ConversionPatternRewriter& rewriter, Location loc) {
  auto inputType = cast<RankedTensorType>(input.getType());
  // Start from the whole tensor: zero offsets, full sizes, unit strides.
  SmallVector<OpFoldResult> offsets(inputType.getRank(), rewriter.getIndexAttr(0));
  SmallVector<OpFoldResult> sizes;
  SmallVector<OpFoldResult> strides(inputType.getRank(), rewriter.getIndexAttr(1));
  sizes.reserve(inputType.getRank());
  for (int64_t dim : inputType.getShape())
    sizes.push_back(rewriter.getIndexAttr(dim));
  // Narrow the requested axis to a single element at `offset`.
  offsets[axis] = rewriter.getIndexAttr(offset);
  sizes[axis] = rewriter.getIndexAttr(1);
  return tensor::ExtractSliceOp::create(rewriter, loc, input, offsets, sizes, strides);
}
// Gathers `indices` along `axis` by extracting one size-1 slice per index
// and concatenating them in order. Returns a null Value when any normalized
// index falls outside [0, axisDim) or when `indices` is empty; callers must
// check the result.
static Value concatGatherSlices(Value data,
    int64_t axis,
    ArrayRef<int64_t> indices,
    int64_t axisDim,
    ConversionPatternRewriter& rewriter,
    Location loc) {
  SmallVector<Value> slices;
  slices.reserve(indices.size());
  for (int64_t index : indices) {
    // Negative indices count from the end; anything still out of range is
    // a hard failure signalled by the null Value.
    int64_t normalizedIndex = normalizeIndex(index, axisDim);
    if (normalizedIndex < 0 || normalizedIndex >= axisDim)
      return {};
    slices.push_back(extractSliceAt(data, axis, normalizedIndex, rewriter, loc));
  }
  if (slices.empty())
    return {};
  // A single slice needs no concat.
  return slices.size() == 1 ? slices.front() : tensor::ConcatOp::create(rewriter, loc, axis, slices).getResult();
}
// Inserts a unit dimension immediately before `axis` via tensor.expand_shape,
// e.g. (A, B, C) with axis == 1 becomes (A, 1, B, C). Used to stack the rows
// produced for 2-D gather indices.
static Value addLeadingGatherDim(Value value, int64_t axis, ConversionPatternRewriter& rewriter, Location loc) {
  auto valueType = cast<RankedTensorType>(value.getType());
  SmallVector<int64_t> resultShape;
  SmallVector<ReassociationIndices> reassociation;
  resultShape.reserve(valueType.getRank() + 1);
  reassociation.reserve(valueType.getRank());
  int64_t resultDim = 0;
  for (int64_t dim = 0; dim < valueType.getRank(); ++dim) {
    if (dim == axis) {
      // The source dim at `axis` expands into (1, original extent).
      resultShape.push_back(1);
      resultShape.push_back(valueType.getShape()[dim]);
      reassociation.push_back({static_cast<int64_t>(resultDim), static_cast<int64_t>(resultDim + 1)});
      resultDim += 2;
      continue;
    }
    // Every other source dim maps 1:1 onto a result dim.
    resultShape.push_back(valueType.getShape()[dim]);
    reassociation.push_back({static_cast<int64_t>(resultDim)});
    resultDim++;
  }
  auto resultType = RankedTensorType::get(resultShape, valueType.getElementType(), valueType.getEncoding());
  return tensor::ExpandShapeOp::create(rewriter, loc, resultType, value, reassociation);
}
// Converts ONNXGatherOp (with constant 1-D or 2-D indices) into a spatial
// compute region built from extract_slice / concat / expand_shape ops.
// Bails out (failure) on dynamic shapes, non-constant indices, out-of-range
// axes/indices, or indices of rank > 2.
struct Gather : OpConversionPattern<ONNXGatherOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult matchAndRewrite(ONNXGatherOp gatherOp,
      ONNXGatherOpAdaptor adaptor,
      ConversionPatternRewriter& rewriter) const override {
    // Static shapes only.
    auto dataType = dyn_cast<RankedTensorType>(adaptor.getData().getType());
    auto indicesType = dyn_cast<RankedTensorType>(adaptor.getIndices().getType());
    if (!dataType || !indicesType || !dataType.hasStaticShape() || !indicesType.hasStaticShape())
      return failure();
    // Indices must be a compile-time constant so the gather can be unrolled.
    auto indicesConst = adaptor.getIndices().getDefiningOp<arith::ConstantOp>();
    if (!indicesConst)
      return failure();
    auto indicesAttr = dyn_cast<DenseIntElementsAttr>(indicesConst.getValue());
    if (!indicesAttr)
      return failure();
    int64_t rank = dataType.getRank();
    int64_t axis = normalizeAxis(gatherOp.getAxis(), rank);
    if (axis < 0 || axis >= rank)
      return failure();
    int64_t axisDim = dataType.getShape()[axis];
    if (axisDim <= 0)
      return failure();
    // Flattened row-major copy of the constant indices.
    SmallVector<int64_t> flatIndices(indicesAttr.getValues<int64_t>().begin(), indicesAttr.getValues<int64_t>().end());
    Location loc = gatherOp.getLoc();
    auto computeOp =
        createSpatCompute<1>(rewriter,
            loc,
            TypeRange {gatherOp.getResult().getType()},
            {},
            adaptor.getData(),
            [&](Value data) -> LogicalResult {
              Value result;
              if (indicesType.getRank() == 1) {
                // 1-D indices: plain slice-and-concat along `axis`.
                result = concatGatherSlices(data, axis, flatIndices, axisDim, rewriter, loc);
              }
              else if (indicesType.getRank() == 2) {
                // 2-D indices: gather each row, give every row an extra unit
                // dim at `axis`, then stack the rows along that dim.
                int64_t rowCount = indicesType.getShape()[0];
                int64_t rowWidth = indicesType.getShape()[1];
                SmallVector<Value> rows;
                rows.reserve(rowCount);
                for (int64_t row = 0; row < rowCount; ++row) {
                  ArrayRef<int64_t> rowIndices(flatIndices.data() + row * rowWidth, rowWidth);
                  Value gatheredRow = concatGatherSlices(data, axis, rowIndices, axisDim, rewriter, loc);
                  if (!gatheredRow)
                    return failure();
                  rows.push_back(addLeadingGatherDim(gatheredRow, axis, rewriter, loc));
                }
                result = rows.size() == 1
                    ? rows.front()
                    : tensor::ConcatOp::create(rewriter, loc, /*axis=*/axis, rows).getResult();
              }
              else {
                // Higher-rank indices are not supported.
                return failure();
              }
              if (!result)
                return failure();
              spatial::SpatYieldOp::create(rewriter, loc, result);
              return success();
            });
    if (failed(computeOp))
      return failure();
    rewriter.replaceOp(gatherOp, computeOp->getResults());
    return success();
  }
};
} // namespace
// Registers the ONNXGatherOp -> spatial conversion pattern.
void populateGatherPatterns(RewritePatternSet& patterns, MLIRContext* ctx) { patterns.add<Gather>(ctx); }
} // namespace onnx_mlir

View File

@@ -0,0 +1,90 @@
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/STLExtras.h"
#include <algorithm>
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"
using namespace mlir;
namespace onnx_mlir {
namespace {
// Extracts the rank-preserving slice of `input` at position `offset` along
// `axis`: size 1 on that axis, full extent on every other axis, unit strides.
static Value
extractSliceAt(Value input, int64_t axis, int64_t offset, ConversionPatternRewriter& rewriter, Location loc) {
  auto inputType = cast<RankedTensorType>(input.getType());
  // Start from the whole tensor: zero offsets, full sizes, unit strides.
  SmallVector<OpFoldResult> offsets(inputType.getRank(), rewriter.getIndexAttr(0));
  SmallVector<OpFoldResult> sizes;
  SmallVector<OpFoldResult> strides(inputType.getRank(), rewriter.getIndexAttr(1));
  sizes.reserve(inputType.getRank());
  for (int64_t dim : inputType.getShape())
    sizes.push_back(rewriter.getIndexAttr(dim));
  // Narrow the requested axis to a single element at `offset`.
  offsets[axis] = rewriter.getIndexAttr(offset);
  sizes[axis] = rewriter.getIndexAttr(1);
  return tensor::ExtractSliceOp::create(rewriter, loc, input, offsets, sizes, strides);
}
// Maps an output coordinate back to its input coordinate for nearest-neighbor
// resize with the "asymmetric" coordinate transform and "floor" rounding:
// floor(outputIndex * inputDim / outputDim), clamped to the last valid index.
static int64_t nearestAsymmetricIndex(int64_t outputIndex, int64_t inputDim, int64_t outputDim) {
  int64_t scaled = (outputIndex * inputDim) / outputDim;
  int64_t lastIndex = inputDim - 1;
  return scaled < lastIndex ? scaled : lastIndex;
}
// Recursively unrolls nearest-neighbor resize one axis at a time: for every
// output coordinate along `axis` it extracts the matching input slice (per
// nearestAsymmetricIndex), recurses on the remaining axes, and concatenates
// the per-coordinate results. Recursion bottoms out once all axes have been
// handled. Note the op count grows with the product of the output extents.
static Value buildNearestResize(Value input,
    ArrayRef<int64_t> inputShape,
    ArrayRef<int64_t> outputShape,
    int64_t axis,
    ConversionPatternRewriter& rewriter,
    Location loc) {
  // All axes processed: the remaining tensor is the resized element block.
  if (axis == static_cast<int64_t>(outputShape.size()))
    return input;
  SmallVector<Value> slices;
  slices.reserve(outputShape[axis]);
  for (int64_t outputIndex = 0; outputIndex < outputShape[axis]; ++outputIndex) {
    int64_t inputIndex = nearestAsymmetricIndex(outputIndex, inputShape[axis], outputShape[axis]);
    Value slice = extractSliceAt(input, axis, inputIndex, rewriter, loc);
    slices.push_back(buildNearestResize(slice, inputShape, outputShape, axis + 1, rewriter, loc));
  }
  // A single slice needs no concat.
  return slices.size() == 1 ? slices.front() : tensor::ConcatOp::create(rewriter, loc, axis, slices).getResult();
}
// Converts ONNXResizeOp into a spatial compute region, but only for the
// nearest / asymmetric / floor configuration with fully static, positive
// shapes. The resize is fully unrolled into extract_slice/concat ops by
// buildNearestResize.
struct Resize : OpConversionPattern<ONNXResizeOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult matchAndRewrite(ONNXResizeOp resizeOp,
      ONNXResizeOpAdaptor adaptor,
      ConversionPatternRewriter& rewriter) const override {
    auto inputType = dyn_cast<RankedTensorType>(adaptor.getX().getType());
    auto resultType = dyn_cast<RankedTensorType>(resizeOp.getY().getType());
    if (!inputType || !resultType || !inputType.hasStaticShape() || !resultType.hasStaticShape())
      return failure();
    // Only the exact mode combination implemented by nearestAsymmetricIndex.
    if (resizeOp.getMode() != "nearest" || resizeOp.getCoordinateTransformationMode() != "asymmetric"
        || resizeOp.getNearestMode() != "floor")
      return failure();
    // Reject non-positive extents so the index arithmetic below is safe.
    if (llvm::any_of(inputType.getShape(), [](int64_t dim) { return dim <= 0; })
        || llvm::any_of(resultType.getShape(), [](int64_t dim) { return dim <= 0; }))
      return failure();
    auto computeOp =
        createSpatCompute<1>(rewriter, resizeOp.getLoc(), TypeRange {resultType}, {}, adaptor.getX(), [&](Value x) {
          Value result =
              buildNearestResize(x, inputType.getShape(), resultType.getShape(), /*axis=*/0, rewriter, resizeOp.getLoc());
          spatial::SpatYieldOp::create(rewriter, resizeOp.getLoc(), result);
        });
    rewriter.replaceOp(resizeOp, computeOp.getResults());
    return success();
  }
};
} // namespace
// Registers the ONNXResizeOp -> spatial conversion pattern.
void populateResizePatterns(RewritePatternSet& patterns, MLIRContext* ctx) { patterns.add<Resize>(ctx); }
} // namespace onnx_mlir

View File

@@ -0,0 +1,70 @@
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Transforms/DialectConversion.h"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"
using namespace mlir;
namespace onnx_mlir {
namespace {
// Maps a possibly-negative ONNX split axis into [0, rank): negative axes
// count from the end of the shape.
static int64_t normalizeAxis(int64_t axis, int64_t rank) {
  if (axis < 0)
    return rank + axis;
  return axis;
}
// Extracts the rank-preserving slice of `input` spanning
// [offset, offset + size) along `axis`, with full extent on every other
// axis and unit strides.
static Value extractSliceAt(
    Value input, int64_t axis, int64_t offset, int64_t size, ConversionPatternRewriter& rewriter, Location loc) {
  auto inputType = cast<RankedTensorType>(input.getType());
  // Start from the whole tensor: zero offsets, full sizes, unit strides.
  SmallVector<OpFoldResult> offsets(inputType.getRank(), rewriter.getIndexAttr(0));
  SmallVector<OpFoldResult> sizes;
  SmallVector<OpFoldResult> strides(inputType.getRank(), rewriter.getIndexAttr(1));
  sizes.reserve(inputType.getRank());
  for (int64_t dim : inputType.getShape())
    sizes.push_back(rewriter.getIndexAttr(dim));
  // Narrow the requested axis to the [offset, offset + size) window.
  offsets[axis] = rewriter.getIndexAttr(offset);
  sizes[axis] = rewriter.getIndexAttr(size);
  return tensor::ExtractSliceOp::create(rewriter, loc, input, offsets, sizes, strides);
}
// Converts ONNXSplitOp: each result becomes its own spatial compute region
// that extracts the corresponding contiguous window of the shared input
// along the split axis. Split sizes are read from the static result shapes
// rather than from the op's split attribute.
struct Split : OpConversionPattern<ONNXSplitOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult
  matchAndRewrite(ONNXSplitOp splitOp, ONNXSplitOpAdaptor adaptor, ConversionPatternRewriter& rewriter) const override {
    auto inputType = dyn_cast<RankedTensorType>(adaptor.getInput().getType());
    if (!inputType || !inputType.hasStaticShape())
      return failure();
    int64_t rank = inputType.getRank();
    int64_t axis = normalizeAxis(splitOp.getAxis(), rank);
    if (axis < 0 || axis >= rank)
      return failure();
    SmallVector<Value> outputs;
    outputs.reserve(splitOp.getNumResults());
    // Running offset of the current window along the split axis.
    int64_t offset = 0;
    for (Value result : splitOp.getResults()) {
      auto resultType = dyn_cast<RankedTensorType>(result.getType());
      if (!resultType || !resultType.hasStaticShape())
        return failure();
      // Each result's extent on `axis` is its slice size.
      int64_t sliceSize = resultType.getShape()[axis];
      auto computeOp =
          createSpatCompute<1>(rewriter, splitOp.getLoc(), TypeRange {resultType}, {}, adaptor.getInput(), [&](Value x) {
            Value output = extractSliceAt(x, axis, offset, sliceSize, rewriter, splitOp.getLoc());
            spatial::SpatYieldOp::create(rewriter, splitOp.getLoc(), output);
          });
      outputs.push_back(computeOp.getResult(0));
      offset += sliceSize;
    }
    rewriter.replaceOp(splitOp, outputs);
    return success();
  }
};
} // namespace
void populateSplitPatterns(RewritePatternSet& patterns, MLIRContext* ctx) { patterns.add<Split>(ctx); }
} // namespace onnx_mlir

View File

@@ -63,4 +63,10 @@ def spatToPimVSigm : Pat<
(NativeCodeCall<"onnx_mlir::getBestOutputTensorFromOperandsOrAllocate($_builder, $0.getDefiningOp())"> $srcOpRes)) (NativeCodeCall<"onnx_mlir::getBestOutputTensorFromOperandsOrAllocate($_builder, $0.getDefiningOp())"> $srcOpRes))
>; >;
// Rewrite a spatial softmax into the PIM vsoftmax op. The destination buffer
// is picked by getBestOutputTensorFromOperandsOrAllocate: reuse an operand's
// tensor when legal, otherwise allocate a fresh one.
def spatToPimVSoftmax : Pat<
  (SpatSoftmaxOp:$srcOpRes $input),
  (PimVSoftmaxOp $input,
    (NativeCodeCall<"onnx_mlir::getBestOutputTensorFromOperandsOrAllocate($_builder, $0.getDefiningOp())"> $srcOpRes))
>;
#endif // SPATIAL_TO_PIM #endif // SPATIAL_TO_PIM

View File

@@ -618,17 +618,22 @@ void SpatialToPimPass::markOpToRemove(Operation* op) {
} }
void SpatialToPimPass::replaceReturnOpOperands(func::ReturnOp& returnOp, IRRewriter& rewriter) { void SpatialToPimPass::replaceReturnOpOperands(func::ReturnOp& returnOp, IRRewriter& rewriter) {
for (auto it : llvm::enumerate(returnOp.getOperands())) { SmallVector<Value> originalOperands(returnOp.getOperands().begin(), returnOp.getOperands().end());
Operation* returnOperand = it.value().getDefiningOp(); for (auto it : llvm::enumerate(originalOperands)) {
size_t orderWithinReturn = it.index(); size_t orderWithinReturn = it.index();
Operation* returnOperand = it.value().getDefiningOp();
rewriter.modifyOpInPlace(returnOp, rewriter.modifyOpInPlace(returnOp,
[&] { returnOp.setOperand(orderWithinReturn, outputTensors[orderWithinReturn]); }); [&] { returnOp.setOperand(orderWithinReturn, outputTensors[orderWithinReturn]); });
Operation* opToErase = returnOperand; Operation* opToErase = returnOperand;
while (opToErase) { while (opToErase) {
bool isExclusivelyOwnedByReturnChain = opToErase->use_empty() || opToErase->hasOneUse(); bool isExclusivelyOwnedByReturnChain = opToErase->use_empty();
if (!isExclusivelyOwnedByReturnChain && opToErase->hasOneUse()) {
Operation* onlyUser = *opToErase->getUsers().begin();
isExclusivelyOwnedByReturnChain =
isa<func::ReturnOp, tensor::ConcatOp>(onlyUser) || isChannelUseChainOp(onlyUser);
}
if (!isExclusivelyOwnedByReturnChain) if (!isExclusivelyOwnedByReturnChain)
break; break;

View File

@@ -455,4 +455,27 @@ def PimVSigmOp : PimOp<"vsigm", [DestinationStyleOpInterface]> {
}]; }];
} }
// Destination-style PIM softmax: reads $input, writes into $outputBuffer, and
// returns the written tensor as $output. Mirrors the other PIM unary vector
// ops (vrelu/vtanh/vsigm).
def PimVSoftmaxOp : PimOp<"vsoftmax", [DestinationStyleOpInterface]> {
  let summary = "Softmax over the full input vector";
  let arguments = (ins
    PimTensor:$input,
    PimTensor:$outputBuffer
  );
  let results = (outs
    PimTensor:$output
  );
  // DestinationStyleOpInterface hook: the only DPS init is the output buffer.
  let extraClassDeclaration = [{
    mlir::MutableOperandRange getDpsInitsMutable() {
      return getOutputBufferMutable();
    }
  }];
  let assemblyFormat = [{
    `(` $input `,` $outputBuffer `)` attr-dict `:` `(` type($input) `,` type($outputBuffer) `)` `->` type($output)
  }];
}
#endif // PIM_DIALECT_H #endif // PIM_DIALECT_H

View File

@@ -273,6 +273,7 @@ void registerOpBufferizationInterfaces(DialectRegistry& registry) {
PimVReluOp::attachInterface<UnaryDstOpInterface<PimVReluOp>>(*ctx); PimVReluOp::attachInterface<UnaryDstOpInterface<PimVReluOp>>(*ctx);
PimVTanhOp::attachInterface<UnaryDstOpInterface<PimVTanhOp>>(*ctx); PimVTanhOp::attachInterface<UnaryDstOpInterface<PimVTanhOp>>(*ctx);
PimVSigmOp::attachInterface<UnaryDstOpInterface<PimVSigmOp>>(*ctx); PimVSigmOp::attachInterface<UnaryDstOpInterface<PimVSigmOp>>(*ctx);
PimVSoftmaxOp::attachInterface<UnaryDstOpInterface<PimVSoftmaxOp>>(*ctx);
}); });
} }

View File

@@ -485,7 +485,7 @@ DCPAnalysisResult GraphDCP::getResult() {
size_t i = 0; size_t i = 0;
for (auto node : nodes) { for (auto node : nodes) {
ret.computeToCPUMap[node->getSpatWeightedCompute()] = cpu; ret.computeToCPUMap[node->getSpatWeightedCompute()] = cpu;
if (i++ == nodes.size() - 1){ if (i++ == nodes.size() - 1) {
ret.isLastComputeOfACpu.insert(node->getSpatWeightedCompute()); ret.isLastComputeOfACpu.insert(node->getSpatWeightedCompute());
ret.cpuToLastComputeMap[cpu] = node->getSpatWeightedCompute(); ret.cpuToLastComputeMap[cpu] = node->getSpatWeightedCompute();
} }

View File

@@ -43,7 +43,5 @@ bool TaskDCP::hasDescendent(TaskDCP* child) {
return false; return false;
} }
//TODO fare qualcosa di sensato // TODO fare qualcosa di sensato
int TaskDCP::computeWeight(GraphDCP* graph, CPU cpu) { int TaskDCP::computeWeight(GraphDCP* graph, CPU cpu) { return orig_weight; }
return orig_weight;
}

View File

@@ -75,11 +75,11 @@ public:
alst = val; alst = val;
} }
bool hasDescendent(TaskDCP* child); bool hasDescendent(TaskDCP* child);
int64_t Id() const { return (int64_t)spatWeightedCompute.getAsOpaquePointer(); } int64_t Id() const { return (int64_t) spatWeightedCompute.getAsOpaquePointer(); }
bool isCP() const { return alst == aest; } bool isCP() const { return alst == aest; }
bool isScheduled() const { return scheduledCPU.has_value(); } bool isScheduled() const { return scheduledCPU.has_value(); }
onnx_mlir::spatial::SpatWeightedCompute getSpatWeightedCompute(){return spatWeightedCompute;} onnx_mlir::spatial::SpatWeightedCompute getSpatWeightedCompute() { return spatWeightedCompute; }
friend std::optional<Edge_pair> addEdge(TaskDCP* parent, TaskDCP* child, Weight_t weight); friend std::optional<Edge_pair> addEdge(TaskDCP* parent, TaskDCP* child, Weight_t weight);
friend void removeEdge(TaskDCP* parent, TaskDCP* child); friend void removeEdge(TaskDCP* parent, TaskDCP* child);

View File

@@ -71,12 +71,7 @@ public:
return true; return true;
} }
auto begin() { auto begin() { return storage.begin(); }
return storage.begin();
}
auto end() {
return storage.end();
}
auto end() { return storage.end(); }
}; };

View File

@@ -1,7 +1,9 @@
#pragma once #pragma once
#include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/BuiltinTypeInterfaces.h"
#include "llvm/Support/Casting.h" #include "llvm/Support/Casting.h"
#include <algorithm> #include <algorithm>
#include <cstdint> #include <cstdint>
#include <utility> #include <utility>
@@ -50,10 +52,9 @@ inline int64_t getSpatWeightCompute(onnx_mlir::spatial::SpatWeightedCompute spat
int64_t tot = 0; int64_t tot = 0;
for (auto& region : spatWeightedCompute.getBody()) { for (auto& region : spatWeightedCompute.getBody()) {
for (auto& inst : region) { for (auto& inst : region) {
for(auto result : inst.getResults()){ for (auto result : inst.getResults())
if(auto element = llvm::dyn_cast<mlir::ShapedType>(result.getType())) if (auto element = llvm::dyn_cast<mlir::ShapedType>(result.getType()))
tot += onnx_mlir::getSizeInBytes(element); tot += onnx_mlir::getSizeInBytes(element);
}
} }
} }
return tot; return tot;

View File

@@ -272,6 +272,22 @@ def SpatSigmoidOp : SpatOp<"sigmoid", []> {
}]; }];
} }
// Spatial-dialect softmax over a tensor slice; pure value-semantics op that is
// later lowered to the destination-style pim.vsoftmax.
def SpatSoftmaxOp : SpatOp<"softmax", []> {
  let summary = "Softmax over the full input tensor slice";
  let arguments = (ins
    SpatTensor:$input
  );
  let results = (outs
    SpatTensor:$output
  );
  let assemblyFormat = [{
    `(` $input `)` attr-dict `:` type($input) `->` type($output)
  }];
}
def SpatReluOp : SpatOp<"relu", []> { def SpatReluOp : SpatOp<"relu", []> {
let summary = "Element-wise ReLU activation"; let summary = "Element-wise ReLU activation";

View File

@@ -202,9 +202,9 @@ private:
rewriter.clone(op, mapper); rewriter.clone(op, mapper);
} }
for (auto users : oldWeightedCompute->getUsers()) for (auto& use : llvm::make_early_inc_range(oldWeightedCompute->getUses()))
if (auto funcRet = dyn_cast<func::ReturnOp>(users)) if (isa<func::ReturnOp>(use.getOwner()))
funcRet.setOperand(0, newWeightedCompute.getResult(0)); use.assign(newWeightedCompute.getResult(0));
oldToNewComputeMap.insert({oldWeightedCompute, newWeightedCompute}); oldToNewComputeMap.insert({oldWeightedCompute, newWeightedCompute});
return {cast<SpatWeightedCompute>(newWeightedCompute), computeValueResults}; return {cast<SpatWeightedCompute>(newWeightedCompute), computeValueResults};

View File

@@ -413,7 +413,7 @@ struct ChannelBroadcastReceiveOpInterface
outputTensor, outputTensor,
rewriter.getI32IntegerAttr(numElements * elementSize), rewriter.getI32IntegerAttr(numElements * elementSize),
rewriter.getI32IntegerAttr(srcCoreId.value())) rewriter.getI32IntegerAttr(srcCoreId.value()))
.getOutput(); .getOutput();
replaceOpWithBufferizedValues(rewriter, op, newValue); replaceOpWithBufferizedValues(rewriter, op, newValue);

View File

@@ -146,6 +146,37 @@ struct RewriteHostSubviewLoadPattern final : OpRewritePattern<pim::PimMemCopyHos
} }
}; };
// Folds a subview on the host-target side of a device-to-host copy into the
// copy's explicit byte offsets, mirroring RewriteHostSubviewLoadPattern for
// the store direction. NOTE(review): the exact matching/legality logic lives
// in rewriteSubviewCopyLikeOp — presumably it fails when no foldable subview
// feeds the operands; confirm against its definition.
struct RewriteHostSubviewStorePattern final : OpRewritePattern<pim::PimMemCopyDevToHostOp> {
  using OpRewritePattern::OpRewritePattern;
  LogicalResult matchAndRewrite(pim::PimMemCopyDevToHostOp copyOp, PatternRewriter& rewriter) const override {
    auto status = rewriteSubviewCopyLikeOp(
        copyOp,
        copyOp.getHostTarget(),
        copyOp.getDeviceSource(),
        copyOp.getHostTargetOffset(),
        copyOp.getDeviceSourceOffset(),
        copyOp.getSize(),
        rewriter,
        // Callback invoked by the helper with the resolved base memrefs and
        // the combined byte offsets/size; builds the replacement copy op.
        [&](
            MemRefType resultType, Value dst, Value src, int64_t dstByteOffset, int64_t srcByteOffset, int64_t sliceBytes) {
          pim::PimMemCopyDevToHostOp::create(rewriter,
              copyOp.getLoc(),
              resultType,
              dst,
              src,
              rewriter.getI32IntegerAttr(static_cast<int32_t>(dstByteOffset)),
              rewriter.getI32IntegerAttr(static_cast<int32_t>(srcByteOffset)),
              rewriter.getI32IntegerAttr(static_cast<int32_t>(sliceBytes)));
        });
    if (failed(status))
      return failure();
    // The old copy's result is replaced by the untouched host buffer; the new
    // copy created in the callback performs the actual transfer.
    rewriter.replaceOp(copyOp, copyOp.getHostTarget());
    return success();
  }
};
struct FoldConstantCoreSubviewPattern final : OpRewritePattern<memref::SubViewOp> { struct FoldConstantCoreSubviewPattern final : OpRewritePattern<memref::SubViewOp> {
using OpRewritePattern::OpRewritePattern; using OpRewritePattern::OpRewritePattern;
@@ -209,8 +240,10 @@ struct FoldConstantCoreSubviewPattern final : OpRewritePattern<memref::SubViewOp
} // namespace } // namespace
void populateConstantFoldingSubviewPatterns(RewritePatternSet& patterns) { void populateConstantFoldingSubviewPatterns(RewritePatternSet& patterns) {
patterns.add<RewriteCoreSubviewCopyPattern, RewriteHostSubviewLoadPattern, FoldConstantCoreSubviewPattern>( patterns.add<RewriteCoreSubviewCopyPattern,
patterns.getContext()); RewriteHostSubviewLoadPattern,
RewriteHostSubviewStorePattern,
FoldConstantCoreSubviewPattern>(patterns.getContext());
} }
} // namespace onnx_mlir } // namespace onnx_mlir

View File

@@ -85,6 +85,36 @@ python3 validation/operations/gen_tests.py
| 4D | `sigmoid/4d` | [2,3,4,4] | [2,3,4,4] | Standalone NCHW Sigmoid | | 4D | `sigmoid/4d` | [2,3,4,4] | [2,3,4,4] | Standalone NCHW Sigmoid |
| After Gemm | `sigmoid/after_gemm` | [4,64] | [4,32] | Gemm + bias, then Sigmoid | | After Gemm | `sigmoid/after_gemm` | [4,64] | [4,32] | Gemm + bias, then Sigmoid |
## Softmax
| Test | Directory | Input | Output | Axis | Notes |
|--------------|--------------------------|-------------|-------------|------|---------------------------------|
| Basic | `softmax/basic` | [3,5] | [3,5] | 1 | Row-wise softmax over features |
| 3D last axis | `softmax/3d_last_axis` | [2,3,4] | [2,3,4] | 2 | Last-dimension normalization |
| Channel axis | `softmax/channel_axis` | [1,3,2,2] | [1,3,2,2] | 1 | NCHW channel-wise softmax |
## Resize
| Test | Directory | Input | Output | Mode | Notes |
|---------------------|-------------------------|-----------|-----------|---------|-----------------------------------------|
| Nearest 2x | `resize/nearest_2x` | [1,1,2,3] | [1,1,4,6] | nearest | NCHW upsampling with scales [1,1,2,2] |
| Non-uniform scales | `resize/non_uniform` | [1,1,2,3] | [1,1,6,6] | nearest | Different height/width scaling factors |
| Explicit sizes | `resize/with_sizes` | [1,1,2,3] | [1,1,3,5] | nearest | Sizes input used instead of scales |
## Split
| Test | Directory | Input | Outputs | Axis | Notes |
|-----------------|---------------------------|-------|----------------------|------|-------------------------------------|
| Basic | `split/basic` | [2,6] | [2,2], [2,4] | 1 | Two-way split with explicit sizes |
| Equal three-way | `split/equal_three_way` | [2,6] | [2,2], [2,2], [2,2] | 1 | Optional split input omitted |
## Gather
| Test | Directory | Input | Indices | Output | Axis | Notes |
|----------------------|--------------------------------|-------|---------|----------|------|--------------------------------|
| Axis 1 | `gather/axis1` | [3,4] | [2] | [3,2] | 1 | Select two columns |
| Axis 0 matrix indices| `gather/axis0_matrix_indices` | [4,3] | [2,2] | [2,2,3] | 0 | Gather rows with 2D indices |
## Add ## Add
| Test | Directory | Input(s) | Output | Notes | | Test | Directory | Input(s) | Output | Notes |

Binary file not shown.

View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Generate ONNX test models for validating GEMM, Conv, Pooling, Relu, and ReduceMean implementations.""" """Generate ONNX test models for validating supported ONNX operations."""
import numpy as np import numpy as np
import onnx import onnx
@@ -473,6 +473,140 @@ def sigmoid_after_gemm():
save_model(model, "sigmoid/after_gemm", "sigmoid_after_gemm.onnx") save_model(model, "sigmoid/after_gemm", "sigmoid_after_gemm.onnx")
# ---------------------------------------------------------------------------
# Softmax tests
# ---------------------------------------------------------------------------
def softmax_basic():
    """Row-wise Softmax (axis=1) on a [3, 5] float tensor."""
    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [3, 5])
    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [3, 5])
    softmax = helper.make_node("Softmax", inputs=["X"], outputs=["Y"], axis=1)
    graph = helper.make_graph([softmax], "softmax_basic", [x_info], [y_info])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    save_model(model, "softmax/basic", "softmax_basic.onnx")
def softmax_3d_last_axis():
    """Softmax over the trailing axis (axis=2) of a [2, 3, 4] tensor."""
    shape = [2, 3, 4]
    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, shape)
    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, shape)
    softmax = helper.make_node("Softmax", inputs=["X"], outputs=["Y"], axis=2)
    graph = helper.make_graph([softmax], "softmax_3d_last_axis", [x_info], [y_info])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    save_model(model, "softmax/3d_last_axis", "softmax_3d_last_axis.onnx")
def softmax_channel_axis():
    """Channel-wise Softmax (axis=1) on an NCHW [1, 3, 2, 2] tensor."""
    shape = [1, 3, 2, 2]
    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, shape)
    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, shape)
    softmax = helper.make_node("Softmax", inputs=["X"], outputs=["Y"], axis=1)
    graph = helper.make_graph([softmax], "softmax_channel_axis", [x_info], [y_info])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    save_model(model, "softmax/channel_axis", "softmax_channel_axis.onnx")
# ---------------------------------------------------------------------------
# Resize tests
# ---------------------------------------------------------------------------
def resize_nearest_2x():
    """Nearest-neighbor 2x upsampling of NCHW data driven by the scales input."""
    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 1, 2, 3])
    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 1, 4, 6])
    # Empty roi: only consulted in tf_crop_and_resize mode, but the input slot
    # must still be filled for opset 13.
    roi = numpy_helper.from_array(np.asarray([], dtype=np.float32), name="roi")
    scales = numpy_helper.from_array(
        np.asarray([1.0, 1.0, 2.0, 2.0], dtype=np.float32), name="scales")
    resize = helper.make_node(
        "Resize",
        inputs=["X", "roi", "scales"],
        outputs=["Y"],
        mode="nearest",
        coordinate_transformation_mode="asymmetric",
        nearest_mode="floor",
    )
    graph = helper.make_graph([resize], "resize_nearest_2x", [x_info], [y_info], initializer=[roi, scales])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    save_model(model, "resize/nearest_2x", "resize_nearest_2x.onnx")
def resize_nearest_non_uniform():
    """Nearest-neighbor Resize with different scale factors for H (3x) and W (2x)."""
    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 1, 2, 3])
    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 1, 6, 6])
    # roi is unused in "nearest" mode but the operand slot must be provided.
    roi = numpy_helper.from_array(np.asarray([], dtype=np.float32), name="roi")
    scales = numpy_helper.from_array(
        np.asarray([1.0, 1.0, 3.0, 2.0], dtype=np.float32), name="scales")
    resize = helper.make_node(
        "Resize",
        inputs=["X", "roi", "scales"],
        outputs=["Y"],
        mode="nearest",
        coordinate_transformation_mode="asymmetric",
        nearest_mode="floor",
    )
    graph = helper.make_graph(
        [resize], "resize_nearest_non_uniform", [x_info], [y_info], initializer=[roi, scales])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    save_model(model, "resize/non_uniform", "resize_non_uniform.onnx")
def resize_with_sizes():
    """Resize driven by the explicit `sizes` input; `scales` slot left empty."""
    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 1, 2, 3])
    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 1, 3, 5])
    roi = numpy_helper.from_array(np.asarray([], dtype=np.float32), name="roi")
    sizes = make_int64_initializer("sizes", [1, 1, 3, 5])
    # The empty string skips the optional `scales` operand so `sizes` governs.
    resize = helper.make_node(
        "Resize",
        inputs=["X", "roi", "", "sizes"],
        outputs=["Y"],
        mode="nearest",
        coordinate_transformation_mode="asymmetric",
        nearest_mode="floor",
    )
    graph = helper.make_graph([resize], "resize_with_sizes", [x_info], [y_info], initializer=[roi, sizes])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    save_model(model, "resize/with_sizes", "resize_with_sizes.onnx")
# ---------------------------------------------------------------------------
# Split tests
# ---------------------------------------------------------------------------
def split_basic():
    """Two-way Split of a [2, 6] tensor along axis 1 with explicit sizes [2, 4]."""
    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [2, 6])
    y0_info = helper.make_tensor_value_info("Y0", TensorProto.FLOAT, [2, 2])
    y1_info = helper.make_tensor_value_info("Y1", TensorProto.FLOAT, [2, 4])
    split_sizes = make_int64_initializer("split", [2, 4])
    split_node = helper.make_node("Split", inputs=["X", "split"], outputs=["Y0", "Y1"], axis=1)
    graph = helper.make_graph(
        [split_node], "split_basic", [x_info], [y0_info, y1_info], initializer=[split_sizes])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    save_model(model, "split/basic", "split_basic.onnx")
def split_equal_three_way():
    """Even three-way Split of a [2, 6] tensor; the optional split input is omitted."""
    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [2, 6])
    out_infos = [
        helper.make_tensor_value_info(name, TensorProto.FLOAT, [2, 2])
        for name in ("Y0", "Y1", "Y2")
    ]
    split_node = helper.make_node("Split", inputs=["X"], outputs=["Y0", "Y1", "Y2"], axis=1)
    graph = helper.make_graph([split_node], "split_equal_three_way", [x_info], out_infos)
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    save_model(model, "split/equal_three_way", "split_equal_three_way.onnx")
# ---------------------------------------------------------------------------
# Gather tests
# ---------------------------------------------------------------------------
def gather_axis1():
    """Gather columns 0 and 2 from a [3, 4] tensor, yielding [3, 2]."""
    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [3, 4])
    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [3, 2])
    indices = make_int64_initializer("indices", [0, 2])
    gather = helper.make_node("Gather", inputs=["X", "indices"], outputs=["Y"], axis=1)
    graph = helper.make_graph([gather], "gather_axis1", [x_info], [y_info], initializer=[indices])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    save_model(model, "gather/axis1", "gather_axis1.onnx")
def gather_axis0_matrix_indices():
    """Gather rows of a [4, 3] tensor with 2D indices, producing a [2, 2, 3] result."""
    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [4, 3])
    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [2, 2, 3])
    indices = make_int64_initializer("indices", [[0, 2], [3, 1]])
    gather = helper.make_node("Gather", inputs=["X", "indices"], outputs=["Y"], axis=0)
    graph = helper.make_graph(
        [gather], "gather_axis0_matrix_indices", [x_info], [y_info], initializer=[indices])
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
    save_model(model, "gather/axis0_matrix_indices", "gather_axis0_matrix_indices.onnx")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Add tests # Add tests
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -599,55 +733,6 @@ def div_after_gemm():
save_model(model, "div/after_gemm", "div_after_gemm.onnx") save_model(model, "div/after_gemm", "div_after_gemm.onnx")
# ---------------------------------------------------------------------------
# ReduceMean tests
# ---------------------------------------------------------------------------
def reducemean_basic():
"""ReduceMean over the feature dimension, preserving rank."""
X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [4, 8])
Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [4, 1])
node = helper.make_node("ReduceMean", ["X"], ["Y"], axes=[1], keepdims=1)
graph = helper.make_graph([node], "reducemean_basic", [X], [Y])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
save_model(model, "reduce_mean/basic", "reduce_mean_basic.onnx")
def reducemean_keepdims_0():
"""ReduceMean over the feature dimension, dropping the reduced axis."""
X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [4, 8])
Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [4])
node = helper.make_node("ReduceMean", ["X"], ["Y"], axes=[1], keepdims=0)
graph = helper.make_graph([node], "reducemean_keepdims_0", [X], [Y])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
save_model(model, "reduce_mean/keepdims_0", "reduce_mean_keepdims_0.onnx")
def reducemean_4d_spatial():
"""ReduceMean over H and W on an NCHW tensor."""
X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 3, 4, 4])
Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 3, 1, 1])
node = helper.make_node("ReduceMean", ["X"], ["Y"], axes=[2, 3], keepdims=1)
graph = helper.make_graph([node], "reducemean_4d_spatial", [X], [Y])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
save_model(model, "reduce_mean/4d_spatial", "reduce_mean_4d_spatial.onnx")
def reducemean_after_conv():
"""Conv followed by ReduceMean over the spatial dimensions."""
rng = np.random.default_rng(62)
X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 3, 5, 5])
Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 2, 1, 1])
W = numpy_helper.from_array(rng.uniform(-1, 1, (2, 3, 3, 3)).astype(np.float32), name="W")
B = numpy_helper.from_array(rng.uniform(-1, 1, (2,)).astype(np.float32), name="B")
conv = helper.make_node("Conv", ["X", "W", "B"], ["C"],
kernel_shape=[3, 3], strides=[1, 1], pads=[0, 0, 0, 0])
reduce = helper.make_node("ReduceMean", ["C"], ["Y"], axes=[2, 3], keepdims=1)
graph = helper.make_graph([conv, reduce], "reducemean_after_conv", [X], [Y], initializer=[W, B])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
save_model(model, "reduce_mean/after_conv", "reduce_mean_after_conv.onnx")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Main # Main
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -699,6 +784,24 @@ if __name__ == "__main__":
sigmoid_4d() sigmoid_4d()
sigmoid_after_gemm() sigmoid_after_gemm()
print("\nGenerating Split tests:")
split_basic()
split_equal_three_way()
print("\nGenerating Softmax tests:")
softmax_basic()
softmax_3d_last_axis()
softmax_channel_axis()
print("\nGenerating Resize tests:")
resize_nearest_2x()
resize_nearest_non_uniform()
resize_with_sizes()
print("\nGenerating Gather tests:")
gather_axis1()
gather_axis0_matrix_indices()
print("\nGenerating Add tests:") print("\nGenerating Add tests:")
add_basic() add_basic()
add_broadcast_row() add_broadcast_row()

Binary file not shown.

Binary file not shown.