Skip to content

Commit

Permalink
Generate non-coherent cache loads and noalias for CUDA (iree-org#…
Browse files Browse the repository at this point in the history
…11494)

IREE can recognize `readonly` data. In addition, the data buffers are
guaranteed not to alias with other data in the same region/kernel.

This PR takes advantage of this information to generate the
`llvm.noalias` and `llvm.readonly` attributes on CUDA kernel function
parameters. If an argument is not readonly, it is still marked `llvm.noalias`
because the ranges used within the bindings are guaranteed not to alias.

This is crucial for two reasons. First, when the downstream compiler sees
these attributes, it can better schedule the load instructions. It typically
schedules them together, which utilizes bandwidth better. Second, it
generates 'non-coherent cache loads' (the `.nc` qualifier on the `ld.global`
instruction in PTX). The PTX documentation describes them as follows (this
part gives room for experimentation):

```
The texture cache is larger, has higher bandwidth, and longer latency than the global memory cache.
For applications with sufficient parallelism to cover the longer latency, ld.global.nc should offer 
better performance than ld.global.
```

This PR first extends `hal.interface.binding.subspan` with an optional
`flags` argument, which can mark a binding as `ReadOnly`.
```
hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c0) alignment(64) flags(ReadOnly) : memref<16xf32>
```

In the convert-to-NVVM pass, `flags(ReadOnly)` is used to decide whether to
set the `llvm.readonly` attribute on the function argument. The pass also sets
`llvm.noalias` on every buffer argument.

```
llvm.func @foo(%arg0: !llvm.ptr<f32> {llvm.align = 16 : i32, llvm.noalias, llvm.readonly})
```
  • Loading branch information
grypp authored Jan 19, 2023
1 parent 3682a4c commit a922768
Show file tree
Hide file tree
Showing 14 changed files with 136 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ struct FoldReshapeIntoInterfaceTensorLoad : OpRewritePattern<TensorReshapeOp> {
subspanOp.getLoc(), newSubspanType, subspanOp.getSet(),
subspanOp.getBinding(), subspanOp.getDescriptorType(),
subspanOp.getByteOffset(), subspanOp.getDynamicDims(),
subspanOp.getAlignmentAttr());
subspanOp.getAlignmentAttr(), subspanOp.getDescriptorFlagsAttr());

rewriter.replaceOpWithNewOp<IREE::Flow::DispatchTensorLoadOp>(
reshapeOp, reshapeOp.getResultType(), newSubspanOp,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ struct FlattenBindingSubspan final
auto newOp = rewriter.create<IREE::HAL::InterfaceBindingSubspanOp>(
subspanOp.getLoc(), newType, subspanOp.getSet(), subspanOp.getBinding(),
subspanOp.getDescriptorType(), subspanOp.getByteOffset(), dynamicDims,
subspanOp.getAlignmentAttr());
subspanOp.getAlignmentAttr(), subspanOp.getDescriptorFlagsAttr());
if (isRankOneMemRef(oldType)) {
rewriter.replaceOpWithNewOp<memref::CastOp>(subspanOp, oldType, newOp);
} else {
Expand Down Expand Up @@ -661,7 +661,7 @@ struct FoldSubspanOffsetIntoLoadStore final : public OpRewritePattern<OpType> {
Value newSubspan = rewriter.create<IREE::HAL::InterfaceBindingSubspanOp>(
memref.getLoc(), subspanOp.getType(), subspanOp.getSet(),
subspanOp.getBinding(), subspanOp.getDescriptorType(), zero,
subspanOp.getDynamicDims(), subspanOp.getAlignmentAttr());
subspanOp.getDynamicDims(), subspanOp.getAlignmentAttr(), nullptr);
rewriter.restoreInsertionPoint(ip);

MLIRContext *context = rewriter.getContext();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ struct MaterializeInterfaceBindingEncoding
rewriter.replaceOpWithNewOp<IREE::HAL::InterfaceBindingSubspanOp>(
subspanOp, newResultType, subspanOp.getSet(), subspanOp.getBinding(),
subspanOp.getDescriptorType(), subspanOp.getByteOffset(),
convertedDynamicDims.value(), subspanOp.getAlignmentAttr());
convertedDynamicDims.value(), subspanOp.getAlignmentAttr(),
subspanOp.getDescriptorFlagsAttr());
return success();
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ static Value findOrCreateSubspanBuffer(
subspanOp->getLoc(), memRefType, subspanOp.getSet(),
subspanOp.getBinding(), subspanOp.getDescriptorType(),
subspanOp.getByteOffset(), subspanOp.getDynamicDims(),
subspanOp.getAlignmentAttr());
subspanOp.getAlignmentAttr(), subspanOp.getDescriptorFlagsAttr());
if (subspanOp.getAlignment()) {
b.create<memref::AssumeAlignmentOp>(
subspanOp->getLoc(), buffer, subspanOp.getAlignment()->getZExtValue());
Expand Down
32 changes: 32 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,26 @@ class ConvertIREEBindingSubspanOp : public ConvertToLLVMPattern {
: ConvertToLLVMPattern(
IREE::HAL::InterfaceBindingSubspanOp::getOperationName(), context,
converter) {}

/// Returns true only when `llvmFuncOp` contains at least one
/// `hal.interface.binding.subspan` op whose binding ordinal equals `binding`
/// and every such op carries the `ReadOnly` descriptor flag. Returns false
/// when no subspan op uses `binding`.
///
/// NOTE(review): only the binding ordinal is compared, not the descriptor
/// set, so subspans from different sets that share an ordinal are treated as
/// the same binding here - confirm this is intended.
static bool checkAllSubspansReadonly(LLVM::LLVMFuncOp llvmFuncOp,
                                     APInt binding) {
  bool sawMatchingSubspan = false;
  WalkResult walkResult =
      llvmFuncOp.walk([&](IREE::HAL::InterfaceBindingSubspanOp subspan) {
        // Subspans of other bindings have no say in the answer.
        if (subspan.getBinding() != binding) return WalkResult::advance();

        // A missing flags attribute is treated the same as `None`.
        IREE::HAL::DescriptorFlags flags =
            subspan.getDescriptorFlags().value_or(
                IREE::HAL::DescriptorFlags::None);
        // One writable view of the binding is enough to reject it; stop early.
        if (!bitEnumContainsAny(flags, IREE::HAL::DescriptorFlags::ReadOnly))
          return WalkResult::interrupt();

        sawMatchingSubspan = true;
        return WalkResult::advance();
      });
  // An interrupted walk means a non-readonly subspan was encountered.
  return sawMatchingSubspan && !walkResult.wasInterrupted();
}

LogicalResult matchAndRewrite(
Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
Expand All @@ -324,6 +344,18 @@ class ConvertIREEBindingSubspanOp : public ConvertToLLVMPattern {
llvmFuncOp.setArgAttr(llvmBufferArg.getArgNumber(),
LLVM::LLVMDialect::getAlignAttrName(),
rewriter.getI32IntegerAttr(16));
// It is safe to set the noalias attribute as it is guaranteed that the
// ranges within bindings won't alias.
llvmFuncOp.setArgAttr(llvmBufferArg.getArgNumber(),
LLVM::LLVMDialect::getNoAliasAttrName(),
rewriter.getUnitAttr());
if (checkAllSubspansReadonly(llvmFuncOp, subspanOp.getBinding())) {
// Setting the readonly attribute here will generate non-coherent cache
// loads.
llvmFuncOp.setArgAttr(llvmBufferArg.getArgNumber(),
LLVM::LLVMDialect::getReadonlyAttrName(),
rewriter.getUnitAttr());
}
// Add the byte offset.
Value llvmBufferBasei8Ptr = rewriter.create<LLVM::BitcastOp>(
loc,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ hal.executable @abs_ex_dispatch_0 {
func.func @abs_ex_dispatch_0() {
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
%0 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) offset(%c128) : memref<16xf32>
%0 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) offset(%c128) flags(ReadOnly) : memref<16xf32>
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<16xi32>
%2 = hal.interface.binding.subspan set(1) binding(2) type(storage_buffer) : memref<16xf32>
%3 = gpu.block_id x
Expand All @@ -36,9 +36,9 @@ hal.executable @abs_ex_dispatch_0 {
}
}
// CHECK-LABEL: llvm.func @abs_ex_dispatch_0
// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr<i32> {llvm.align = 16 : i32},
// CHECK-SAME: %[[ARG1:.+]]: !llvm.ptr<f32> {llvm.align = 16 : i32},
// CHECK-SAME: %{{.*}}: !llvm.ptr<f32> {llvm.align = 16 : i32})
// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr<i32> {llvm.align = 16 : i32, llvm.noalias},
// CHECK-SAME: %[[ARG1:.+]]: !llvm.ptr<f32> {llvm.align = 16 : i32, llvm.noalias, llvm.readonly},
// CHECK-SAME: %{{.*}}: !llvm.ptr<f32> {llvm.align = 16 : i32, llvm.noalias})
// CHECK: %[[C128:.+]] = llvm.mlir.constant(128 : index) : i64
// CHECK: %[[PTRI8:.+]] = llvm.bitcast %[[ARG1]] : !llvm.ptr<f32> to !llvm.ptr<i8>
// CHECK: %[[OFF:.+]] = llvm.getelementptr %[[PTRI8]][%[[C128]]] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
Expand Down Expand Up @@ -86,8 +86,8 @@ hal.executable @abs_dynamic {
}
}
// CHECK-LABEL: llvm.func @abs_dynamic
// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr<i32> {llvm.align = 16 : i32},
// CHECK-SAME: %[[ARG1:.+]]: !llvm.ptr<f32> {llvm.align = 16 : i32}, %[[ARG2:.+]]: !llvm.ptr<f32> {llvm.align = 16 : i32},
// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr<i32> {llvm.align = 16 : i32, llvm.noalias},
// CHECK-SAME: %[[ARG1:.+]]: !llvm.ptr<f32> {llvm.align = 16 : i32, llvm.noalias}, %[[ARG2:.+]]: !llvm.ptr<f32> {llvm.align = 16 : i32, llvm.noalias},
// CHECK-SAME: %[[ARG3:.+]]: i32, %[[ARG4:.+]]: i32)
// CHECK: %[[C128:.+]] = llvm.mlir.constant(128 : index) : i64
// CHECK: %{{.*}} = llvm.zext %[[ARG4]] : i32 to i64
Expand Down Expand Up @@ -135,8 +135,8 @@ hal.executable @dead_symbol {
}
}
// CHECK-LABEL: llvm.func @dead_symbol
// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr<i32> {llvm.align = 16 : i32},
// CHECK-SAME: %[[ARG1:.+]]: !llvm.ptr<f32> {llvm.align = 16 : i32})
// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr<i32> {llvm.align = 16 : i32, llvm.noalias},
// CHECK-SAME: %[[ARG1:.+]]: !llvm.ptr<f32> {llvm.align = 16 : i32, llvm.noalias})
// CHECK: llvm.fadd

// -----
Expand Down Expand Up @@ -176,8 +176,8 @@ hal.executable @mixed_type {
}

// CHECK-LABEL: llvm.func @mixed_type
// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr<i32> {llvm.align = 16 : i32},
// CHECK-SAME: %{{.*}}: !llvm.ptr<f32> {llvm.align = 16 : i32})
// CHECK-SAME: (%[[ARG0:.+]]: !llvm.ptr<i32> {llvm.align = 16 : i32, llvm.noalias},
// CHECK-SAME: %{{.*}}: !llvm.ptr<f32> {llvm.align = 16 : i32, llvm.noalias})
// CHECK: %[[C128:.+]] = llvm.mlir.constant(128 : index) : i64
// CHECK: %[[PTRI8:.+]] = llvm.bitcast %[[ARG0]] : !llvm.ptr<i32> to !llvm.ptr<i8>
// CHECK: %[[OFF:.+]] = llvm.getelementptr %[[PTRI8]][%[[C128]]] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
Expand Down Expand Up @@ -292,3 +292,48 @@ hal.executable @shared_memory_lowering_aligned_alloc {
// CHECK-NEXT: %{{.*}} = llvm.mlir.constant(4 : i64) : i64
// CHECK-NEXT: %{{.*}} = llvm.getelementptr %{{.*}} : (!llvm.ptr<array<0 x i8>, 3>, i64, i64) -> !llvm.ptr<array<0 x i8>, 3>
// CHECK-NEXT: %{{.*}} = llvm.bitcast %{{.*}} : !llvm.ptr<array<0 x i8>, 3> to !llvm.ptr<array<32 x f32>, 3>


// -----

// Negative test for `llvm.readonly`: the CHECK-NOT that follows this
// executable expects that the i32 pointer argument is not emitted with
// `llvm.noalias, llvm.readonly`.
//
// NOTE(review): the layout below declares bindings 0 and 4 in set 0 and
// binding 2 in set 1, while the function body uses set(0) bindings 0, 1, 2 and
// set(1) binding 3 - confirm the mismatch is intentional.
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,
#hal.descriptor_set.binding<4, storage_buffer>
]>,
#hal.descriptor_set.layout<1, bindings = [
#hal.descriptor_set.binding<2, storage_buffer>
]>
]>
hal.executable @check_not_readonly {
hal.executable.variant @cuda, target = <"cuda", "cuda-nvptx-fb"> {
hal.executable.export @check_not_readonly layout(#pipeline_layout)
builtin.module {
func.func @check_not_readonly() {
%c0 = arith.constant 0 : index
%c128 = arith.constant 128 : index
// binding(0): one subspan without flags and one with flags(ReadOnly).
%1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<16xi32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c128) flags(ReadOnly) : memref<16xf32>
// binding(1): same mix in the opposite order (ReadOnly first, then no flags).
// Neither %b11 nor %b12 is used below.
%b11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) flags(ReadOnly) : memref<16xi32>
%b12 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c128) : memref<16xf32>
// binding(2): both subspans carry flags(ReadOnly). Neither %b21 nor %b22 is
// used below.
%b21 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) flags(ReadOnly) : memref<16xi32>
%b22 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c128) flags(ReadOnly) : memref<16xf32>
// Destination buffer for the store; declared without flags.
%2 = hal.interface.binding.subspan set(1) binding(3) type(storage_buffer) : memref<16xf32>
%3 = gpu.block_id x
%4 = gpu.block_dim x
%5 = gpu.thread_id x
// Linear index: block_id * block_dim + thread_id.
%6 = arith.muli %3, %4 : index
%7 = arith.addi %6, %5 : index
// Load through both views of binding(0), add them as f32, and store to %2.
%9 = memref.load %0[%7] : memref<16xf32>
%10 = memref.load %1[%7] : memref<16xi32>
%11 = arith.sitofp %10 : i32 to f32
%12 = arith.addf %9, %11 : f32
memref.store %12, %2[%7] : memref<16xf32>
return
}
}
}
}
// CHECK-LABEL: llvm.func @check_not_readonly
// CHECK-NOT: (%[[ARG0:.+]]: !llvm.ptr<i32> {llvm.align = 16 : i32, llvm.noalias, llvm.readonly},

Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ hal.executable @abs_ex_dispatch_0 {
builtin.module {
func.func @abs_ex_dispatch_0() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : memref<16xf32>
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) flags(ReadOnly) : memref<16xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : memref<16xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : memref<16xf32>
%3 = gpu.block_id x
Expand All @@ -34,7 +34,7 @@ hal.executable @abs_ex_dispatch_0 {
}
}
// CHECK-LABEL: llvm.func @abs_ex_dispatch_0
// CHECK-SAME: (%{{.*}}: !llvm.ptr<f32> {llvm.align = 16 : i32}, %{{.*}}: !llvm.ptr<f32> {llvm.align = 16 : i32},
// CHECK-SAME: %{{.*}}: !llvm.ptr<f32> {llvm.align = 16 : i32})
// CHECK-SAME: (%{{.*}}: !llvm.ptr<f32> {llvm.align = 16 : i32, llvm.noalias, llvm.readonly}, %{{.*}}: !llvm.ptr<f32> {llvm.align = 16 : i32, llvm.noalias},
// CHECK-SAME: %{{.*}}: !llvm.ptr<f32> {llvm.align = 16 : i32, llvm.noalias})
// CHECK: rocdl.workgroup.dim.x
// CHECK: llvm.fadd
3 changes: 2 additions & 1 deletion compiler/src/iree/compiler/Codegen/SPIRV/SPIRVEmulateI64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ struct ConvertHalInterfaceBindingSubspan final
rewriter.replaceOpWithNewOp<IREE::HAL::InterfaceBindingSubspanOp>(
op, newResultTy, adaptor.getSet(), adaptor.getBinding(),
adaptor.getDescriptorType(), adaptor.getByteOffset(),
adaptor.getDynamicDims(), adaptor.getAlignmentAttr());
adaptor.getDynamicDims(), adaptor.getAlignmentAttr(),
adaptor.getDescriptorFlagsAttr());
LLVM_DEBUG(llvm::dbgs()
<< "WideIntegerEmulation: new op: " << newOp << "\n");
(void)newOp;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,8 @@ class ProcessInterfaceBindingSubspan final
rewriter.replaceOpWithNewOp<IREE::HAL::InterfaceBindingSubspanOp>(
subspanOp, *vecMemRef, subspanOp.getSet(), subspanOp.getBinding(),
subspanOp.getDescriptorType(), subspanOp.getByteOffset(),
subspanOp.getDynamicDims(), subspanOp.getAlignmentAttr());
subspanOp.getDynamicDims(), subspanOp.getAlignmentAttr(),
subspanOp.getDescriptorFlagsAttr());
return success();
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ class WGSLReplacePushConstantsPass
/*set=*/APInt(64, IREE_HAL_WEBGPU_PARAMS_BIND_GROUP_INDEX),
/*binding=*/APInt(64, IREE_HAL_WEBGPU_PARAMS_BINDING_INDEX),
IREE::HAL::DescriptorType::UniformBuffer,
/*byte_offset=*/maxConstantValue, dynamicDims, alignmentAttr);
/*byte_offset=*/maxConstantValue, dynamicDims, alignmentAttr, nullptr);

// flow.dispatch.tensor.load -> tensor<Nxvector<4xi32>>
auto tensorType =
Expand Down
13 changes: 13 additions & 0 deletions compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,19 @@ void ExecutableLookupOp::getAsmResultNames(
//===----------------------------------------------------------------------===//
// hal.interface.binding.subspan
//===----------------------------------------------------------------------===//
/// Convenience builder that accepts the descriptor flags as an optional enum
/// value rather than an attribute. When `flags` holds a value it is wrapped in
/// a `DescriptorFlagsAttr`; otherwise a default-constructed (null) attribute
/// is forwarded, leaving the optional `descriptor_flags` attribute unset.
void InterfaceBindingSubspanOp::build(
    OpBuilder &builder, OperationState &result, Type resultType, APInt set,
    APInt binding, IREE::HAL::DescriptorType descriptor_type, Value byte_offset,
    ValueRange dynamic_dims, IntegerAttr alignment,
    Optional<DescriptorFlags> flags) {
  IREE::HAL::DescriptorFlagsAttr flagsAttr =
      flags.has_value() ? IREE::HAL::DescriptorFlagsAttr::get(
                              builder.getContext(), flags.value())
                        : IREE::HAL::DescriptorFlagsAttr();
  // Delegate to the builder overload that takes the flags as an attribute.
  build(builder, result, resultType, set, binding, descriptor_type, byte_offset,
        dynamic_dims, alignment, flagsAttr);
}

LogicalResult InterfaceBindingSubspanOp::verify() {
InterfaceBindingSubspanOp op = *this;
Expand Down
19 changes: 18 additions & 1 deletion compiler/src/iree/compiler/Dialect/HAL/IR/HALOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -2191,8 +2191,24 @@ def HAL_InterfaceBindingSubspanOp : HAL_Op<"interface.binding.subspan", [
HAL_DescriptorTypeAttr:$descriptor_type,
Optional<HAL_DeviceSize>:$byte_offset,
HAL_ShapeDynamicDims:$dynamic_dims,
OptionalAttr<IndexAttr>:$alignment
OptionalAttr<IndexAttr>:$alignment,
OptionalAttr<HAL_DescriptorFlagsAttr>:$descriptor_flags
);

let builders = [
OpBuilder<(ins
"Type":$resultType,
"APInt":$set,
"APInt":$binding,
"IREE::HAL::DescriptorType":$descriptor_type,
"Value":$byte_offset,
"ValueRange":$dynamic_dims,
"IntegerAttr":$alignment,
CArg<"mlir::Optional<DescriptorFlags>", "llvm::None">:$flags
)>,
];


let results = (outs
Res<AnyType, "", [MemAlloc]>:$result
);
Expand All @@ -2203,6 +2219,7 @@ def HAL_InterfaceBindingSubspanOp : HAL_Op<"interface.binding.subspan", [
`type` `(` custom<DescriptorType>($descriptor_type) `)`
(`offset` `(` $byte_offset^ `)`)?
(`alignment` `(` $alignment^ `)`)?
(`flags` `(` $descriptor_flags^ `)`)?
attr-dict `:` type($result) (`{` $dynamic_dims^ `}`)?
}];

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,8 @@ static void convertBindingUsage(
auto newOp = builder.create<IREE::HAL::InterfaceBindingSubspanOp>(
oldOp.getLoc(), oldOp.getType(), APInt(64, setLayoutAttr.getOrdinal()),
APInt(64, bindingAttr.getOrdinal()), bindingAttr.getType(),
oldOp.getByteOffset(), oldOp.getDynamicDims(), alignmentAttr);
oldOp.getByteOffset(), oldOp.getDynamicDims(), alignmentAttr,
bindingAttr.getFlags());
oldOp.replaceAllUsesWith(newOp.getResult());
oldOp.erase();
}
Expand Down
4 changes: 2 additions & 2 deletions tests/transform_dialect/cpu/matmul.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ func.func @matmul_static(
// CODEGEN-CUSTOM-DISPATCH-FORMATION: builtin.module {
// CODEGEN-CUSTOM-DISPATCH-FORMATION: func.func @matmul_static_dispatch_0_matmul_3x3x5() {
// CODEGEN-CUSTOM-DISPATCH-FORMATION: arith.constant 0 : index
// CODEGEN-CUSTOM-DISPATCH-FORMATION: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset({{.*}}) alignment(64) : memref<3x5xf32>
// CODEGEN-CUSTOM-DISPATCH-FORMATION: hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset({{.*}}) alignment(64) flags(ReadOnly) : memref<3x5xf32>
// CODEGEN-CUSTOM-DISPATCH-FORMATION: memref.assume_alignment %{{.*}}, 64 : memref<3x5xf32>
// CODEGEN-CUSTOM-DISPATCH-FORMATION: hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset({{.*}}) alignment(64) : memref<5x3xf32>
// CODEGEN-CUSTOM-DISPATCH-FORMATION: hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset({{.*}}) alignment(64) flags(ReadOnly) : memref<5x3xf32>
// CODEGEN-CUSTOM-DISPATCH-FORMATION: memref.assume_alignment %{{.*}}, 64 : memref<5x3xf32>
// CODEGEN-CUSTOM-DISPATCH-FORMATION: hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset({{.*}}) alignment(64) : memref<3x3xf32>
// CODEGEN-CUSTOM-DISPATCH-FORMATION: memref.assume_alignment %{{.*}}, 64 : memref<3x3xf32>
Expand Down

0 comments on commit a922768

Please sign in to comment.