[GPU] Enable implicit concat batch1 in oneDNN. (openvinotoolkit#9424)
* [GPU] Enable implicit concat batch1 in oneDNN.

* Use the gpu_usm memory offset to enable implicit concat for batch 1 in oneDNN.
  An optimized-out node does not always have a mutable input, so check whether a
  mutable input exists anywhere among an optimized-out node's dependencies.
* Check the use_usm condition before allowing implicit concat.
* Add the condition for implicit concat.
* An implicit concat's dependency must not carry a fused eltwise sum post-op.
* Buffer reuse is required for the oneDNN sum post-operation, and output padding
  made that buffer reuse fail.

Signed-off-by: hyunback <[email protected]>
hyunback authored Jan 6, 2022
1 parent e89db1c commit 89f48e0
Showing 10 changed files with 156 additions and 32 deletions.
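Before the per-file diffs, here is a standalone sketch of the idea behind the first two bullets above: when several oneDNN primitives write directly into one concatenation buffer, each input is described with feature-axis padding, and that padding is turned into a byte offset applied to the shared USM pointer. The snippet below is illustrative only (the dims, the fp16 element size, and the helper name byte_offset_for_feature_concat are assumptions, not code from this commit); it mirrors the arithmetic of the new onednn::get_offset() helper shown later in utils.cpp.

// Minimal, self-contained sketch (not part of the commit). Computes the byte
// offset at which a feature-concatenated input starts inside a shared
// batch-1 output buffer, the same way get_offset() derives it from the
// descriptor's padded offsets.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t byte_offset_for_feature_concat(const std::vector<int64_t>& padded_dims, // {b, f_total, y, x}
                                        int64_t feature_offset,                  // features written by earlier inputs
                                        int64_t dtype_size) {                    // bytes per element
    // Skip feature_offset features, each spanning the faster-varying spatial dims.
    int64_t elements = feature_offset;
    for (size_t i = 2; i < padded_dims.size(); ++i)
        elements *= padded_dims[i];
    return elements * dtype_size;
}

int main() {
    // Hypothetical case: two 32-channel fp16 inputs concatenated along feature
    // into a 1x64x14x14 buffer.
    std::vector<int64_t> padded_dims = {1, 64, 14, 14};
    std::cout << byte_offset_for_feature_concat(padded_dims, 0, 2) << "\n";  // first input starts at byte 0
    std::cout << byte_offset_for_feature_concat(padded_dims, 32, 2) << "\n"; // second input starts at byte 12544
    return 0;
}

On the gpu_usm path this offset is simply added to the raw USM pointer before the dnnl::memory object is created, which is what the ocl_memory.cpp change at the end of this diff does.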
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp
@@ -68,7 +68,7 @@ struct memory {
virtual event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */) = 0;

#ifdef ENABLE_ONEDNN_FOR_GPU
virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */) {
virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) {
throw std::runtime_error("[CLDNN] Can't convert memory object to onednn");
}
#endif
@@ -51,7 +51,6 @@ void add_required_reorders::run(program& p) {
continue; // only nodes with dependencies
if (usr->is_type<data>())
continue;

if (usr->type()->does_an_implementation_exist(*usr)) {
if (usr->get_preferred_impl_type() != impl_types::onednn) {
continue;
@@ -62,17 +61,40 @@ void add_required_reorders::run(program& p) {
if (!input.is_in_data_flow() || input.is_constant())
continue;

if (static_cast<bool>(input.get_output_layout().data_padding)) {
cldnn::layout layout_wo_padding = input.get_output_layout();
layout_wo_padding.data_padding = cldnn::padding{};
auto new_reorder = std::make_shared<reorder>(input.id() + "_padding_reorder_" + usr->id(), input.id(), layout_wo_padding);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, i);
auto in_padding = input.get_output_layout().data_padding;
if (static_cast<bool>(in_padding)) {
bool spatial_padding = false;
for (size_t i = 0; i < in_padding.lower_size().spatial.size(); ++i) {
spatial_padding |= (in_padding.lower_size().spatial[i] != 0);
}
for (size_t i = 0; i < in_padding.upper_size().spatial.size(); ++i) {
spatial_padding |= (in_padding.upper_size().spatial[i] != 0);
}
bool batch_padding = false;
for (size_t i = 0; i < in_padding.lower_size().batch.size(); ++i) {
batch_padding |= (in_padding.lower_size().batch[i] != 0);
}
for (size_t i = 0; i < in_padding.upper_size().batch.size(); ++i) {
batch_padding |= (in_padding.upper_size().batch[i] != 0);
}
if (spatial_padding || batch_padding) {
cldnn::layout layout_padding = input.get_output_layout();
cldnn::layout layout_wo_padding = input.get_output_layout();
layout_wo_padding.data_padding = cldnn::padding{};
layout_wo_padding.data_padding.lower_size().feature = layout_padding.data_padding.lower_size().feature;
layout_wo_padding.data_padding.upper_size().feature = layout_padding.data_padding.upper_size().feature;
auto new_reorder = std::make_shared<reorder>(input.id() + "_padding_reorder_" + usr->id(), input.id(), layout_wo_padding);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, i);
} else {
continue;
}
}
}
continue;
}
}

bool correct_layout_selected = false;
bool weights_data = (usr->is_type<convolution>() || usr->is_type<deconvolution>() ||
usr->is_type<deformable_conv>() || usr->is_type<fully_connected>());
@@ -87,17 +87,42 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
if (node.has_fused_primitives() || !node.get_fused_activations_funcs().empty())
return false;

// For in place concatenation input layouts and data types must match.
auto output_format = node.get_output_layout().format;
auto output_datatype = node.get_output_layout().data_type;
auto concat_axis = node.get_primitive()->axis;
bool is_onednn_impl = false;

// oneDNN doesn't support paddings and such concat optimizations
for (auto& input : node.get_dependencies()) {
if (input->get_preferred_impl_type() == impl_types::onednn)
if (input->get_preferred_impl_type() == impl_types::onednn) {
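// Note: a oneDNN dependency fused with an eltwise sum post-op relies on reusing its
// input buffer in place; the output padding this in-place concat would introduce
// breaks that reuse (see the last commit-message bullet), so such cases are rejected.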
for (auto& fused_op : input->get_fused_primitives()) {
if (fused_op.node->is_type<eltwise>() && fused_op.deps.size() == 1) {
auto& eltw_in = input->get_dependency(fused_op.dep_start_idx);
auto eltw_in_layout = eltw_in.get_output_layout();
auto out_layout = input->get_output_layout();

if (!fused_op.node->as<eltwise>().get_primitive()->needs_onednn_sum_post_op(eltw_in_layout))
continue;
if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout))
return false;
}
}
is_onednn_impl = true;
}
}

// Implicit concat for onednn only when use_usm and batch 1.
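// Note: the byte offset is applied only on the USM path (gpu_usm::get_onednn_memory
// later in this diff); batch > 1 is excluded, presumably because a feature-concatenated
// input would then no longer be a single contiguous block addressable by one offset.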
if (is_onednn_impl) {
bool use_usm = node.get_program().get_engine().use_unified_shared_memory();
layout out_l = node.get_output_layout();

if (!use_usm)
return false;
if (out_l.size.batch[0] > 1)
return false;
}

// For in place concatenation input layouts and data types must match.
auto output_format = node.get_output_layout().format;
auto output_datatype = node.get_output_layout().data_type;
auto concat_axis = node.get_primitive()->axis;

for (auto& input : node.get_dependencies()) {
if (input->is_type<reshape>())
// reshapes should be optimized out.
@@ -156,12 +156,14 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {

{
auto& input = instance.input_memory(0);
args.insert({DNNL_ARG_SRC, input.get_onednn_memory(_pd.dnnl::primitive_desc_base::src_desc(0))});
auto offset = onednn::get_offset(_pd.dnnl::primitive_desc_base::src_desc(0));
args.insert({DNNL_ARG_SRC, input.get_onednn_memory(_pd.dnnl::primitive_desc_base::src_desc(0), offset)});
}

{
auto& output = instance.output_memory();
args.insert({DNNL_ARG_DST, output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(0))});
auto offset = onednn::get_offset(_pd.dnnl::primitive_desc_base::dst_desc(0));
args.insert({DNNL_ARG_DST, output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(0), offset)});
}

configure_post_ops_arguments(instance, args);
@@ -200,7 +202,9 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
event = stream.create_user_event(false);
}

_prim.execute(stream.get_onednn_stream(), _args[net_id]);
if (!instance.can_be_optimized()) {
_prim.execute(stream.get_onednn_stream(), _args[net_id]);
}

if (profiling) {
stream.finish();
31 changes: 31 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp
@@ -108,6 +108,37 @@ void combine_bf_with_first_spatial_dim(cldnn::layout& l) {
l.size.spatial[last_spatial_dim_idx] = 1;
}

int64_t get_offset(dnnl::memory::desc desc) {
int64_t offset = 0;
int32_t padded_idx = -1;
for (int32_t i = 0; i < DNNL_MAX_NDIMS; ++i) {
if (desc.data.padded_offsets[i] > 0) {
padded_idx = i;
break;
}
}
if (padded_idx > -1) {
if (padded_idx != 1)
throw std::runtime_error(std::string("onednn only support feature padding. Unsupported padded_idx: ") + std::to_string(padded_idx));
offset = desc.data.padded_offsets[padded_idx];
for (int32_t i = padded_idx + 1; i < desc.data.ndims; ++i) {
offset *= desc.data.padded_dims[i];
}
}
switch (desc.data.data_type) {
case dnnl_data_type_t::dnnl_s8:
case dnnl_data_type_t::dnnl_u8:
return offset;
case dnnl_data_type_t::dnnl_f16:
case dnnl_data_type_t::dnnl_bf16:
return (offset * 2);
case dnnl_data_type_t::dnnl_f32:
case dnnl_data_type_t::dnnl_s32:
return (offset * 4);
default: throw std::runtime_error(std::string("Unsupported offset for dnnl_data_type_t ") + dnnl_dt2str(desc.data.data_type));
}
}
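// Worked example for get_offset above (hypothetical values, not from the commit):
// an f16 tensor with ndims = 4, padded_offsets = {0, 32, 0, 0} and
// padded_dims = {1, 64, 14, 14} gives padded_idx = 1,
// offset = 32 * 14 * 14 = 6272 elements, returned as 6272 * 2 = 12544 bytes.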

dnnl::memory::desc layout_to_memory_desc(cldnn::layout l, dnnl::memory::format_tag target_fmt, bool flatten) {
dnnl::memory::dims dims;
dnnl::memory::dims padded_dims;
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp
@@ -34,6 +34,8 @@ dnnl::algorithm convert_activation_func(cldnn::activation_func func);
// onednn -> cldnn
cldnn::format convert_format(dnnl::memory::format_tag fmt, bool is_grouped = false);

int64_t get_offset(dnnl::memory::desc desc);

// If the values in the tensor are identical, make it as per-tensor value
template <typename T>
void make_per_tensor_if_possible(cldnn::data_node& node);
38 changes: 32 additions & 6 deletions src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -1178,13 +1178,39 @@ bool layout_optimizer::are_data_types_suitable_for_onednn(program_node& node) {
}

bool layout_optimizer::are_layouts_suitable_for_onednn(program_node& node) {
auto in_layout = node.get_dependencies().front()->get_output_layout();
auto out_layout = node.get_output_layout();
auto in_padding = node.get_dependencies().front()->get_output_layout().data_padding;
auto out_padding = node.get_output_layout().data_padding;
// Check if padding exists
if (node.get_preferred_impl_type() == impl_types::onednn && (in_layout.data_padding || out_layout.data_padding))
return false;
else
return true;
if (node.get_preferred_impl_type() == impl_types::onednn && (in_padding || out_padding)) {
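// Only spatial or batch padding makes a layout unsuitable for oneDNN here;
// feature-axis padding is tolerated because it is expressed through the
// oneDNN memory offset (see onednn::get_offset in this diff).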
bool no_spatial_padding = true;
for (size_t i = 0; i < in_padding.lower_size().spatial.size(); ++i) {
no_spatial_padding &= (in_padding.lower_size().spatial[i] == 0);
}
for (size_t i = 0; i < in_padding.upper_size().spatial.size(); ++i) {
no_spatial_padding &= (in_padding.upper_size().spatial[i] == 0);
}
for (size_t i = 0; i < out_padding.lower_size().spatial.size(); ++i) {
no_spatial_padding &= (out_padding.lower_size().spatial[i] == 0);
}
for (size_t i = 0; i < out_padding.upper_size().spatial.size(); ++i) {
no_spatial_padding &= (out_padding.upper_size().spatial[i] == 0);
}
bool no_batch_padding = true;
for (size_t i = 0; i < in_padding.lower_size().batch.size(); ++i) {
no_batch_padding &= (in_padding.lower_size().batch[i] == 0);
}
for (size_t i = 0; i < in_padding.upper_size().batch.size(); ++i) {
no_batch_padding &= (in_padding.upper_size().batch[i] == 0);
}
for (size_t i = 0; i < out_padding.lower_size().batch.size(); ++i) {
no_batch_padding &= (out_padding.lower_size().batch[i] == 0);
}
for (size_t i = 0; i < out_padding.upper_size().batch.size(); ++i) {
no_batch_padding &= (out_padding.upper_size().batch[i] == 0);
}
return (no_spatial_padding && no_batch_padding);
}
return true;
}

impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_format) {
21 changes: 17 additions & 4 deletions src/plugins/intel_gpu/src/graph/network.cpp
@@ -36,6 +36,7 @@
#include <set>
#include <utility>
#include <map>
#include <functional>

#ifdef GPU_DEBUG_CONFIG
#include <iomanip>
@@ -835,11 +836,23 @@ void network::allocate_primitive_instance(program_node const& node) {
return;

auto inst = node.type()->create_instance(*this, node);
for (auto& dep : node.get_dependencies()) {
if (dep->is_type<input_layout>() || dep->is_type<mutable_data>() || dep->can_be_optimized()) {
inst->set_mutable_input(true);
break;

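// The direct dependencies alone are not enough: a dependency that is optimized out
// (for example a node removed by an in-place optimization) may itself wrap an
// input_layout or mutable_data, so walk through can_be_optimized() nodes recursively
// when deciding whether this instance has a mutable input.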
std::function<bool(const program_node&)> is_mutable_input = [&is_mutable_input](const program_node& node) {
for (auto& dep : node.get_dependencies()) {
if (dep->is_type<input_layout>() || dep->is_type<mutable_data>()) {
return true;
}
if (dep->can_be_optimized()) {
if (is_mutable_input(*dep)) {
return true;
}
}
}
return false;
};

if (is_mutable_input(node)) {
inst->set_mutable_input(true);
}

_primitives[node.id()] = inst;
7 changes: 4 additions & 3 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp
@@ -111,7 +111,7 @@ event::ptr gpu_buffer::copy_from(stream& stream, const void* host_ptr) {
}

#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc) {
dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) {
auto onednn_engine = _engine->get_onednn_engine();
dnnl::memory dnnl_mem(desc, onednn_engine, DNNL_MEMORY_NONE);
dnnl::ocl_interop::set_mem_object(dnnl_mem, _buffer.get());
@@ -396,9 +396,10 @@ event::ptr gpu_usm::copy_from(stream& stream, const void* host_ptr) {
}

#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc) {
dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) {
auto onednn_engine = _engine->get_onednn_engine();
dnnl::memory dnnl_mem = dnnl::ocl_interop::make_memory(desc, onednn_engine, dnnl::ocl_interop::memory_kind::usm, _buffer.get());
dnnl::memory dnnl_mem = dnnl::ocl_interop::make_memory(desc, onednn_engine, dnnl::ocl_interop::memory_kind::usm,
reinterpret_cast<uint8_t*>(_buffer.get()) + offset);
return dnnl_mem;
}
#endif
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp
@@ -43,7 +43,7 @@ struct gpu_buffer : public lockable_gpu_mem, public memory {
event::ptr copy_from(stream& stream, const memory& other) override;
event::ptr copy_from(stream& stream, const void* host_ptr) override;
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */) override;
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override;
#endif

protected:
@@ -116,7 +116,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory {
event::ptr copy_from(stream& stream, const void* host_ptr) override;

#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::memory get_onednn_memory(dnnl::memory::desc desc) override;
dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override;
#endif

protected:
