
[GPU] Fixed fused_primitive_desc to have -1 value for dep_start_idx (openvinotoolkit#17099)

* Fixed fused_primitive_desc to have -1 value for dep_start_idx

* Fixed dGPU i8 errors
yeonbok authored Apr 24, 2023
1 parent 3830125 commit ce23ce0
Showing 15 changed files with 170 additions and 60 deletions.
@@ -44,14 +44,16 @@ struct fused_primitive_desc {
bool operator==(const fused_primitive_desc& rhs) const {
if (total_num_deps != rhs.total_num_deps)
return false;
if (dep_start_idx != rhs.dep_start_idx)
if (outer_dep_start_idx != rhs.outer_dep_start_idx)
return false;

return *desc == *rhs.desc;
}

bool operator!=(const fused_primitive_desc& rhs) const { return !(*this == rhs); }

bool has_outer_dep() const { return outer_dep_start_idx >= 0; }

std::shared_ptr<const primitive> desc;

layout input_layout = layout(data_types::f32, format::bfyx, tensor());
@@ -61,7 +63,11 @@ struct fused_primitive_desc {

std::vector<std::pair<primitive_id, size_t>> deps;
std::map<primitive_id, size_t> fused_deps;
size_t dep_start_idx;
// TODO:
// Currently, it assumes very simple case where dep 0 is the fused node and no input sharing b/w fused node and peer node
// To cover such cases where some of the peer node uses input of fused node, we need to maintain actual indexes of the dependencies
// not only the "starting index".
int32_t outer_dep_start_idx = -1; // if -1, no external dep after fusing
size_t total_num_deps = 0;
};
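
For readers skimming the diff: the new contract is that callers must check has_outer_dep() before indexing the node's dependencies with outer_dep_start_idx. A minimal, self-contained sketch of that guard, using simplified stand-in types (this is not code from the commit):

// Hedged sketch: simplified stand-ins showing the intended use of the new -1 sentinel.
#include <cstdint>
#include <iostream>
#include <vector>

struct fused_desc_sketch {
    int32_t outer_dep_start_idx = -1;                 // -1: no external dependency after fusing
    bool has_outer_dep() const { return outer_dep_start_idx >= 0; }
};

int main() {
    std::vector<int> dependency_ids = {10, 20, 30};   // pretend dependency ids of the fused node
    fused_desc_sketch fd;                             // default-constructed: fully fused, no outer dep

    // Callers throughout this commit guard the index before touching dependencies:
    if (fd.has_outer_dep())
        std::cout << "outer dep id: " << dependency_ids[fd.outer_dep_start_idx] << "\n";
    else
        std::cout << "no outer dependency after fusing\n";
    return 0;
}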

@@ -73,10 +73,9 @@ void add_required_reorders::run(program& p) {
if (!fused_op.is_type<eltwise>() && !(fused_op.is_type<activation>() && fused_op.total_num_deps == 2))
continue;

auto dep_id = fused_op.dep_start_idx;
if (dep_id >= usr->get_dependencies().size())
if (!fused_op.has_outer_dep())
continue;

auto dep_id = fused_op.outer_dep_start_idx;
auto& dep = usr->get_dependency(dep_id);
if (!dep.is_type<data>())
continue;
@@ -43,8 +43,9 @@ void basic_memory_dependencies::run(program& p) {
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
continue;

eltw_dep = fused_op.dep_start_idx;
if (!fused_op.has_outer_dep())
continue;
eltw_dep = fused_op.outer_dep_start_idx;
auto& eltw_node = node->get_dependency(eltw_dep);
eltw_node.can_share_buffer(false);
node->can_share_buffer(false);
@@ -545,7 +545,7 @@ void remove_redundant_reorders::run(program& p) {
local_desc.input_layout = input.get_dependency(0).get_output_layout(); // original convolution's output layout
node->set_input_layout(local_desc.input_layout);
local_desc.f_param = node->get_fuse_params();
local_desc.dep_start_idx = input.get_fused_primitives().size();
local_desc.outer_dep_start_idx = -1;
local_desc.output_layout = output_layout;
input.add_fused_primitive(local_desc);

@@ -862,8 +862,8 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
onednn_add_fusing_helpers::for_eltwise(conv_node, eltwise_mode::sum,
[&](const program_node& p_node, const fused_primitive_desc& desc) {
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(p_node, desc);
if (fusing_type == add_fusing_type::binary_per_tensor) {
auto& dep_node = p_node.get_dependency(desc.dep_start_idx);
if (fusing_type == add_fusing_type::binary_per_tensor && desc.has_outer_dep()) {
auto& dep_node = p_node.get_dependency(desc.outer_dep_start_idx);
auto d_layout = dep_node.get_output_layout();
auto d_format = d_layout.format;
auto expected_format = format::any;
@@ -885,9 +885,9 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
new_layout.format = expected_format;
auto new_input = rf.get_reorder(dep_node.id(), d_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, conv_node, desc.dep_start_idx, !new_input.second);
p.add_intermediate(new_input.first, conv_node, desc.outer_dep_start_idx, !new_input.second);
}
conv_node.get_dependency(desc.dep_start_idx).set_output_layout(new_layout, false);
conv_node.get_dependency(desc.outer_dep_start_idx).set_output_layout(new_layout, false);
}
}
});
@@ -965,7 +965,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
if (activation_desc->activation_function == cldnn::activation_func::relu_negative_slope &&
!activation_desc->additional_params_input.empty()) {
const auto expected_dt = data_types::f32;
const auto dep_idx = fused_desc.dep_start_idx;
const auto dep_idx = fused_desc.outer_dep_start_idx;
const auto orig_layout = node->get_dependency(dep_idx).get_output_layout();
if (orig_layout.data_type == expected_dt)
continue;
@@ -992,7 +992,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
for (const auto& fused_prim : node->get_fused_primitives()) {
if (fused_prim.is_type<eltwise>() &&
one_of(fused_prim.typed_desc<eltwise>()->mode, {eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::prod})) {
auto& data = node->get_dependency(fused_prim.dep_start_idx);
auto& data = node->get_dependency(fused_prim.outer_dep_start_idx);

auto gemm_layout = node->get_output_layout();
auto data_layout = data.get_output_layout();
@@ -1016,7 +1016,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
auto broadcast_prim = std::make_shared<cldnn::broadcast>(prim_id, cldnn::input_info(data.id()), gemm_layout.get_shape(), ov::AxisSet{});

auto& broadcast_node = p.get_or_create(broadcast_prim);
p.add_intermediate(broadcast_node, *node, fused_prim.dep_start_idx, true);
p.add_intermediate(broadcast_node, *node, fused_prim.outer_dep_start_idx, true);
broadcast_node.recalc_output_layouts(false);
}
}
@@ -1025,7 +1025,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
if (fused_prim.is_type<eltwise>() &&
one_of(fused_prim.typed_desc<eltwise>()->mode, {eltwise_mode::sum, eltwise_mode::sub, eltwise_mode::prod})) {
auto fc_layout = node->get_output_layout();
auto& data = node->get_dependency(fused_prim.dep_start_idx);
auto& data = node->get_dependency(fused_prim.outer_dep_start_idx);
auto data_layout = data.get_output_layout();

if (fc_layout.is_dynamic() || data_layout.is_dynamic())
@@ -1060,7 +1060,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
auto broadcast_prim = std::make_shared<cldnn::broadcast>(prim_id, cldnn::input_info(data.id()), fc_layout.get_shape(), ov::AxisSet{});

auto& broadcast_node = p.get_or_create(broadcast_prim);
p.add_intermediate(broadcast_node, *node, fused_prim.dep_start_idx, true);
p.add_intermediate(broadcast_node, *node, fused_prim.outer_dep_start_idx, true);
broadcast_node.recalc_output_layouts(false);
}
}
@@ -147,14 +147,15 @@ inline params_t get_default_params(const kernel_impl_params& param_info, bool is
OPENVINO_ASSERT(desc.op_params != nullptr, "[GPU] Invalid fused operation (", param_info.desc->id , ") of type ", param_info.desc->type_string());


desc.dep_idx_start = fused_prim.dep_start_idx;
desc.dep_idx_start = fused_prim.outer_dep_start_idx;
desc.dep_size = fused_prim.deps.size();
desc.op_id = op_id++;
desc.output_tensor = convert_data_tensor(fused_prim.output_layout);
prim_id_type_map[fused_prim.desc->id] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());

for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
if (fused_prim.has_outer_dep()) {
for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
}
}

if (fused_prim.total_num_deps > 0) {
@@ -334,7 +335,7 @@ inline kernel_impl_params canonicalize_fused_shapes(const kernel_impl_params& im
if (fd.is_type<eltwise>() && fd.total_num_deps == 2) {
auto out_pshape = updated_impl_params.output_layouts[0].get_partial_shape();

auto& dep_layout = updated_impl_params.input_layouts[fd.dep_start_idx];
auto& dep_layout = updated_impl_params.input_layouts[fd.outer_dep_start_idx];
auto dep_shape = dep_layout.get_partial_shape();

if (!broadcastable(dep_shape, out_pshape, use_new_shape_infer)) {
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -201,6 +201,7 @@ class primitive_inst {
bool can_share_buffer() const { return _can_share_buffer; }
bool is_constant() const { return _is_constant; }
bool is_output_event() const { return _is_output_event; }
bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }

void allocate_internal_buffers();
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,
18 changes: 18 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/program_node.h
@@ -104,6 +104,24 @@ struct program_node {

bool is_fused_dep(size_t dep_idx) const;

bool has_fused_dep() const {
for (auto fused : get_fused_primitives()) {
if (fused.has_outer_dep())
return true;
}
return false;
}

int32_t get_first_fused_dep_idx() const {
if (!has_fused_dep())
return -1;
for (auto fused : get_fused_primitives()) {
if (fused.has_outer_dep())
return fused.outer_dep_start_idx;
}
return -1;
}

std::map<size_t, memory::ptr> get_const_memory_deps() const;

virtual std::unique_ptr<kernel_impl_params> get_kernel_impl_params() const {
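The two helpers added above exist so that primitive_inst (changed later in this commit) derives its fused-memory offset only when some fused primitive really has an outer dependency. A hedged sketch of that relationship, again with simplified stand-in types rather than the real cldnn classes:

#include <cstdint>
#include <iostream>
#include <vector>

struct fused_desc_sketch {
    int32_t outer_dep_start_idx = -1;
    bool has_outer_dep() const { return outer_dep_start_idx >= 0; }
};

struct node_sketch {
    std::vector<fused_desc_sketch> fused;
    bool has_fused_dep() const {
        for (const auto& fd : fused)
            if (fd.has_outer_dep()) return true;
        return false;
    }
    int32_t get_first_fused_dep_idx() const {
        for (const auto& fd : fused)
            if (fd.has_outer_dep()) return fd.outer_dep_start_idx;
        return -1;
    }
};

int main() {
    node_sketch node;
    node.fused.resize(2);
    node.fused[1].outer_dep_start_idx = 2;   // second fused op reads the node's input #2

    // Mirrors the _fused_mem_offset change in primitive_inst.cpp: fall back to 0
    // when none of the fused primitives reach outside the fused chain.
    int32_t fused_mem_offset = node.has_fused_dep() ? node.get_first_fused_dep_idx() : 0;
    std::cout << fused_mem_offset << "\n";   // prints 2
    return 0;
}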
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -405,7 +405,7 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
for (auto& p : next.get_fused_primitives()) {
// find eltwise sum primitive which has dependency nodes, and gather dependency indices of it.
if (p.is_type<eltwise>() && p.typed_desc<eltwise>()->mode == eltwise_mode::sum) {
for (size_t i = p.dep_start_idx; i < p.dep_start_idx + p.total_num_deps; i++) {
for (size_t i = p.outer_dep_start_idx; i < p.outer_dep_start_idx + p.total_num_deps; i++) {
dep_idx_set.insert(i);
}
}
8 changes: 6 additions & 2 deletions src/plugins/intel_gpu/src/graph/network.cpp
@@ -604,7 +604,9 @@ void network::save(cldnn::BinaryOutputBuffer& ob) {
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
continue;
eltw_dep = fused_op.dep_start_idx;
if (!fused_op.has_outer_dep())
continue;
eltw_dep = fused_op.outer_dep_start_idx;
auto& eltw_in = node->get_dependency(eltw_dep);
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
reuse_map[node->id()] = eltw_in.id();
@@ -1007,7 +1009,9 @@ void network::allocate_primitives() {
auto fusing_type = onednn_add_fusing_helpers::get_add_fusing_type(*node, fused_op);
if (fusing_type != add_fusing_type::sum || eltw_dep != 0)
continue;
eltw_dep = fused_op.dep_start_idx;
if (!fused_op.has_outer_dep())
continue;
eltw_dep = fused_op.outer_dep_start_idx;
auto& eltw_in = node->get_dependency(eltw_dep);
if (_primitives.find(eltw_in.id()) != _primitives.end() && _primitives.find(node->id()) != _primitives.end()) {
auto& eltw_inst = _primitives.at(eltw_in.id());
48 changes: 26 additions & 22 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -600,7 +600,7 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
, _inputs_memory_count(node.get_primitive()->input_size())
, _outputs_memory_count(node.get_primitive()->output_size())
, _fused_mem_count(node.get_fused_inputs_count())
, _fused_mem_offset(_fused_mem_count > 0 ? node.get_fused_primitives()[0].dep_start_idx : 0)
, _fused_mem_offset((_fused_mem_count > 0 && node.has_fused_dep()) ? node.get_first_fused_dep_idx() : 0)
, _can_be_optimized(node.can_be_optimized())
, _can_share_buffer(node.can_share_buffer())
, _is_constant(node.is_constant()) {
@@ -971,7 +971,7 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
if (!_unfused_subgraph) {
topology t;

std::vector<primitive_id> dep_ids;
std::vector<primitive_id> outer_dep_ids;
// Add input primitives: constants are moved as is
// Any other primitive types are replaced with input_layout
for (auto& dep : _node->get_dependencies()) {
Expand All @@ -985,12 +985,12 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
input_layout in_prim(dep.first->id(), dep.first->get_output_layout());
t.add(in_prim);
}
dep_ids.push_back(dep.first->id());
outer_dep_ids.push_back(dep.first->id());
}

// Create the primitive itself
t.add_primitive(std::const_pointer_cast<primitive>(_node->get_primitive()));
dep_ids.push_back(_node->id());
outer_dep_ids.push_back(_node->id());

// Add primitives for fused-ops
for (auto& fd : _impl_params->fused_desc) {
@@ -1008,25 +1008,26 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
// And when we construct unfused subgraph for prim2, we take original eltwise2 primitive which expects eltwise1 primitive as input
// which doesn't exist anymore in the graph
// Thus we update dependency name used dependencies idx stored in fused descriptor.
if (std::find_if(dep_ids.begin(), dep_ids.end(),
[&](const primitive_id& pid) {
return pid == in.pid;
}) == dep_ids.end()) {
size_t dep_id = fd.dep_start_idx;
in = _node->get_dependency(dep_id).id();
if (fd.has_outer_dep()) {
if (std::find_if(outer_dep_ids.begin(), outer_dep_ids.end(), [&](const primitive_id& pid) {
return pid == in.pid;
}) == outer_dep_ids.end()) {
size_t dep_id = fd.outer_dep_start_idx;
in = _node->get_dependency(dep_id).id();
}
}
}
t.add_primitive(prim);
dep_ids.push_back(prim->id);
outer_dep_ids.push_back(prim->id);
}
// Samely, need to update dependency of the current fused nodes' input primitive ids with those in the current program
auto prim_of_fused_node = std::const_pointer_cast<primitive>(_impl_params->desc);
for (size_t i = 0; i < prim_of_fused_node->input.size(); ++i) {
auto& in = prim_of_fused_node->input[i];
if (std::find_if(dep_ids.begin(), dep_ids.end(),
if (std::find_if(outer_dep_ids.begin(), outer_dep_ids.end(),
[&](const primitive_id& pid) {
return pid == in.pid;
}) == dep_ids.end()) {
}) == outer_dep_ids.end()) {
in = _node->get_dependency(i).id();
}
}
@@ -1048,11 +1049,12 @@ bool primitive_inst::is_valid_fusion() const {
auto fuse_descriptors = _impl_params->fused_desc;
if (fuse_descriptors.empty())
return true;

std::vector<fused_primitive_desc> fused_eltwise_prims;
for (auto& fd : fuse_descriptors) {
if (fd.is_type<eltwise>()) {
if (fd.is_type<eltwise>() || fd.is_type<activation>()) {
fused_eltwise_prims.push_back(fd);
} else {
OPENVINO_ASSERT("[GPU] Unsupported fused operation in dynamic shape : ", fd.desc->id);
}
}

@@ -1061,14 +1063,16 @@ bool primitive_inst::is_valid_fusion() const {

auto out_pshape = _impl_params->get_output_layout().get_partial_shape();
for (auto& fd : fused_eltwise_prims) {
auto dep_idx = fd.dep_start_idx;
OPENVINO_ASSERT(fd.total_num_deps == 2, "[GPU] Unexpected count of dependencies in dynamic fusion for eltwise");
OPENVINO_ASSERT(_deps.size() > dep_idx, "[GPU] Invalid fused dependency idx");
auto dep = _deps[dep_idx];
auto outer_dep_idx = fd.outer_dep_start_idx;
if (outer_dep_idx < 0) // no outer dep
continue;
OPENVINO_ASSERT(fd.total_num_deps == 2, "[GPU] Unexpected count of dependencies in dynamic fusion for eltwise or activation");
OPENVINO_ASSERT(outer_dep_idx < 0 || static_cast<int32_t>(_deps.size()) > outer_dep_idx, "[GPU] Invalid fused dependency idx");
auto outer_dep = _deps[outer_dep_idx];

auto dep_pshape = dep.first->_impl_params->get_output_layout().get_partial_shape();
auto outer_dep_pshape = outer_dep.first->_impl_params->get_output_layout().get_partial_shape();
auto merged_shape = out_pshape;
auto can_broadcast = ov::PartialShape::broadcast_merge_into(merged_shape, dep_pshape, fd.typed_desc<eltwise>()->broadcast_spec);
auto can_broadcast = ov::PartialShape::broadcast_merge_into(merged_shape, outer_dep_pshape, fd.typed_desc<eltwise>()->broadcast_spec);

#ifdef ENABLE_ONEDNN_FOR_GPU
// WA for OneDNN binary add fusions: we need to broadcast batch dimension to avoid situation with
@@ -1079,7 +1083,7 @@ bool primitive_inst::is_valid_fusion() const {
// correctly and we need to do it manually
if (_node->is_type<gemm>() && _node->get_preferred_impl_type() == impl_types::onednn) {
auto gemm_layout = _impl_params->get_output_layout();
auto data_layout = dep.first->_impl_params->get_output_layout();
auto data_layout = outer_dep.first->_impl_params->get_output_layout();
auto gemm_dims = onednn::convert_gemm_tensor(gemm_layout.get_tensor(),
cldnn::format::dimension(gemm_layout.format),
false);
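As the hunks above show, is_valid_fusion() now skips descriptors with no outer dependency and otherwise checks that the outer dependency's shape broadcast-merges into the output shape. A minimal sketch of that check against the public ov::PartialShape API (the header paths and the NUMPY broadcast spec here are assumptions; the real code passes the fused eltwise primitive's own broadcast_spec):

#include <iostream>
#include <openvino/core/partial_shape.hpp>
#include <openvino/op/util/attr_types.hpp>

int main() {
    ov::PartialShape out_pshape{1, 32, 16, 16};       // computed output shape of the fused primitive
    ov::PartialShape outer_dep_pshape{1, 32, 1, 1};   // shape of the fused eltwise's outer dependency

    // Same call pattern as is_valid_fusion(): merge the dependency shape into the output shape.
    auto merged_shape = out_pshape;
    bool can_broadcast = ov::PartialShape::broadcast_merge_into(
        merged_shape, outer_dep_pshape, ov::op::AutoBroadcastType::NUMPY);

    std::cout << (can_broadcast ? "fusion stays valid"
                                : "fusion is invalid; the unfused subgraph runs instead") << "\n";
    return 0;
}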
(Remaining changed files not shown.)
