Commit
[GPU] Baseline primitive, primitive_inst, program_node change for multiple outputs support (openvinotoolkit#13295)

* Update primitive, primitive_inst, and program_node for multiple outputs support

Signed-off-by: Andrew Park <[email protected]>

* Update arg_max_min kernel for multiple outputs support

Signed-off-by: Andrew Park <[email protected]>

* Fix failing unit test cases

- remove duplicate output memory

Signed-off-by: Andrew Park <[email protected]>

* Clean up code

Signed-off-by: Andrew Park <[email protected]>

* Apply code review

Signed-off-by: Andrew Park <[email protected]>

Signed-off-by: Andrew Park <[email protected]>
andrew-k-park authored Oct 17, 2022
1 parent f1d816f commit 550e590
Showing 27 changed files with 309 additions and 73 deletions.
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -184,6 +184,7 @@ struct network {
bool has_event(const primitive_id& id) const { return _events.count(id); }
std::vector<std::shared_ptr<primitive_inst>> get_primitives(const std::vector<primitive_id>& ids);
std::vector<std::shared_ptr<primitive_inst>> get_primitives(const std::vector<program_node*>& nodes);
+std::vector<std::pair<std::shared_ptr<primitive_inst>, int>> get_primitives(const std::vector<std::pair<program_node*, int>>& nodes);
void execute_primitive(const std::shared_ptr<primitive_inst>& primitive,
const std::vector<event::ptr>& events);
void allocate_primitives();
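The new overload resolves a list of (program_node, output port) pairs to the matching (primitive_inst, port) pairs, so a consumer can address one specific output of a multi-output producer. A minimal sketch of a possible implementation, assuming it mirrors the existing node-based overloads (the committed body lives in network.cpp, which is not part of this excerpt):

    // Sketch only, not the committed code. get_primitive(id) is the existing
    // per-id lookup used by the other overloads; the port index is carried
    // through unchanged.
    std::vector<std::pair<std::shared_ptr<primitive_inst>, int>> network::get_primitives(
            const std::vector<std::pair<program_node*, int>>& nodes) {
        std::vector<std::pair<std::shared_ptr<primitive_inst>, int>> result;
        result.reserve(nodes.size());
        for (auto& node : nodes)
            result.emplace_back(get_primitive(node.first->id()), node.second);
        return result;
    }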
@@ -41,8 +41,10 @@ struct arg_max_min : public primitive_base<arg_max_min> {
ov::op::TopKSortType sort = ov::op::TopKSortType::SORT_VALUES,
bool values_first = false,
const padding& output_padding = padding(),
-data_types output_data_type = data_types::f32)
-: primitive_base(id, {input}, output_padding, optional_data_type {output_data_type}),
+data_types output_data_type = data_types::f32,
+const std::vector<input_info>& inputs = {},
+const int num_outputs = 1)
+: primitive_base(id, {input}, output_padding, optional_data_type{output_data_type}, inputs, num_outputs),
mode(mode),
top_k(top_k),
axis(axis),
@@ -60,8 +60,9 @@ struct concatenation : public primitive_base<concatenation> {
const std::vector<primitive_id>& input,
const int64_t axis,
const data_types output_dt,
-const padding& output_padding = padding())
-: primitive_base(id, {input}, output_padding, optional_data_type{output_dt}), axis(axis) {}
+const padding& output_padding = padding(),
+const std::vector<input_info>& inputs = {})
+: primitive_base(id, {input}, output_padding, optional_data_type{output_dt}, inputs), axis(axis) {}

/// @brief Dimension along which concatenation should take place
int64_t axis;
@@ -33,8 +33,9 @@ struct permute : public primitive_base<permute> {
permute(const primitive_id& id,
const primitive_id& input,
const std::vector<uint16_t>& permute_order = {},
-const padding& output_padding = padding())
-: primitive_base(id, {input}, output_padding), permute_order(permute_order) { }
+const padding& output_padding = padding(),
+const std::vector<input_info>& inputs = {})
+: primitive_base(id, {input}, output_padding, optional_data_type(), inputs), permute_order(permute_order) { }

/// @brief Array of permuted output order in bfyx format.
std::vector<uint16_t> permute_order;
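The three primitive headers above (arg_max_min, concatenation, permute) follow the same pattern: the new inputs/num_outputs parameters are appended with defaults, so every existing call site keeps compiling, while new code can describe a connection to a specific output port via input_info. A usage sketch under that assumption (the primitive ids here are hypothetical):

    // Legacy construction is unchanged: the trailing parameters default
    // to {} and 1.
    cldnn::permute perm_old("perm1", "conv1", {0, 3, 1, 2});

    // New-style construction can name the producer's output port explicitly,
    // e.g. port 1 (the indices output) of a two-output TopK node.
    cldnn::permute perm_new("perm2", "topk1", {0, 3, 1, 2}, cldnn::padding(),
                            {cldnn::input_info("topk1", 1)});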
61 changes: 57 additions & 4 deletions src/plugins/intel_gpu/include/intel_gpu/primitives/primitive.hpp
@@ -32,6 +32,26 @@ using primitive_id = std::string;

struct primitive_info;

+struct input_info {
+    input_info() : pid(""), idx(0) {}
+    input_info(primitive_id pid) : pid(pid), idx(0) {}
+    input_info(primitive_id pid, int idx) : pid(pid), idx(idx) {}
+
+    primitive_id pid;
+    int32_t idx;
+    struct cmp {
+        bool operator() (const input_info a, const input_info b) {
+            if (a.pid < b.pid) {
+                return true;
+            } else if (a.pid == b.pid) {
+                return a.idx < b.idx;
+            } else {
+                return false;
+            }
+        }
+    };
+};
+
/// @brief Base class of network primitive description.
struct primitive {
public:
@@ -40,12 +60,16 @@ struct primitive {
const primitive_id& id,
const std::vector<primitive_id>& input,
const padding& output_padding = padding(),
-const optional_data_type output_data_type = optional_data_type())
+const optional_data_type output_data_type = optional_data_type(),
+const std::vector<input_info>& input_new = {},
+const size_t num_outputs = 1)
: type(type),
id(id),
output_padding(output_padding),
output_data_type(output_data_type),
-input(input) {}
+input(input),
+input_new(input_new),
+num_outputs(num_outputs) {}

virtual ~primitive() = default;

@@ -69,6 +93,23 @@
return result;
}

+std::vector<std::reference_wrapper<input_info>> dependencies_new() {
+    std::vector<std::reference_wrapper<input_info>> result;
+    auto&& deps = get_dependencies_new();
+    result.reserve(input_new.size() + deps.size());
+    for (auto& i : input_new) result.push_back(std::ref(i));
+    for (auto& dep : deps) result.push_back({std::ref(const_cast<input_info&>(dep.get()))});
+
+    return result;
+}
+
+std::vector<input_info> dependencies_new() const {
+    auto result = input_new;
+    auto deps = get_dependencies_new();
+    result.insert(result.end(), deps.begin(), deps.end());
+    return result;
+}
+
virtual primitive_id type_string() const = 0;

/// @brief Implicit conversion to primiitive id.
@@ -94,13 +135,23 @@

size_t input_size() const { return input.size(); }

+size_t output_size() const { return num_outputs; }
+
using primitive_id_arr = std::vector<primitive_id>;

/// @brief List of ids of input primitives.
primitive_id_arr input;

+using input_info_arr = std::vector<input_info>;
+
+/// @brief List of input info containing id and output index of input primitive.
+input_info_arr input_new;
+
+size_t num_outputs;
+
protected:
virtual std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const { return {}; }
+virtual std::vector<std::reference_wrapper<const input_info>> get_dependencies_new() const { return {}; }
class condition;
friend struct primitive_info;
};
@@ -112,8 +163,10 @@ class primitive_base : public primitive {
explicit primitive_base(const primitive_id& id,
const std::vector<primitive_id>& input,
const padding& output_padding = padding(),
-optional_data_type output_data_type = optional_data_type())
-: primitive(PType::type_id(), id, input, output_padding, output_data_type) {}
+optional_data_type output_data_type = optional_data_type(),
+const std::vector<input_info>& input_new = {},
+const size_t num_outputs = 1)
+: primitive(PType::type_id(), id, input, output_padding, output_data_type, input_new, num_outputs) {}
};

struct primitive_info {
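input_info names a (producer id, output port) pair, and input_info::cmp supplies the strict weak ordering needed to sort such pairs or key ordered containers; dependencies_new() simply concatenates the explicit input_new list with whatever get_dependencies_new() reports. A small usage sketch of the comparator (includes and ids are illustrative):

    #include <algorithm>
    #include <vector>

    // Orders by producer id first, then by output index.
    std::vector<cldnn::input_info> deps = {{"topk1", 1}, {"conv1", 0}, {"topk1", 0}};
    std::sort(deps.begin(), deps.end(), cldnn::input_info::cmp());
    // Resulting order: conv1:0, topk1:0, topk1:1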
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/concatenation.cpp
@@ -148,7 +148,7 @@ concatenation_inst::typed_primitive_inst(network& network, concatenation_node co
stack.pop_front();

for (auto processed_node : *nodes_list) {
-processed_node->_output = _output;
+processed_node->_outputs = _outputs;
if (processed_node->type() == concatenation::type_id() && processed_node->can_be_optimized()) {
if (!processed_node->_deps.empty())
stack.push_back(&processed_node->_deps);
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/src/graph/crop.cpp
@@ -247,13 +247,13 @@ void crop_inst::on_execute() {
if (!node.can_be_optimized())
return;

-if (_output && _network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
+if (_outputs[0] && _network.get_engine().is_the_same_buffer(output_memory(), input_memory()))
return;

reuse_input();
}

void crop_inst::reuse_input() {
-_output = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout());
+_outputs[0] = _network.get_engine().reinterpret_buffer(input_memory(), node.get_output_layout());
}
} // namespace cldnn
14 changes: 10 additions & 4 deletions src/plugins/intel_gpu/src/graph/impls/ocl/arg_max_min.cpp
@@ -48,7 +48,7 @@ struct arg_max_min_impl : typed_primitive_impl_ocl<arg_max_min> {
kernel_arguments_data get_arguments(typed_primitive_inst<arg_max_min>& instance, int32_t) const override {
kernel_arguments_data args = parent::get_arguments(instance, 0);

-if (args.inputs.size() == 3) {
+if (instance.node.has_second_output()) {
args.inputs.erase(args.inputs.begin() + 1); // erase constant input in case of TOP_K
}

@@ -63,7 +63,7 @@
const auto& mode = primitive->mode;
const auto& sort_type = primitive->sort;
const auto& values_first = primitive->values_first;
-const auto& outputs_num = primitive->input.size() == 3 ? 2 : 1; // second output passed as input for TOP_K layer
+const auto& outputs_num = arg.get_output_nums(); // second output passed as input for TOP_K layer

auto argm_params = get_default_params<kernel_selector::arg_max_min_params>(impl_param);
auto argm_optional_params =
@@ -83,8 +83,14 @@
else
argm_params.argMaxMinSortType = kernel_selector::argm_sort::INDEX;

-if (outputs_num == 2) {
-    argm_params.inputs.push_back(convert_data_tensor(impl_param.input_layouts[2]));
+if (arg.has_second_output()) { // for backward compatibility
+    argm_params.has_second_output = true;
+    if (arg.use_multiple_outputs()) {
+        argm_params.use_multiple_outputs = true;
+        argm_params.outputs.push_back(convert_data_tensor(impl_param.output_layout));
+    } else {
+        argm_params.inputs.push_back(convert_data_tensor(impl_param.input_layouts[2]));
+    }
}

argm_params.values_first = values_first;
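The branching above keeps both graph representations working: a legacy TopK node carries its second (indices) result as a third input, so that extra layout is appended to the kernel's inputs, while a new-style node declares a real second output, so the layout is appended to the kernel's outputs instead. Summarized as a comment sketch (the flag names are the ones set in the diff):

    // legacy TopK (2nd output passed as a 3rd input):
    //   has_second_output = true,  use_multiple_outputs = false
    // new multiple-outputs TopK:
    //   has_second_output = true,  use_multiple_outputs = true
    // plain single-output arg_max_min:
    //   has_second_output = false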
6 changes: 4 additions & 2 deletions src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp
@@ -76,8 +76,10 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
args.fused_op_inputs.push_back(instance.fused_memory(i));
}
}
-// TODO: support multiple outputs
-args.outputs.push_back(instance.output_memory_ptr());
+
+for (size_t i = 0; i < instance.outputs_memory_count(); i++) {
+    args.outputs.push_back(instance.output_memory_ptr(i));
+}

return args;
}
6 changes: 6 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/arg_max_min_inst.h
@@ -19,6 +19,12 @@ struct typed_program_node<arg_max_min> : public typed_program_node_base<arg_max_
public:
typed_program_node(std::shared_ptr<primitive> prim, program& prog) : parent(prim, prog) {}
program_node& input() const { return get_dependency(0); }

+uint32_t get_output_nums() const {
+    return (get_primitive()->input_size() == 3 ? 2 : get_primitive()->output_size());
+}
+bool has_second_output() const { return get_output_nums() == 2; }
+bool use_multiple_outputs() const { return get_primitive()->input_size() != 3; }
};

using arg_max_min_node = typed_program_node<arg_max_min>;
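These helpers let one node type cover both encodings: get_output_nums() treats a three-input node as the legacy TopK form (data, k constant, second-output buffer) and otherwise trusts the primitive's declared output count. Illustrative cases derived from that logic:

    // Legacy form: inputs = {data, k_const, second_output_buffer}
    //   input_size() == 3  -> get_output_nums() == 2
    //   has_second_output() == true, use_multiple_outputs() == false
    //
    // New form: inputs = {data}, primitive built with num_outputs = 2
    //   input_size() == 1  -> get_output_nums() == output_size() == 2
    //   has_second_output() == true, use_multiple_outputs() == true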
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/include/loop_inst.h
@@ -564,7 +564,7 @@ class typed_primitive_inst<loop> : public typed_primitive_inst_base<loop> {
void preprocess_output_memory();
void preprocess_backedge_memory();
void update_mapped_memory();
-void set_output_memory(memory::ptr mem, bool check = true) override;
+void set_output_memory(memory::ptr mem, bool check = true, size_t idx = 0) override;
const backedge_memory_mapping& get_current_iteration_backedge_mapping() const {
if (!node.is_current_iteration_used()) {
CLDNN_ERROR_MESSAGE(node.id(), "no backedge mapping for current_iteration");
@@ -42,7 +42,7 @@ class typed_primitive_inst<mutable_data> : public typed_primitive_inst_base<muta
static std::string to_string(mutable_data_node const& node);

typed_primitive_inst(network& network, mutable_data_node const& node);
-void set_output_memory(memory::ptr mem, bool check = true) override;
+void set_output_memory(memory::ptr mem, bool check = true, size_t idx = 0) override;
};

using mutable_data_inst = typed_primitive_inst<mutable_data>;
41 changes: 33 additions & 8 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -79,11 +79,35 @@ class primitive_inst {
return reinterpret_cast<std::vector<std::shared_ptr<const primitive_inst>> const&>(_deps);
}

-memory& dep_memory(size_t index) const { return dependencies().at(index)->output_memory(); }
-memory::ptr dep_memory_ptr(size_t index) const { return dependencies().at(index)->output_memory_ptr(); }
-memory& output_memory() const { return *_output; }
-memory::ptr output_memory_ptr() const { return _output; }
+const std::vector<std::pair<std::shared_ptr<const primitive_inst>, int32_t>>& dependencies_new() const {
+    return reinterpret_cast<std::vector<std::pair<std::shared_ptr<const primitive_inst>, int32_t>> const&>(_deps_new);
+}
+
+memory& dep_memory(size_t index) const {
+    if (!dependencies_new().empty()) {
+        auto dep = dependencies_new().at(index);
+        return dep.first->output_memory(dep.second);
+    }
+    return dependencies().at(index)->output_memory();
+}
+memory::ptr dep_memory_ptr(size_t index) const {
+    if (!dependencies_new().empty()) {
+        auto dep = dependencies_new().at(index);
+        return dep.first->output_memory_ptr(dep.second);
+    }
+    return dependencies().at(index)->output_memory_ptr();
+}
+memory& output_memory(size_t index = 0) const { return *_outputs[index]; }
+memory::ptr output_memory_ptr(size_t index = 0) const { return _outputs[index]; }
size_t inputs_memory_count() const { return _node.get_primitive()->input_size(); }
+size_t outputs_memory_count() const { return _node.get_primitive()->output_size(); }
+bool outputs_allocated() const {
+    if (_outputs.empty()) return false;
+    for (const auto& output : _outputs) {
+        if (!output) return false;
+    }
+    return true;
+}
primitive_type_id type() const { return _node.type(); }
primitive_id id() const { return _node.id(); }
primitive_id org_id() const { return _node.get_org_primitive_id(); }
@@ -92,7 +116,7 @@
program_node const& get_node() const { return _node; }
network& get_network() const { return _network; }
uint32_t get_network_id() const;
-virtual void set_output_memory(memory::ptr mem, bool check = true);
+virtual void set_output_memory(memory::ptr mem, bool check = true, size_t idx = 0);
void check_memory_to_set(const memory& mem, const layout& layout) const;
const std::list<const cldnn::program_node *>& get_users() const { return _node.get_users(); }

@@ -183,6 +207,7 @@
// this is a set of dependencies in terms of memory, if execution of this primitive requires data from another one,
// it should be added to this set
std::vector<std::shared_ptr<primitive_inst>> _deps;
+std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> _deps_new;

// this is a set of dependencies in terms of execution
// execution of all primitives from this set should be enough to guarantee that all memory deps (see _deps)
@@ -201,7 +226,7 @@
// _output is optional because its initialization might be postponed (reshape_inst may either allocate it's own
// buffer or attach input as output
// depending on reshape_node.is_in_place())
-memory::ptr _output;
+std::vector<memory::ptr> _outputs;

std::vector<memory::cptr> _intermediates_memory;

@@ -215,7 +240,7 @@

size_t max_output_layout_size = 0;

-memory::ptr allocate_output();
+std::vector<memory::ptr> allocate_outputs();
static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
std::vector<std::shared_ptr<primitive_inst>> const& mem_deps);

@@ -333,7 +358,7 @@ class typed_primitive_inst_base : public primitive_inst {

typed_primitive_inst_base(network& network, typed_node const& node, memory::ptr buffer)
: typed_primitive_inst_base(network, node, false) {
-_output = buffer;
+_outputs[0] = buffer;
}

private:
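allocate_output() becomes allocate_outputs(), returning one buffer per declared output. A plausible sketch, assuming the per-output allocation simply repeats the old single-output path for each index (the committed body is in primitive_inst.cpp, outside this diff, and also handles memory-pool reuse):

    // Hypothetical sketch, not the committed implementation.
    std::vector<memory::ptr> primitive_inst::allocate_outputs() {
        std::vector<memory::ptr> outputs;
        for (size_t i = 0; i < _node.get_outputs_count(); ++i) {
            // Per-output layouts are assumed here; the baseline commit still
            // exposes a single get_output_layout() on program_node.
            outputs.push_back(get_network().get_engine().allocate_memory(
                _node.get_output_layout(), allocation_type::usm_device));
        }
        return outputs;
    }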
6 changes: 6 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/program_node.h
@@ -175,7 +175,9 @@ struct program_node {
impl_types get_preferred_impl_type() const { return impl_type; }

std::vector<program_node*> const& get_dependencies() const { return dependencies; }
+std::vector<std::pair<program_node*, int>> const& get_dependencies_new() const { return dependencies_new; }
program_node& get_dependency(size_t idx) const { return *dependencies.at(idx); }
+std::pair<program_node*, int32_t> get_dependency_new(size_t idx) const { return dependencies_new.at(idx); }

std::vector<layout> const get_input_layouts() const {
std::vector<layout> layouts;
@@ -247,6 +249,8 @@ struct program_node {
// invalidate_users_if_changed is set to true returns whether output layout has changed
bool set_output_layout(layout& new_layout, bool invalidate_users_if_changed = true);

+size_t get_outputs_count() const { return num_outputs; }
+
// forces recalculation of cached output layout, invalidates users if new layout is different than previous one and
// @p invalidate_users_if_changed is set to true returns whether output layout has changed
bool recalc_output_layout(bool invalidate_users_if_changed = true);
@@ -437,6 +441,7 @@ struct program_node {
format::type required_output;

std::vector<program_node*> dependencies;
+std::vector<std::pair<program_node*, int>> dependencies_new;
std::list<program_node*> users;

// list of primitives that can reuse same memory buffers due to execution order conflicts
@@ -490,6 +495,7 @@ struct program_node {
bool has_out_scales(const std::shared_ptr<dnnl::primitive_attr>& attr);
dnnl::post_ops try_optimize_post_ops(dnnl::post_ops& p_ops, const std::shared_ptr<dnnl::primitive_attr>& attr, bool& optimization_is_completed);
#endif // ENABLE_ONEDNN_FOR_GPU
+size_t num_outputs = 1;
};

/*
9 changes: 5 additions & 4 deletions src/plugins/intel_gpu/src/graph/input_layout.cpp
@@ -46,10 +46,11 @@ void input_layout_inst::set_data(memory::ptr mem) {
check_memory_to_set(*mem, ol);

if (mem->is_allocated_by(get_network().get_engine())) {
-_output = mem;
+OPENVINO_ASSERT(!_outputs.empty(), "[GPU] Can't set data for empty input memory");
+_outputs[0] = mem;
} else {
mem_lock<char, mem_lock_type::read> src(mem, get_network().get_stream());
-mem_lock<char, mem_lock_type::write> dst(_output, get_network().get_stream());
+mem_lock<char, mem_lock_type::write> dst(_outputs[0], get_network().get_stream());
std::copy(src.begin(), src.end(), dst.begin());
}

@@ -58,8 +59,8 @@
}

void input_layout_inst::update_shape() {
-OPENVINO_ASSERT(_output != nullptr, "[GPU] input memory is not set");
-auto mem_layout = _output->get_layout();
+OPENVINO_ASSERT(!_outputs.empty() && _outputs[0] != nullptr, "[GPU] input memory is not set");
+auto mem_layout = _outputs[0]->get_layout();
if (_impl_params->output_layout != mem_layout) {
set_shape_change();
}
(Diff truncated: the remaining changed files are not shown.)
