src/schedule.cpp

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include <migraphx/schedule.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/iterator.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/dom_info.hpp>
#include <unordered_map>
#include <unordered_set>
#include <queue>
#include <thread>
#include <mutex>
#include <migraphx/make_op.hpp>

#include <set>
#include <deque>
#include <chrono>
#include <iomanip>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_SCHEDULE)

auto get_inputs()
{
    return [](auto i) { return i->inputs(); };
}

auto get_outputs()
{
    return [](auto i) { return i->outputs(); };
}

struct stream_info
{
    std::unordered_map<instruction_ref, std::size_t> ins2stream;
    std::unordered_map<instruction_ref, std::size_t> weights;
    std::unordered_map<instruction_ref, std::size_t> iweights;
    ins_dep_map mod_implicit_deps;

    void calc_implicit_deps(const module& m) { mod_implicit_deps = m.calc_implicit_deps(); }

    void accumulate_weights(instruction_ref last, const schedule_model& model)
    {
        fix<std::size_t>([&](auto self, auto ins) -> std::size_t {
            if(not contains(weights, ins))
            {
                std::size_t weight = 0;
                auto&& op          = ins->get_operator();
                if(not is_context_free(op) and op.name()[0] != '@')
                    weight = model.weight(op);
                // This will ensure a stream will be assigned to return
                if(op.name() == "@return")
                    weight = 1;
                iweights[ins] = weight;
                auto inputs   = ins->inputs();
                if(contains(mod_implicit_deps, ins))
                {
                    const auto& impl_deps = mod_implicit_deps.at(ins);
                    inputs.insert(inputs.end(), impl_deps.begin(), impl_deps.end());
                }

                weights[ins] = std::accumulate(
                    inputs.begin(), inputs.end(), weight, [&](std::size_t w, instruction_ref i) {
                        return w + self(i);
                    });
            }
            return weights[ins];
        })(last);
    }

    template <class Compare>
    void sort_args_by_weight(std::vector<instruction_ref>& args, Compare compare) const
    {
        if(args.size() < 2)
            return;
        std::sort(args.begin(), args.end(), by(compare, [this](auto x) {
                      return std::make_tuple(
                          this->weights.at(x), x->inputs().size(), std::addressof(*x));
                  }));
    }

    std::vector<instruction_ref>::iterator sort_args(std::vector<instruction_ref>& args)
    {
        if(args.size() < 2)
        {
            return args.end();
        }

        const std::size_t min_partition_threshold = 2;
        sort_args_by_weight(args, std::greater<>{});

        auto it = std::lower_bound(std::next(args.begin()),
                                   args.end(),
                                   min_partition_threshold,
                                   [&](auto i, std::size_t w) { return this->weights[i] > w; });
        assert(it == args.end() or this->weights[*it] <= min_partition_threshold);
        assert(it == args.end() or std::prev(it) == args.begin() or
               this->weights[*std::prev(it)] > min_partition_threshold);
        return it;
    }

    struct partition
    {
        std::size_t weight = 0;
        std::vector<instruction_ref> instructions{};

        void add(instruction_ref ins, std::size_t w)
        {
            weight += w;
            instructions.push_back(ins);
        }
    };

    std::size_t assign_streams(module& m, std::size_t n)
    {
        assert(n > 0);
        partition critical;
        std::unordered_map<instruction_ref, std::deque<partition>> partitions;
        partitions.reserve(weights.size());
        fix([&](auto self, auto ins, auto& part) {
            assert(not is_end(ins, m.end()));
            if(not m.has_instruction(ins))
                return;
            if(contains(partitions, ins))
                return;

            // Add an entry so we know the instruction was visited
            partitions[ins];
            part.add(ins, this->iweights[ins]);

            auto args         = ins->inputs();
            auto threshold_it = this->sort_args(args);

            if(not args.empty())
            {
                assert(threshold_it != args.begin());
                self(args.front(), part);
                for(auto i : range(std::next(args.begin()), threshold_it))
                {
                    partitions[ins].emplace_back();
                    self(i, partitions[ins].back());
                }
                for(auto i : range(threshold_it, args.end()))
                {
                    self(i, part);
                }
            }
            // Sort instructions
            m.move_instruction(ins, m.end());
        })(std::prev(m.end()), critical);

        // Set the critical partition to stream 0
        set_stream(critical, 0);
        if(n == 1)
        {
            // Assign streams for the other partitions
            for(auto&& ins_part : partitions)
                for(auto&& part : ins_part.second)
                    set_stream(part, 0);
            return 1;
        }
        else
        {
            std::vector<std::size_t> streams(n - 1);
            // Assign streams for the other partitions
            for(auto&& ins_part : partitions)
            {
                std::sort(ins_part.second.begin(),
                          ins_part.second.end(),
                          by(std::greater<>{}, [](auto&& x) {
                              return std::make_tuple(x.weight, x.instructions.size());
                          }));
                for(auto&& part : ins_part.second)
                {
                    auto stream =
                        std::min_element(streams.begin(), streams.end()) - streams.begin();
                    set_stream(part, stream + 1);
                    streams[stream] += part.weight;
                }
            }
            return 1 + std::count_if(streams.begin(), streams.end(), [](auto x) { return x > 0; });
        }
    }

    using weight_ins = std::pair<std::size_t, instruction_ref>;
    struct compare_weight_ins
    {
        bool operator()(const weight_ins& x, const weight_ins& y) const
        {
            return std::make_pair(x.first, std::addressof(*x.second)) <
                   std::make_pair(y.first, std::addressof(*y.second));
        }
    };

    void sort(module& m, std::size_t)
    {
        std::set<weight_ins, compare_weight_ins> children;
        std::unordered_map<instruction_ref, std::size_t> visited;
        auto last      = std::prev(m.end());
        auto mw        = this->weights.at(last);
        auto nw        = mw / (m.size() + 1);
        auto add_child = [&](auto ins) {
            auto x  = 1 + (mw - this->weights.at(ins)) / (nw + 1);
            auto w  = x * this->iweights.at(ins);
            auto& v = visited[ins];
            auto it = children.find(std::make_pair(v * w, ins));
            if(it == children.end())
            {
                v++;
                children.insert(std::make_pair(v * w, ins));
            }
        };
        add_child(last);

        while(not children.empty())
        {
            // Pop the first element
            auto top = children.begin()->second;
            children.erase(children.begin());
            m.move_instruction(top, m.begin());
            for(auto ins : top->inputs())
            {
                if(not m.has_instruction(ins))
                    continue;
                add_child(ins);
            }

            if(contains(mod_implicit_deps, top))
            {
                for(auto ins : mod_implicit_deps.at(top))
                {
                    assert(m.has_instruction(ins));
                    add_child(ins);
                }
            }
        }

        // move dangling parameter to the front so as not be removed
        auto ins = std::next(last);
        while(ins != m.end())
        {
            auto next = std::next(ins);
            if(ins->name() == "@param")
            {
                m.move_instruction(ins, m.begin());
            }
            ins = next;
        }
    }

    void set_stream(const partition& p, std::size_t n)
    {
        for(auto ins : p.instructions)
            if(iweights[ins] > 0)
                set_stream(ins, n);
    }

    void set_stream(instruction_ref ins, std::size_t n)
    {
        assert(iweights[ins] > 0);
        ins2stream[ins] = n;
    }

    std::size_t get_stream(instruction_ref ins) const { return ins2stream.at(ins); }

    bool has_stream(instruction_ref ins) const { return contains(ins2stream, ins); }

    template <class F>
    bool different(F f, std::size_t stream) const
    {
        bool result = false;
        f([&](auto s) {
            if(s != stream)
            {
                result = true;
                return false;
            }
            // cppcheck-suppress uselessAssignmentArg
            stream = s;
            return true;
        });
        return result;
    }

    template <class F>
    bool different(F f) const
    {
        bool result = false;
        f([&](auto s) {
            result = this->different(f, s);
            return false;
        });
        return result;
    }

    template <class Selector>
    auto get_streams_from(instruction_ref start, Selector select) const
    {
        return [=](auto f) {
            return fix<bool>([&](auto self, auto ins) {
                return all_of(select(ins), [&](auto i) {
                    if(has_stream(i))
                        return f(this->get_stream(i));
                    else
                        return self(i);
                });
            })(start);
        };
    }

    std::unordered_set<std::size_t> get_streams(instruction_ref ins) const
    {
        if(has_stream(ins))
            return {get_stream(ins)};
        std::unordered_set<std::size_t> result;
        get_streams_from(ins, get_inputs())([&](auto s) {
            result.insert(s);
            return true;
        });
        return result;
    }

    template <class... Ts>
    bool is_merge_point(instruction_ref ins, Ts... xs) const
    {
        return different(get_streams_from(ins, get_inputs()), xs...);
    }

    template <class... Ts>
    bool is_split_point(instruction_ref ins, Ts... xs) const
    {
        return different(get_streams_from(ins, get_outputs()), xs...);
    }

    std::vector<instruction_ref> get_recorded_instructions(instruction_ref start)
    {
        std::vector<instruction_ref> result;
        std::unordered_map<std::size_t, instruction_ref> m;
        fix([&](auto self, auto ins) {
            for(auto i : ins->inputs())
            {
                if(iweights.at(i) == 0)
                {
                    self(i);
                    continue;
                }
                auto stream = this->get_stream(i);
                if(not contains(m, stream))
                    m[stream] = i;
                else
                    m[stream] = std::min(m[stream], i, by(std::less<>{}, [&](auto x) {
                                             return std::distance(x, start);
                                         }));
            }
        })(start);
        std::transform(
            m.begin(), m.end(), std::back_inserter(result), [](auto&& p) { return p.second; });
        return result;
    }

    std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>>
    find_concurrent_instructions(module& m) const
    {
        std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>> result;
        std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> merge_from;
        dominator_info di = compute_dominator(m);
        result.reserve(m.size());
        merge_from.reserve(m.size());
        for(auto ins : reverse_iterator_for(m))
        {
            for(auto&& arg : ins->outputs())
            {
                if(not m.has_instruction(arg))
                    continue;
                if(is_merge_point(arg))
                    merge_from[ins].insert(arg);
                merge_from[ins].insert(merge_from[arg].begin(), merge_from[arg].end());
            }

            if(is_split_point(ins))
            {
                erase_if(merge_from[ins],
                         [&](auto merge) { return di.strictly_dominate(ins, merge); });
            }

            auto streams = this->get_streams(ins);
            // Collect concur instructions for each merge point.
            for(const auto& merge : merge_from[ins])
            {
                for(auto stream : streams)
                {
                    if(result[merge].size() <= stream)
                        result[merge].resize(stream + 1);
                    auto&& r = result[merge][stream];
                    r.push_back(ins);
                    // Copy inputs if they dont have a stream(and are not a builtin and context
                    // free). Inputs without a stream can have a implicit dependency
                    std::copy_if(ins->inputs().begin(),
                                 ins->inputs().end(),
                                 std::back_inserter(r),
                                 [&](auto x) {
                                     return not this->has_stream(x) and
                                            not is_context_free(x->get_operator()) and
                                            x->name().front() != '@';
                                 });
                }
            }
        }
        return result;
    }

    std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>>
    get_conflicts(module& m)
    {

        using conflict_table_type =
            std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>>;
        conflict_table_type conflict_table;
        auto concur_ins = this->find_concurrent_instructions(m);

        // Compute an index for each instruction
        std::unordered_map<instruction_ref, std::size_t> ins2index;
        std::size_t index_total = 0;
        for(auto ins : iterator_for(m))
            ins2index[ins] = index_total++;

        std::vector<conflict_table_type> thread_conflict_tables(
            std::thread::hardware_concurrency());
        std::vector<instruction_ref> index_to_ins;
        index_to_ins.reserve(concur_ins.size());
        std::transform(concur_ins.begin(),
                       concur_ins.end(),
                       std::back_inserter(index_to_ins),
                       [](auto&& it) { return it.first; });

        par_for(concur_ins.size(), [&](auto ins_index, auto tid) {
            auto merge_first = index_to_ins[ins_index];
            assert(concur_ins.count(merge_first) > 0);
            auto& merge_second = concur_ins.at(merge_first);

            // ensure there are enough elements for different threads
            assert(tid < thread_conflict_tables.size());
            auto& thrd_table = thread_conflict_tables.at(tid);

            std::unordered_set<instruction_ref> checked_ins_set;
            auto range_i = range(merge_second.begin(), std::prev(merge_second.end()));
            for(auto it_i : iterator_for(range_i))
            {
                std::unordered_set<instruction_ref> ins1_set;
                std::copy_if(it_i->begin(),
                             it_i->end(),
                             std::inserter(ins1_set, ins1_set.end()),
                             [&](auto i) { return not contains(checked_ins_set, i); });
                checked_ins_set.insert(ins1_set.begin(), ins1_set.end());

                auto range_j = range(std::next(it_i), merge_second.end());
                std::unordered_set<instruction_ref> ins2_set;
                for(auto it_j : iterator_for(range_j))
                {
                    std::copy_if(it_j->begin(),
                                 it_j->end(),
                                 std::inserter(ins2_set, ins2_set.end()),
                                 [&](auto i) { return not contains(checked_ins_set, i); });
                }

                for(auto ins1 : ins1_set)
                {
                    auto p1 = ins2index.at(ins1);
                    for(auto ins2 : ins2_set)
                    {
                        if(ins1 == ins2)
                            continue;
                        auto p2 = ins2index.at(ins2);
                        if(p2 > p1)
                            thrd_table[ins2].insert(ins1);
                        else
                            thrd_table[ins1].insert(ins2);
                    }
                }
            }
        });

        // merge thread_conflict_tables together
        for(auto& tbl : thread_conflict_tables)
        {
            for(auto& it : tbl)
            {
                conflict_table[it.first].insert(it.second.begin(), it.second.end());
            }
        }

        // Remove instructions from the conflict table of an ealier instruction
        for(auto&& ip : conflict_table)
        {
            auto ins1 = ip.first;
            for(auto ins2 : ip.second)
                if(contains(conflict_table[ins2], ins1))
                    conflict_table[ins2].erase(ins1);
        }

        return conflict_table;
    }
};

void schedule::apply(module& m) const
{
    if(not enable)
        return;

    stream_info si;
    si.calc_implicit_deps(m);
    auto last = std::prev(m.end());
    si.accumulate_weights(last, model);
    auto nstreams = si.assign_streams(m, model.concurrency());
    si.sort(m, model.concurrency());

    if(enabled(MIGRAPHX_TRACE_COMPILE{}) or enabled(MIGRAPHX_TRACE_SCHEDULE{}))
    {
        m.annotate(std::cout, [&](auto ins) {
            if(ins->name() == "@param" and not contains(si.weights, ins))
                return;

            std::cout << ":";
            std::cout << " weight=" << si.weights.at(ins);
            std::cout << " input={";
            si.get_streams_from(ins, get_inputs())([&](auto s) {
                std::cout << s << ",";
                return true;
            });
            std::cout << "}";
            if(si.has_stream(ins))
                std::cout << " stream=" << si.get_stream(ins);
        });
        std::cout << std::endl;
    }

    // No concurrency
    if(nstreams < 2)
        return;

    // Schedule instructions
    std::size_t wait_id = 0;
    std::unordered_map<instruction_ref, std::size_t> ins2wait;
    std::unordered_map<std::size_t, std::unordered_set<std::size_t>> waited_for;
    std::unordered_map<instruction_ref, std::unordered_set<std::size_t>> ins2waited;
    ins2wait.reserve(m.size());
    ins2waited.reserve(m.size());
    for(auto ins : iterator_for(m))
    {
        // Only schedule instructions that have a stream
        if(not si.has_stream(ins))
            continue;
        assert(si.weights[ins] > 0);
        // Schedule instruction on the stream
        auto stream = si.get_stream(ins);
        assert(stream < model.concurrency());
        model.sched(m, ins, stream);
        // Insert wait instructions
        if(si.is_merge_point(ins, stream))
        {
            for(auto i : si.get_recorded_instructions(ins))
            {
                if(not si.has_stream(i) or si.get_stream(i) == stream)
                    continue;

                // Create a new event if it hasn't been recorded
                if(not contains(ins2wait, i))
                {
                    ins2wait[i] = wait_id;
                    model.record(m, i, wait_id);
                    wait_id++;
                }
                auto w = ins2wait.at(i);
                // If we already waited for the event on this stream then dont
                // insert another wait event
                if(not contains(waited_for[stream], w))
                    model.wait(m, ins, w);
                // Store the event as waited
                waited_for[stream].insert(w);
                // Store all wait events that have been waited on prior to the recorded instruction
                waited_for[stream].insert(ins2waited[i].begin(), ins2waited[i].end());
            }
        }
        // Store wait events that have already been waited on
        if(si.is_split_point(ins, stream))
        {
            ins2waited[ins] = waited_for[stream];
        }
    }

    // Add memory conflicts
    auto conflict_table = si.get_conflicts(m);
    for(auto&& ip : conflict_table)
    {
        if(ip.second.empty())
            continue;
        std::vector<instruction_ref> args;
        args.push_back(ip.first);
        args.insert(args.end(), ip.second.begin(), ip.second.end());
        m.insert_instruction(std::next(ip.first), make_op("identity"), args);
    }
}

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx