diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 42df3fb69b948..2f903a19bab9f 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -2639,6 +2639,12 @@ void MDSRankDispatcher::handle_asok_command( })); return; } else if (command == "scrub abort") { + if (whoami != 0) { + *css << "Not rank 0"; + r = -EXDEV; + goto out; + } + finisher->queue( new LambdaContext( [this, on_finish, f](int r) { @@ -2655,6 +2661,12 @@ void MDSRankDispatcher::handle_asok_command( })); return; } else if (command == "scrub pause") { + if (whoami != 0) { + *css << "Not rank 0"; + r = -EXDEV; + goto out; + } + finisher->queue( new LambdaContext( [this, on_finish, f](int r) { @@ -2671,6 +2683,11 @@ void MDSRankDispatcher::handle_asok_command( })); return; } else if (command == "scrub resume") { + if (whoami != 0) { + *css << "Not rank 0"; + r = -EXDEV; + goto out; + } command_scrub_resume(f); } else if (command == "scrub status") { command_scrub_status(f); diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc index 505ff98531dde..5ebd140e957bb 100644 --- a/src/mds/ScrubStack.cc +++ b/src/mds/ScrubStack.cc @@ -691,56 +691,83 @@ void ScrubStack::abort_pending_scrubs() { clear_stack = false; } +void ScrubStack::send_state_message(int op) { + MDSRank *mds = mdcache->mds; + set up_mds; + mds->get_mds_map()->get_up_mds_set(up_mds); + for (auto& r : up_mds) { + if (r == 0) + continue; + auto m = make_message(op); + mds->send_message_mds(m, r); + } +} + void ScrubStack::scrub_abort(Context *on_finish) { ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock)); - ceph_assert(on_finish != nullptr); dout(10) << __func__ << ": aborting with " << scrubs_in_progress << " scrubs in progress and " << stack_size << " in the" << " stack" << dendl; + if (mdcache->mds->get_nodeid() == 0) { + scrub_epoch_last_abort = scrub_epoch; + scrub_any_peer_aborting = true; + send_state_message(MMDSScrub::OP_ABORT); + } + clear_stack = true; if (scrub_in_transition_state()) { - control_ctxs.push_back(on_finish); + if (on_finish) + control_ctxs.push_back(on_finish); return; } abort_pending_scrubs(); - if (state != STATE_PAUSED) { + if (state != STATE_PAUSED) set_state(STATE_IDLE); - } - on_finish->complete(0); + + if (on_finish) + on_finish->complete(0); } void ScrubStack::scrub_pause(Context *on_finish) { ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock)); - ceph_assert(on_finish != nullptr); dout(10) << __func__ << ": pausing with " << scrubs_in_progress << " scrubs in progress and " << stack_size << " in the" << " stack" << dendl; + if (mdcache->mds->get_nodeid() == 0) + send_state_message(MMDSScrub::OP_PAUSE); + // abort is in progress if (clear_stack) { - on_finish->complete(-EINVAL); + if (on_finish) + on_finish->complete(-EINVAL); return; } bool done = scrub_in_transition_state(); if (done) { set_state(STATE_PAUSING); - control_ctxs.push_back(on_finish); + if (on_finish) + control_ctxs.push_back(on_finish); return; } set_state(STATE_PAUSED); - on_finish->complete(0); + if (on_finish) + on_finish->complete(0); } bool ScrubStack::scrub_resume() { ceph_assert(ceph_mutex_is_locked_by_me(mdcache->mds->mds_lock)); dout(20) << __func__ << ": state=" << state << dendl; + if (mdcache->mds->get_nodeid() == 0) + send_state_message(MMDSScrub::OP_RESUME); + int r = 0; if (clear_stack) { @@ -925,6 +952,15 @@ void ScrubStack::handle_scrub(const cref_t &m) } } break; + case MMDSScrub::OP_ABORT: + scrub_abort(nullptr); + break; + case MMDSScrub::OP_PAUSE: + scrub_pause(nullptr); + break; + case MMDSScrub::OP_RESUME: + scrub_resume(); + break; default: derr << " scrub stack unknown scrub operation " << m->get_op() << dendl_impl; ceph_abort_msg("scrub stack unknown scrub operation"); @@ -965,7 +1001,8 @@ void ScrubStack::handle_scrub_stats(const cref_t &m) scrub_epoch = m->get_epoch(); - auto ack = make_message(scrub_epoch, std::move(scrubbing_tags)); + auto ack = make_message(scrub_epoch, + std::move(scrubbing_tags), clear_stack); mdcache->mds->send_message_mds(ack, 0); if (any_finished) @@ -978,13 +1015,14 @@ void ScrubStack::handle_scrub_stats(const cref_t &m) auto& stat = mds_scrub_stats[from]; stat.epoch_acked = m->get_epoch(); stat.scrubbing_tags = m->get_scrubbing_tags(); + stat.aborting = m->is_aborting(); } } } void ScrubStack::advance_scrub_status() { - if (scrubbing_map.empty()) + if (!scrub_any_peer_aborting && scrubbing_map.empty()) return; MDSRank *mds = mdcache->mds; @@ -998,16 +1036,22 @@ void ScrubStack::advance_scrub_status() if (up_max == 0) { update_scrubbing = true; + scrub_any_peer_aborting = false; } else if (mds_scrub_stats.size() > (size_t)(up_max)) { + bool any_aborting = false; bool fully_acked = true; for (const auto& stat : mds_scrub_stats) { + if (stat.aborting || stat.epoch_acked <= scrub_epoch_last_abort) + any_aborting = true; if (stat.epoch_acked != scrub_epoch) { fully_acked = false; - break; + continue; } scrubbing_tags.insert(stat.scrubbing_tags.begin(), stat.scrubbing_tags.end()); } + if (!any_aborting) + scrub_any_peer_aborting = false; if (fully_acked) { // handle_scrub_stats() reports scrub is still in-progress if it has // forwarded any object to other mds since previous epoch. Let's assume, @@ -1064,6 +1108,11 @@ void ScrubStack::advance_scrub_status() void ScrubStack::handle_mds_failure(mds_rank_t mds) { + if (mds == 0) { + scrub_abort(nullptr); + return; + } + bool kick = false; for (auto it = remote_scrubs.begin(); it != remote_scrubs.end(); ) { if (it->second.gather_set.erase(mds) && diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h index a4488ff535379..24d19168e889f 100644 --- a/src/mds/ScrubStack.h +++ b/src/mds/ScrubStack.h @@ -126,10 +126,14 @@ class ScrubStack { unsigned scrub_epoch = 2; unsigned scrub_epoch_fully_acked = 0; + unsigned scrub_epoch_last_abort = 2; + // check if any mds is aborting scrub after mds.0 starts + bool scrub_any_peer_aborting = true; struct scrub_stat_t { unsigned epoch_acked = 0; std::set scrubbing_tags; + bool aborting = false; }; std::vector mds_scrub_stats; @@ -231,6 +235,11 @@ class ScrubStack { */ void complete_control_contexts(int r); + /** + * ask peer mds (rank > 0) to abort/pause/resume scrubs + */ + void send_state_message(int op); + /** * Abort pending scrubs for inodes waiting in the inode stack. * Completion context is complete with -ECANCELED. diff --git a/src/messages/MMDSScrub.h b/src/messages/MMDSScrub.h index 9988bebfd1c03..b4f4ce55b82e6 100644 --- a/src/messages/MMDSScrub.h +++ b/src/messages/MMDSScrub.h @@ -26,6 +26,9 @@ class MMDSScrub : public MMDSOp { static constexpr int OP_QUEUEDIR_ACK = -1; static constexpr int OP_QUEUEINO = 2; static constexpr int OP_QUEUEINO_ACK = -2; + static constexpr int OP_ABORT = 3; + static constexpr int OP_PAUSE = 4; + static constexpr int OP_RESUME = 5; static const char *get_opname(int o) { switch (o) { @@ -33,6 +36,9 @@ class MMDSScrub : public MMDSOp { case OP_QUEUEDIR_ACK: return "queue_dir_ack"; case OP_QUEUEINO: return "queue_ino"; case OP_QUEUEINO_ACK: return "queue_ino_ack"; + case OP_ABORT: return "abort"; + case OP_PAUSE: return "pause"; + case OP_RESUME: return "resume"; default: ceph_abort(); return nullptr; } } @@ -99,6 +105,8 @@ class MMDSScrub : public MMDSOp { static constexpr int COMPAT_VERSION = 1; MMDSScrub() : MMDSOp(MSG_MDS_SCRUB, HEAD_VERSION, COMPAT_VERSION) {} + MMDSScrub(int o) + : MMDSOp(MSG_MDS_SCRUB, HEAD_VERSION, COMPAT_VERSION), op(o) {} MMDSScrub(int o, inodeno_t i, fragset_t&& _frags, std::string_view _tag, inodeno_t _origin=inodeno_t(), bool internal_tag=false, bool force=false, bool recursive=false, bool repair=false) diff --git a/src/messages/MMDSScrubStats.h b/src/messages/MMDSScrubStats.h index 84fb5cdb8a2b2..41c403adea751 100644 --- a/src/messages/MMDSScrubStats.h +++ b/src/messages/MMDSScrubStats.h @@ -26,13 +26,15 @@ class MMDSScrubStats : public MMDSOp { void print(ostream& o) const override { o << "mds_scrub_stats(e" << epoch; if (update_scrubbing) - o << " [" << scrubbing_tags << "])"; - else - o << ")"; + o << " [" << scrubbing_tags << "]"; + if (aborting) + o << " aborting"; + o << ")"; } unsigned get_epoch() const { return epoch; } const auto& get_scrubbing_tags() const { return scrubbing_tags; } + bool is_aborting() const { return aborting; } bool is_finished(const std::string& tag) const { return update_scrubbing && !scrubbing_tags.count(tag); } @@ -42,6 +44,7 @@ class MMDSScrubStats : public MMDSOp { encode(epoch, payload); encode(scrubbing_tags, payload); encode(update_scrubbing, payload); + encode(aborting, payload); } void decode_payload() override { using ceph::decode; @@ -49,24 +52,26 @@ class MMDSScrubStats : public MMDSOp { decode(epoch, p); decode(scrubbing_tags, p); decode(update_scrubbing, p); + decode(aborting, p); } protected: MMDSScrubStats(unsigned e=0) : MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION), epoch(e) {} - MMDSScrubStats(unsigned e, std::set&& tags) : + MMDSScrubStats(unsigned e, std::set&& tags, bool abrt=false) : MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION), - epoch(e), scrubbing_tags(std::move(tags)), update_scrubbing(true) {} - MMDSScrubStats(unsigned e, const std::set& tags) : + epoch(e), scrubbing_tags(std::move(tags)), update_scrubbing(true), aborting(abrt) {} + MMDSScrubStats(unsigned e, const std::set& tags, bool abrt=false) : MMDSOp(MSG_MDS_SCRUB_STATS, HEAD_VERSION, COMPAT_VERSION), - epoch(e), scrubbing_tags(tags), update_scrubbing(true) {} + epoch(e), scrubbing_tags(tags), update_scrubbing(true), aborting(abrt) {} ~MMDSScrubStats() override {} private: unsigned epoch; std::set scrubbing_tags; bool update_scrubbing = false; + bool aborting = false; template friend boost::intrusive_ptr ceph::make_message(Args&&... args);