From c5886d926cac4abb6eb0ec328d78f5cbd42c8f4c Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 30 Apr 2024 14:54:10 +0800 Subject: [PATCH 1/2] crimson/osd/osd_operations/background_recovery: mark PGs as unfound when necessary Signed-off-by: Xuehan Xu --- .../osd/osd_operations/background_recovery.cc | 23 ++++++++----- src/crimson/osd/pg.cc | 34 +++++++++++++++++++ src/crimson/osd/pg.h | 7 ++-- 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/src/crimson/osd/osd_operations/background_recovery.cc b/src/crimson/osd/osd_operations/background_recovery.cc index f74933ec26600..ac94ea7eb8894 100644 --- a/src/crimson/osd/osd_operations/background_recovery.cc +++ b/src/crimson/osd/osd_operations/background_recovery.cc @@ -116,15 +116,19 @@ UrgentRecovery::do_recovery() { LOG_PREFIX(UrgentRecovery::do_recovery); DEBUGDPPI("{}: {}", *pg, __func__, *this); - if (!pg->has_reset_since(epoch_started)) { + if (pg->has_reset_since(epoch_started)) { + return seastar::make_ready_future(false); + } + + return pg->find_unfound(epoch_started + ).then_interruptible([this] { return with_blocking_event([this] (auto&& trigger) { return pg->get_recovery_handler()->recover_missing(trigger, soid, need); }).then_interruptible([] { return seastar::make_ready_future(false); }); - } - return seastar::make_ready_future(false); + }); } void UrgentRecovery::print(std::ostream &lhs) const @@ -164,11 +168,14 @@ PglogBasedRecovery::do_recovery() if (pg->has_reset_since(epoch_started)) { return seastar::make_ready_future(false); } - return with_blocking_event([this] (auto&& trigger) { - return pg->get_recovery_handler()->start_recovery_ops( - trigger, - crimson::common::local_conf()->osd_recovery_max_single_start); + return pg->find_unfound(epoch_started + ).then_interruptible([this] { + return with_blocking_event([this] (auto&& trigger) { + return pg->get_recovery_handler()->start_recovery_ops( + trigger, + crimson::common::local_conf()->osd_recovery_max_single_start); + }); }); } diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc index 06749df2af636..dd2f49744128a 100644 --- a/src/crimson/osd/pg.cc +++ b/src/crimson/osd/pg.cc @@ -244,6 +244,40 @@ void PG::queue_check_readable(epoch_t last_peering_reset, ceph::timespan delay) std::chrono::duration_cast(delay)); } +PG::interruptible_future<> PG::find_unfound(epoch_t epoch_started) +{ + if (!have_unfound()) { + return interruptor::now(); + } + PeeringCtx rctx; + if (!peering_state.discover_all_missing(rctx)) { + if (peering_state.state_test(PG_STATE_BACKFILLING)) { + logger().debug( + "{} {} no luck, giving up on this pg for now (in backfill)", + *this, __func__); + std::ignore = get_shard_services().start_operation( + this, + get_pg_whoami(), + get_pgid(), + epoch_started, + epoch_started, + PeeringState::UnfoundBackfill()); + } else if (peering_state.state_test(PG_STATE_RECOVERING)) { + logger().debug( + "{} {} no luck, giving up on this pg for now (in recovery)", + *this, __func__); + std::ignore = get_shard_services().start_operation( + this, + get_pg_whoami(), + get_pgid(), + epoch_started, + epoch_started, + PeeringState::UnfoundRecovery()); + } + } + return get_shard_services().dispatch_context(get_collection_ref(), std::move(rctx)); +} + void PG::recheck_readable() { bool changed = false; diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h index 9f49422bd1d06..bf853969acda8 100644 --- a/src/crimson/osd/pg.h +++ b/src/crimson/osd/pg.h @@ -734,6 +734,10 @@ class PG : public boost::intrusive_ref_counter< // TODO: see PrimaryLogPG::mark_all_unfound_lost() return seastar::now(); } + interruptible_future<> find_unfound(epoch_t epoch_started); + bool have_unfound() const { + return peering_state.have_unfound(); + } bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch) const; @@ -766,9 +770,6 @@ class PG : public boost::intrusive_ref_counter< friend class SnapTrimEvent; friend class SnapTrimObjSubEvent; private: - seastar::future find_unfound() { - return seastar::make_ready_future(true); - } bool can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const; bool can_discard_op(const MOSDOp& m) const; From 3d03ac21bd8de591e4d340e274863bca9d31b170 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 30 Apr 2024 14:55:45 +0800 Subject: [PATCH 2/2] crimson/osd/pg_recovery: skip unfound objects when recovering the primary Fixes: https://tracker.ceph.com/issues/65696 Signed-off-by: Xuehan Xu --- src/crimson/osd/pg_recovery.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc index 13ac069c63d41..f9c2b02997db1 100644 --- a/src/crimson/osd/pg_recovery.cc +++ b/src/crimson/osd/pg_recovery.cc @@ -135,11 +135,23 @@ size_t PGRecovery::start_primary_recovery_ops( } else { soid = p->second; } - const pg_missing_item& item = missing.get_items().find(p->second)->second; - ++p; hobject_t head = soid.get_head(); + if (pg->get_peering_state().get_missing_loc().is_unfound(soid)) { + logger().debug("{}: object {} unfound", __func__, soid); + ++skipped; + continue; + } + if (pg->get_peering_state().get_missing_loc().is_unfound(head)) { + logger().debug("{}: head object {} unfound", __func__, soid); + ++skipped; + continue; + } + + const pg_missing_item& item = missing.get_items().find(p->second)->second; + ++p; + bool head_missing = missing.is_missing(head); logger().info( "{} {} item.need {} {} {} {} {}",