Skip to content

Commit

Permalink
Revert "osd: Try other PGs when reservation failures occur"
Browse files Browse the repository at this point in the history
This reverts commit 08c3ede.

Due to https://tracker.ceph.com/issues/49868
Should be reinstated once that bug is solved. See tracker comments for analysis
and suggested fixes.

Signed-off-by: Ronen Friedman <[email protected]>
  • Loading branch information
ronen-fr committed Apr 7, 2021
1 parent b8045f7 commit c57731d
Show file tree
Hide file tree
Showing 7 changed files with 2 additions and 54 deletions.
34 changes: 2 additions & 32 deletions src/osd/OSD.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7612,13 +7612,6 @@ void OSD::sched_scrub()
continue;
}

// If this one couldn't reserve, skip for now
if (pg->get_reserve_failed()) {
pg->unlock();
dout(20) << __func__ << " pg " << scrub_job.pgid << " reserve failed, skipped" << dendl;
continue;
}

// This has already started, so go on to the next scrub job
if (pg->is_scrub_active()) {
pg->unlock();
Expand All @@ -7638,7 +7631,7 @@ void OSD::sched_scrub()
if (pg->m_scrubber->is_reserving()) {
pg->unlock();
dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl;
goto out;
break;
}
dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time
<< (pg->get_must_scrub() ? ", explicitly requested" :
Expand All @@ -7647,34 +7640,11 @@ void OSD::sched_scrub()
if (pg->sched_scrub()) {
pg->unlock();
dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl;
goto out;
}
// If this is set now we must have had a local reserve failure, so can't scrub anything right now
if (pg->get_reserve_failed()) {
pg->unlock();
dout(20) << __func__ << " pg " << scrub_job.pgid << " local reserve failed, nothing to be done now" << dendl;
goto out;
break;
}

pg->unlock();
} while (service.next_scrub_stamp(scrub_job, &scrub_job));

// Clear reserve_failed from all pending PGs, so we try again
if (service.first_scrub_stamp(&scrub_job)) {
do {
if (scrub_job.sched_time > now)
break;
PGRef pg = _lookup_lock_pg(scrub_job.pgid);
// If we can't lock, it's ok we can get it next time
if (!pg)
continue;
pg->clear_reserve_failed();
pg->unlock();
} while (service.next_scrub_stamp(scrub_job, &scrub_job));
}
}

out:
dout(20) << "sched_scrub done" << dendl;
}

Expand Down
1 change: 0 additions & 1 deletion src/osd/PG.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1360,7 +1360,6 @@ bool PG::sched_scrub()
// be retried by the OSD later on.
if (!m_scrubber->reserve_local()) {
dout(10) << __func__ << ": failed to reserve locally" << dendl;
set_reserve_failed();
return false;
}

Expand Down
4 changes: 0 additions & 4 deletions src/osd/PG.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,6 @@ class PG : public DoutPrefixProvider, public PeeringState::PeeringListener {
/// scrubbing state for both Primary & replicas
bool is_scrub_active() const { return m_scrubber->is_scrub_active(); }

bool get_reserve_failed() const { return m_scrubber->get_reserve_failed(); }
void set_reserve_failed() { m_scrubber->set_reserve_failed(); }
void clear_reserve_failed() { m_scrubber->clear_reserve_failed(); }

public:
// -- members --
const coll_t coll;
Expand Down
7 changes: 0 additions & 7 deletions src/osd/pg_scrubber.h
Original file line number Diff line number Diff line change
Expand Up @@ -412,10 +412,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {

[[nodiscard]] bool is_scrub_active() const final { return m_active; }

[[nodiscard]] bool get_reserve_failed() const final { return m_reserve_failed; }
void set_reserve_failed() final { m_reserve_failed = true; }
void clear_reserve_failed() final { m_reserve_failed = false; }

private:
void reset_internal_state();

Expand Down Expand Up @@ -540,9 +536,6 @@ class PgScrubber : public ScrubPgIF, public ScrubMachineListener {

bool m_active{false};

// This PG could not get all the scrub reservations
bool m_reserve_failed{false};

eversion_t m_subset_last_update{};

std::unique_ptr<Scrub::Store> m_store;
Expand Down
2 changes: 0 additions & 2 deletions src/osd/scrub_machine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,6 @@ sc::result ReservingReplicas::react(const ReservationFailure&)
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
dout(10) << "ReservingReplicas::react(const ReservationFailure&)" << dendl;

// Mark PG so that we will try other PGs, before coming back to this one
scrbr->set_reserve_failed();
// the Scrubber must release all resources and abort the scrubbing
scrbr->clear_pgscrub_state();
return transit<NotActive>();
Expand Down
4 changes: 0 additions & 4 deletions src/osd/scrub_machine_lstnr.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,6 @@ struct ScrubMachineListener {

virtual void unreserve_replicas() = 0;

[[nodiscard]] virtual bool get_reserve_failed() const = 0;
virtual void set_reserve_failed() = 0;
virtual void clear_reserve_failed() = 0;

/**
* the FSM interface into the "are we waiting for maps, either our own or from
* replicas" state.
Expand Down
4 changes: 0 additions & 4 deletions src/osd/scrubber_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,6 @@ struct ScrubPgIF {
*/
[[nodiscard]] virtual bool is_scrub_active() const = 0;

[[nodiscard]] virtual bool get_reserve_failed() const = 0;
virtual void set_reserve_failed() = 0;
virtual void clear_reserve_failed() = 0;

/// are we waiting for resource reservation grants form our replicas?
[[nodiscard]] virtual bool is_reserving() const = 0;

Expand Down

0 comments on commit c57731d

Please sign in to comment.