diff --git a/src/messages/MOSDPGInfo.h b/src/messages/MOSDPGInfo.h index a79a6e66c7a06..d39b05b679ba5 100644 --- a/src/messages/MOSDPGInfo.h +++ b/src/messages/MOSDPGInfo.h @@ -20,34 +20,75 @@ #include "osd/osd_types.h" class MOSDPGInfo : public Message { + static const int HEAD_VERSION = 2; + static const int COMPAT_VERSION = 1; + epoch_t epoch; public: - vector pg_info; + vector > pg_list; epoch_t get_epoch() { return epoch; } - MOSDPGInfo() : Message(MSG_OSD_PG_INFO) {} - MOSDPGInfo(version_t mv) : - Message(MSG_OSD_PG_INFO), - epoch(mv) { } + MOSDPGInfo() + : Message(MSG_OSD_PG_INFO, HEAD_VERSION, COMPAT_VERSION) {} + MOSDPGInfo(version_t mv) + : Message(MSG_OSD_PG_INFO, HEAD_VERSION, COMPAT_VERSION), + epoch(mv) { } private: ~MOSDPGInfo() {} public: const char *get_type_name() const { return "pg_info"; } void print(ostream& out) const { - out << "pg_info(" << pg_info.size() << " pgs e" << epoch << ")"; + out << "pg_info(" << pg_list.size() << " pgs e" << epoch << ":"; + + for (vector >::const_iterator i = pg_list.begin(); + i != pg_list.end(); + ++i) { + if (i != pg_list.begin()) + out << ","; + out << i->first.pgid; + if (i->second.size()) + out << "(" << i->second.size() << ")"; + } + + out << ")"; } void encode_payload(uint64_t features) { ::encode(epoch, payload); - ::encode(pg_info, payload); + + // v1 was vector + __u32 n = pg_list.size(); + ::encode(n, payload); + for (vector >::iterator p = pg_list.begin(); + p != pg_list.end(); + p++) + ::encode(p->first, payload); + + // v2 needs the pg_interval_map_t for each record + for (vector >::iterator p = pg_list.begin(); + p != pg_list.end(); + p++) + ::encode(p->second, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); ::decode(epoch, p); - ::decode(pg_info, p); + + // decode pg_info_t portion of the vector + __u32 n; + ::decode(n, p); + pg_list.resize(n); + for (unsigned i=0; i= 2) { + // get the pg_interval_map_t portion + for (unsigned i=0; i= 2) { ::decode(query_epoch, p); } + if (header.version >= 3) { + ::decode(past_intervals, p); + } } }; diff --git a/src/messages/MOSDPGNotify.h b/src/messages/MOSDPGNotify.h index 55aa1dc5ecd5c..a9ccaa610f9f8 100644 --- a/src/messages/MOSDPGNotify.h +++ b/src/messages/MOSDPGNotify.h @@ -25,7 +25,7 @@ class MOSDPGNotify : public Message { - static const int HEAD_VERSION = 2; + static const int HEAD_VERSION = 3; static const int COMPAT_VERSION = 1; epoch_t epoch; @@ -34,16 +34,16 @@ class MOSDPGNotify : public Message { /// query. This allows the recipient to disregard responses to old /// queries. epoch_t query_epoch; - vector pg_list; // pgid -> version + vector > pg_list; // pgid -> version public: version_t get_epoch() { return epoch; } - vector& get_pg_list() { return pg_list; } + vector >& get_pg_list() { return pg_list; } epoch_t get_query_epoch() { return query_epoch; } MOSDPGNotify() : Message(MSG_OSD_PG_NOTIFY, HEAD_VERSION, COMPAT_VERSION) { } - MOSDPGNotify(epoch_t e, vector& l, epoch_t query_epoch) + MOSDPGNotify(epoch_t e, vector >& l, epoch_t query_epoch) : Message(MSG_OSD_PG_NOTIFY, HEAD_VERSION, COMPAT_VERSION), epoch(e), query_epoch(query_epoch) { pg_list.swap(l); @@ -56,25 +56,53 @@ class MOSDPGNotify : public Message { void encode_payload(uint64_t features) { ::encode(epoch, payload); - ::encode(pg_list, payload); + + // v2 was vector + __u32 n = pg_list.size(); + ::encode(n, payload); + for (vector >::iterator p = pg_list.begin(); + p != pg_list.end(); + p++) + ::encode(p->first, payload); + ::encode(query_epoch, payload); + + // v3 needs the pg_interval_map_t for each record + for (vector >::iterator p = pg_list.begin(); + p != pg_list.end(); + p++) + ::encode(p->second, payload); } void decode_payload() { bufferlist::iterator p = payload.begin(); ::decode(epoch, p); - ::decode(pg_list, p); + + // decode pg_info_t portion of the vector + __u32 n; + ::decode(n, p); + pg_list.resize(n); + for (unsigned i=0; i= 2) { ::decode(query_epoch, p); } + if (header.version >= 3) { + // get the pg_interval_map_t portion + for (unsigned i=0; i::const_iterator i = pg_list.begin(); + for (vector >::const_iterator i = pg_list.begin(); i != pg_list.end(); ++i) { if (i != pg_list.begin()) out << ","; - out << i->pgid; + out << i->first.pgid; + if (i->second.size()) + out << "(" << i->second.size() << ")"; } out << " epoch " << epoch << " query_epoch " << query_epoch diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc index f76caa97372d0..683569bbaa90a 100644 --- a/src/mon/AuthMonitor.cc +++ b/src/mon/AuthMonitor.cc @@ -29,7 +29,6 @@ #include "auth/KeyRing.h" #include "osd/osd_types.h" -#include "osd/PG.h" // yuck #include "common/config.h" #include diff --git a/src/mon/LogMonitor.cc b/src/mon/LogMonitor.cc index e2b4867b44a6f..2db044c6d2370 100644 --- a/src/mon/LogMonitor.cc +++ b/src/mon/LogMonitor.cc @@ -24,7 +24,6 @@ #include "common/Timer.h" #include "osd/osd_types.h" -#include "osd/PG.h" // yuck #include "common/config.h" #include diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index acfbacff0b70b..2c8c8d548b5e2 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -36,7 +36,6 @@ #include "common/perf_counters.h" #include "osd/osd_types.h" -#include "osd/PG.h" // yuck #include "common/config.h" #include "common/errno.h" diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index b1d3207862060..6e3aa4bb257bd 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -1113,6 +1113,7 @@ PG *OSD::_open_lock_pg(pg_t pgid, bool no_lockdep_check, bool hold_map_lock) PG *OSD::_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock, int role, vector& up, vector& acting, pg_history_t history, + pg_interval_map_t& pi, ObjectStore::Transaction& t) { assert(osd_lock.is_locked()); @@ -1132,7 +1133,7 @@ PG *OSD::_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock, history.last_epoch_started = history.epoch_created - 1; } - pg->init(role, up, acting, history, &t); + pg->init(role, up, acting, history, pi, &t); dout(7) << "_create_lock_pg " << *pg << dendl; return pg; @@ -1244,8 +1245,8 @@ void OSD::load_pgs() * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping * hasn't changed since the given epoch and we are the primary. */ -PG *OSD::get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& created, - bool primary, +PG *OSD::get_or_create_pg(const pg_info_t& info, pg_interval_map_t& pi, + epoch_t epoch, int from, int& created, bool primary, ObjectStore::Transaction **pt, C_Contexts **pfin) { @@ -1294,11 +1295,11 @@ PG *OSD::get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& c // ok, create PG locally using provided Info and History *pt = new ObjectStore::Transaction; *pfin = new C_Contexts(g_ceph_context); - pg = _create_lock_pg(info.pgid, create, false, role, up, acting, history, **pt); + pg = _create_lock_pg(info.pgid, create, false, role, up, acting, history, pi, **pt); created++; dout(10) << *pg << " is new" << dendl; - + // kick any waiters wake_pg_waiters(pg->info.pgid); @@ -2894,6 +2895,10 @@ void OSD::handle_scrub(MOSDScrub *m) bool OSD::scrub_should_schedule() { double loadavgs[1]; + + if (!is_active()) + return false; + if (getloadavg(loadavgs, 1) != 1) { dout(10) << "scrub_should_schedule couldn't read loadavgs\n" << dendl; return false; @@ -3255,6 +3260,14 @@ void OSD::handle_osd_map(MOSDMap *m) C_Contexts *fin = new C_Contexts(g_ceph_context); + // lock all pgs + for (hash_map::iterator i = pg_map.begin(); + i != pg_map.end(); + i++) { + PG *pg = i->second; + pg->lock_with_map_lock_held(true); + } + // advance through the new maps for (epoch_t cur = start; cur <= superblock.newest_map; cur++) { dout(10) << " advance to epoch " << cur << " (<= newest " << superblock.newest_map << ")" << dendl; @@ -3291,6 +3304,16 @@ void OSD::handle_osd_map(MOSDMap *m) activate_map(t, fin->contexts); } + // write and unlock pgs + for (hash_map::iterator i = pg_map.begin(); + i != pg_map.end(); + i++) { + PG *pg = i->second; + //pg->lock_with_map_lock_held(); + pg->write_if_dirty(t); + pg->unlock(); + } + bool do_shutdown = false; bool do_restart = false; if (osdmap->get_epoch() > 0 && @@ -3343,15 +3366,6 @@ void OSD::handle_osd_map(MOSDMap *m) // process waiters take_waiters(waiting_for_osdmap); - // write updated pg state to store - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - PG *pg = i->second; - if (pg->dirty_info) - pg->write_info(t); - } - // note in the superblock that we were clean thru the prior epoch if (boot_epoch && boot_epoch >= superblock.mounted) { superblock.mounted = boot_epoch; @@ -3544,10 +3558,14 @@ void OSD::advance_map(ObjectStore::Transaction& t, C_Contexts *tfin) vector newup, newacting; osdmap->pg_to_up_acting_osds(pg->info.pgid, newup, newacting); - pg->lock_with_map_lock_held(); + //pg->lock_with_map_lock_held(); + + // update pg's osdmap ref, assert lock is held + pg->reassert_lock_with_map_lock_held(); + dout(10) << "Scanning pg " << *pg << dendl; pg->handle_advance_map(osdmap, lastmap, newup, newacting, 0); - pg->unlock(); + //pg->unlock(); } // scan pgs with waiters @@ -3577,7 +3595,7 @@ void OSD::activate_map(ObjectStore::Transaction& t, list& tfin) dout(7) << "activate_map version " << osdmap->get_epoch() << dendl; - map< int, vector > notify_list; // primary -> list + map< int, vector > > notify_list; // primary -> list map< int, map > query_map; // peer -> PG -> get_summary_since map info_map; // peer -> message @@ -3590,7 +3608,7 @@ void OSD::activate_map(ObjectStore::Transaction& t, list& tfin) it != pg_map.end(); it++) { PG *pg = it->second; - pg->lock_with_map_lock_held(); + //pg->lock_with_map_lock_held(); if (pg->is_primary()) num_pg_primary++; @@ -3605,14 +3623,16 @@ void OSD::activate_map(ObjectStore::Transaction& t, list& tfin) if (!osdmap->have_pg_pool(pg->info.pgid.pool())) { //pool is deleted! queue_pg_for_deletion(pg); - pg->unlock(); + //pg->unlock(); continue; } PG::RecoveryCtx rctx(&query_map, &info_map, ¬ify_list, &tfin, &t); pg->handle_activate_map(&rctx); + + //pg->write_if_dirty(t); - pg->unlock(); + //pg->unlock(); } do_notifies(notify_list, osdmap->get_epoch()); // notify? (residual|replica) @@ -3901,8 +3921,9 @@ void OSD::do_split(PG *parent, set& childpgids, ObjectStore::Transaction& history.epoch_created = history.same_up_since = history.same_interval_since = history.same_primary_since = osdmap->get_epoch(); + pg_interval_map_t pi; PG *pg = _create_lock_pg(*q, true, true, - parent->get_role(), parent->up, parent->acting, history, t); + parent->get_role(), parent->up, parent->acting, history, pi, t); children[*q] = pg; dout(10) << " child " << *pg << dendl; } @@ -3910,7 +3931,7 @@ void OSD::do_split(PG *parent, set& childpgids, ObjectStore::Transaction& split_pg(parent, children, t); // reset pg - map< int, vector > notify_list; // primary -> list + map< int, vector > > notify_list; // primary -> list map< int, map > query_map; // peer -> PG -> get_summary_since map info_map; // peer -> message PG::RecoveryCtx rctx(&query_map, &info_map, ¬ify_list, &tfin->contexts, &t); @@ -3924,6 +3945,7 @@ void OSD::do_split(PG *parent, set& childpgids, ObjectStore::Transaction& for (map::iterator q = children.begin(); q != children.end(); q++) { PG *pg = q->second; pg->handle_create(&rctx); + pg->write_if_dirty(t); wake_pg_waiters(pg->info.pgid); pg->unlock(); } @@ -4139,15 +4161,16 @@ void OSD::handle_pg_create(OpRequestRef op) if (can_create_pg(pgid)) { ObjectStore::Transaction *t = new ObjectStore::Transaction; C_Contexts *fin = new C_Contexts(g_ceph_context); - + pg_interval_map_t pi; PG *pg = _create_lock_pg(pgid, true, false, - 0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, history, + 0, creating_pgs[pgid].acting, creating_pgs[pgid].acting, history, pi, *t); creating_pgs.erase(pgid); wake_pg_waiters(pg->info.pgid); PG::RecoveryCtx rctx(&query_map, &info_map, 0, &fin->contexts, t); pg->handle_create(&rctx); + pg->write_if_dirty(*t); pg->update_stats(); int tr = store->queue_transaction(&pg->osr, t, new ObjectStore::C_DeleteTransaction(t), fin); @@ -4173,10 +4196,10 @@ void OSD::handle_pg_create(OpRequestRef op) * content for, and they are primary for. */ -void OSD::do_notifies(map< int, vector >& notify_list, +void OSD::do_notifies(map< int, vector > >& notify_list, epoch_t query_epoch) { - for (map< int, vector >::iterator it = notify_list.begin(); + for (map< int, vector > >::iterator it = notify_list.begin(); it != notify_list.end(); it++) { if (it->first == whoami) { @@ -4217,8 +4240,8 @@ void OSD::do_infos(map& info_map) for (map::iterator p = info_map.begin(); p != info_map.end(); ++p) { - for (vector::iterator i = p->second->pg_info.begin(); - i != p->second->pg_info.end(); + for (vector >::iterator i = p->second->pg_list.begin(); + i != p->second->pg_list.end(); ++i) { dout(20) << "Sending info " << *i << " to osd." << p->first << dendl; } @@ -4253,19 +4276,19 @@ void OSD::handle_pg_notify(OpRequestRef op) map info_map; int created = 0; - for (vector::iterator it = m->get_pg_list().begin(); + for (vector >::iterator it = m->get_pg_list().begin(); it != m->get_pg_list().end(); it++) { PG *pg = 0; - if (it->pgid.preferred() >= 0) { - dout(20) << "ignoring localized pg " << it->pgid << dendl; + if (it->first.pgid.preferred() >= 0) { + dout(20) << "ignoring localized pg " << it->first.pgid << dendl; continue; } ObjectStore::Transaction *t; C_Contexts *fin; - pg = get_or_create_pg(*it, m->get_epoch(), from, created, true, &t, &fin); + pg = get_or_create_pg(it->first, it->second, m->get_epoch(), from, created, true, &t, &fin); if (!pg) continue; @@ -4278,7 +4301,8 @@ void OSD::handle_pg_notify(OpRequestRef op) } PG::RecoveryCtx rctx(&query_map, &info_map, 0, &fin->contexts, t); - pg->handle_notify(from, *it, &rctx); + pg->handle_notify(from, it->first, &rctx); + pg->write_if_dirty(*t); int tr = store->queue_transaction(&pg->osr, t, new ObjectStore::C_DeleteTransaction(t), fin); assert(tr == 0); @@ -4311,7 +4335,7 @@ void OSD::handle_pg_log(OpRequestRef op) int created = 0; ObjectStore::Transaction *t; C_Contexts *fin; - PG *pg = get_or_create_pg(m->info, m->get_epoch(), + PG *pg = get_or_create_pg(m->info, m->past_intervals, m->get_epoch(), from, created, false, &t, &fin); if (!pg) { return; @@ -4331,6 +4355,7 @@ void OSD::handle_pg_log(OpRequestRef op) map< int, MOSDPGInfo* > info_map; PG::RecoveryCtx rctx(&query_map, &info_map, 0, &fin->contexts, t); pg->handle_log(from, m, &rctx); + pg->write_if_dirty(*t); pg->unlock(); do_queries(query_map); do_infos(info_map); @@ -4359,17 +4384,17 @@ void OSD::handle_pg_info(OpRequestRef op) int created = 0; - for (vector::iterator p = m->pg_info.begin(); - p != m->pg_info.end(); + for (vector >::iterator p = m->pg_list.begin(); + p != m->pg_list.end(); ++p) { - if (p->pgid.preferred() >= 0) { - dout(10) << "ignoring localized pg " << p->pgid << dendl; + if (p->first.pgid.preferred() >= 0) { + dout(10) << "ignoring localized pg " << p->first.pgid << dendl; continue; } ObjectStore::Transaction *t = 0; C_Contexts *fin = 0; - PG *pg = get_or_create_pg(*p, m->get_epoch(), + PG *pg = get_or_create_pg(p->first, p->second, m->get_epoch(), from, created, false, &t, &fin); if (!pg) continue; @@ -4384,7 +4409,8 @@ void OSD::handle_pg_info(OpRequestRef op) PG::RecoveryCtx rctx(0, &info_map, 0, &fin->contexts, t); - pg->handle_info(from, *p, &rctx); + pg->handle_info(from, p->first, &rctx); + pg->write_if_dirty(*t); int tr = store->queue_transaction(&pg->osr, t, new ObjectStore::C_DeleteTransaction(t), fin); assert(!tr); @@ -4594,7 +4620,7 @@ void OSD::handle_pg_query(OpRequestRef op) op->mark_started(); - map< int, vector > notify_list; + map< int, vector > > notify_list; for (map::iterator it = m->pg_list.begin(); it != m->pg_list.end(); @@ -4635,7 +4661,7 @@ void OSD::handle_pg_query(OpRequestRef op) cluster_messenger->send_message(mlog, osdmap->get_cluster_inst(from)); } else { - notify_list[from].push_back(empty); + notify_list[from].push_back(make_pair(empty, pg_interval_map_t())); } continue; } @@ -4669,9 +4695,11 @@ void OSD::handle_pg_query(OpRequestRef op) continue; } + /* FIXME: do not do this unless/until we also write any modified history to disk. unreg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp); pg->info.history.merge(it->second.history); reg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp); + */ // ok, process query! PG::RecoveryCtx rctx(0, 0, ¬ify_list, 0, 0); @@ -4971,7 +4999,7 @@ void OSD::do_recovery(PG *pg) ObjectStore::Transaction *t = new ObjectStore::Transaction; C_Contexts *fin = new C_Contexts(g_ceph_context); - map< int, vector > notify_list; // primary -> list + map< int, vector > > notify_list; // primary -> list map< int, map > query_map; // peer -> PG -> get_summary_since map info_map; // peer -> message PG::RecoveryCtx rctx(&query_map, &info_map, 0, &fin->contexts, t); diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 6b345f2a769d9..bfcaadf9e91af 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -471,11 +471,12 @@ class OSD : public Dispatcher { PG *_open_lock_pg(pg_t pg, bool no_lockdep_check=false, bool hold_map_lock=false); PG *_create_lock_pg(pg_t pgid, bool newly_created, bool hold_map_lock, int role, vector& up, vector& acting, pg_history_t history, - ObjectStore::Transaction& t); + pg_interval_map_t& pi, ObjectStore::Transaction& t); PG *lookup_lock_raw_pg(pg_t pgid); - PG *get_or_create_pg(const pg_info_t& info, epoch_t epoch, int from, int& pcreated, bool primary, + PG *get_or_create_pg(const pg_info_t& info, pg_interval_map_t& pi, + epoch_t epoch, int from, int& pcreated, bool primary, ObjectStore::Transaction **pt, C_Contexts **pfin); @@ -617,7 +618,7 @@ class OSD : public Dispatcher { // -- generic pg peering -- - void do_notifies(map< int, vector >& notify_list, + void do_notifies(map< int, vector > >& notify_list, epoch_t query_epoch); void do_queries(map< int, map >& query_map); void do_infos(map& info_map); diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 0b83f7dcb85a9..8e1e4b232fa80 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -48,6 +48,12 @@ void PG::lock(bool no_lockdep) osd->map_lock.put_read(); _lock.Lock(no_lockdep); osdmap_ref.swap(map); + + // if we have unrecorded dirty state with the lock dropped, there is a bug + assert(!dirty_info); + assert(!dirty_log); + + dout(30) << "lock" << dendl; } /* @@ -58,6 +64,29 @@ void PG::lock_with_map_lock_held(bool no_lockdep) { _lock.Lock(no_lockdep); osdmap_ref = osd->osdmap; + + // if we have unrecorded dirty state with the lock dropped, there is a bug + assert(!dirty_info); + assert(!dirty_log); + + dout(30) << "lock_with_map_lock_held" << dendl; +} + +void PG::reassert_lock_with_map_lock_held() +{ + assert(_lock.is_locked()); + osdmap_ref = osd->osdmap; + + dout(30) << "reassert_lock_with_map_lock_held" << dendl; +} + +void PG::unlock() +{ + dout(30) << "unlock" << dendl; + assert(!dirty_info); + assert(!dirty_log); + osdmap_ref.reset(); + _lock.Unlock(); } std::string PG::gen_prefix() const @@ -235,7 +264,8 @@ bool PG::proc_replica_info(int from, pg_info_t &oinfo) might_have_unfound.insert(from); osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); - info.history.merge(oinfo.history); + if (info.history.merge(oinfo.history)) + dirty_info = true; osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); // stray? @@ -353,6 +383,9 @@ void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead) for (list::iterator d = divergent.begin(); d != divergent.end(); d++) merge_old_entry(t, *d); + + dirty_info = true; + dirty_log = true; } void PG::merge_log(ObjectStore::Transaction& t, @@ -406,6 +439,7 @@ void PG::merge_log(ObjectStore::Transaction& t, // do we have divergent entries to throw out? if (olog.head < log.head) { rewind_divergent_log(t, olog.head); + changed = true; } // extend on head? @@ -484,8 +518,8 @@ void PG::merge_log(ObjectStore::Transaction& t, dout(10) << "merge_log result " << log << " " << missing << " changed=" << changed << dendl; if (changed) { - write_info(t); - write_log(t); + dirty_info = true; + dirty_log = true; } } @@ -678,17 +712,6 @@ bool PG::needs_recovery() const void PG::generate_past_intervals() { - // Do we already have the intervals we want? - map::const_iterator pif = past_intervals.begin(); - if (pif != past_intervals.end()) { - if (pif->first <= info.history.last_epoch_clean) { - dout(10) << __func__ << ": already have past intervals back to " - << info.history.last_epoch_clean << dendl; - return; - } - past_intervals.clear(); - } - epoch_t first_epoch = 0; epoch_t stop = MAX(info.history.epoch_created, info.history.last_epoch_clean); if (stop < osd->superblock.oldest_map) @@ -701,6 +724,18 @@ void PG::generate_past_intervals() return; } + // Do we already have the intervals we want? + map::const_iterator pif = past_intervals.begin(); + if (pif != past_intervals.end()) { + if (pif->first <= stop) { + dout(10) << __func__ << " already have past intervals back to " + << stop << dendl; + return; + } + dout(10) << __func__ << " only have past intervals back to " << pif->first << dendl; + last_epoch = pif->first - 1; + } + dout(10) << __func__ << " over epochs " << stop << "-" << last_epoch << dendl; OSDMapRef nextmap = osd->get_map(last_epoch); @@ -720,7 +755,7 @@ void PG::generate_past_intervals() break; } - Interval &i = past_intervals[first_epoch]; + pg_interval_t &i = past_intervals[first_epoch]; i.first = first_epoch; i.last = last_epoch; i.up.swap(tup); @@ -760,6 +795,9 @@ void PG::generate_past_intervals() dout(10) << "generate_past_intervals " << i << " : empty" << dendl; } } + + // record our work. + dirty_info = true; } /* @@ -769,8 +807,8 @@ void PG::generate_past_intervals() */ void PG::trim_past_intervals() { - std::map::iterator pif = past_intervals.begin(); - std::map::iterator end = past_intervals.end(); + std::map::iterator pif = past_intervals.begin(); + std::map::iterator end = past_intervals.end(); while (pif != end) { if (pif->second.last >= info.history.last_epoch_clean) return; @@ -1137,10 +1175,10 @@ void PG::build_might_have_unfound() generate_past_intervals(); // We need to decide who might have unfound objects that we need - std::map::const_reverse_iterator p = past_intervals.rbegin(); - std::map::const_reverse_iterator end = past_intervals.rend(); + std::map::const_reverse_iterator p = past_intervals.rbegin(); + std::map::const_reverse_iterator end = past_intervals.rend(); for (; p != end; ++p) { - const Interval &interval(p->second); + const pg_interval_t &interval(p->second); // We already have all the objects that exist at last_epoch_clean, // so there's no need to look at earlier intervals. if (interval.last < info.history.last_epoch_clean) @@ -1222,8 +1260,8 @@ void PG::activate(ObjectStore::Transaction& t, list& tfin, need_up_thru = false; // write pg info, log - write_info(t); - write_log(t); + dirty_info = true; + dirty_log = true; // clean up stray objects clean_up_local(t); @@ -1297,7 +1335,7 @@ void PG::activate(ObjectStore::Transaction& t, list& tfin, dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl; if (activator_map->count(peer) == 0) (*activator_map)[peer] = new MOSDPGInfo(get_osdmap()->get_epoch()); - (*activator_map)[peer]->pg_info.push_back(info); + (*activator_map)[peer]->pg_list.push_back(make_pair(info, past_intervals)); } else { dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl; m = new MOSDPGLog(get_osdmap()->get_epoch(), info); @@ -1330,12 +1368,15 @@ void PG::activate(ObjectStore::Transaction& t, list& tfin, m->log.copy_after(log, pi.last_update); } + // share past_intervals if we are creating the pg on the replica + if (pi.dne()) + m->past_intervals = past_intervals; + if (pi.last_backfill != hobject_t::get_max()) state_set(PG_STATE_BACKFILL); else active++; - // update local version of peer's missing list! if (m && pi.last_backfill != hobject_t()) { for (list::iterator p = m->log.log.begin(); @@ -1480,9 +1521,17 @@ void PG::_activate_committed(epoch_t e, entity_inst_t& primary) MOSDPGInfo *m = new MOSDPGInfo(e); pg_info_t i = info; i.history.last_epoch_started = e; - m->pg_info.push_back(i); + m->pg_list.push_back(make_pair(i, pg_interval_map_t())); osd->cluster_messenger->send_message(m, primary); } + + if (dirty_info) { + ObjectStore::Transaction *t = new ObjectStore::Transaction; + write_info(*t); + int tr = osd->store->queue_transaction(&osr, t); + assert(tr == 0); + } + unlock(); put(); } @@ -1501,10 +1550,7 @@ void PG::all_activated_and_committed() info.history.last_epoch_started = get_osdmap()->get_epoch(); share_pg_info(); - ObjectStore::Transaction *t = new ObjectStore::Transaction; - write_info(*t); - int tr = osd->store->queue_transaction(&osr, t); - assert(tr == 0); + dirty_info = true; } void PG::queue_snap_trim() @@ -1811,19 +1857,24 @@ void PG::clear_stats() * @param newup up set * @param newacting acting set * @param history pg history + * @param pi past_intervals * @param t transaction to write out our new state in */ void PG::init(int role, vector& newup, vector& newacting, pg_history_t& history, + pg_interval_map_t& pi, ObjectStore::Transaction *t) { dout(10) << "init role " << role << " up " << newup << " acting " << newacting - << " history " << history << dendl; + << " history " << history + << " " << pi.size() << " past_intervals" + << dendl; set_role(role); acting = newacting; up = newup; info.history = history; + past_intervals.swap(pi); info.stats.up = up; info.stats.acting = acting; @@ -1893,6 +1944,14 @@ void PG::write_log(ObjectStore::Transaction& t) dirty_log = false; } +void PG::write_if_dirty(ObjectStore::Transaction& t) +{ + if (dirty_info) + write_info(t); + if (dirty_log) + write_log(t); +} + void PG::trim(ObjectStore::Transaction& t, eversion_t trim_to) { // trim? @@ -3266,7 +3325,7 @@ void PG::share_pg_info() for (unsigned i=1; iget_epoch()); - m->pg_info.push_back(info); + m->pg_list.push_back(make_pair(info, pg_interval_map_t())); osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(peer)); } } @@ -3303,18 +3362,6 @@ void PG::share_pg_log() } } -void PG::fulfill_info(int from, const pg_query_t &query, - pair ¬ify_info) -{ - assert(!acting.empty()); - assert(from == acting[0]); - assert(query.type == pg_query_t::INFO); - - // info - dout(10) << "sending info" << dendl; - notify_info = make_pair(from, info); -} - void PG::fulfill_log(int from, const pg_query_t &query, epoch_t query_epoch) { assert(!acting.empty()); @@ -3356,10 +3403,10 @@ bool PG::may_need_replay(const OSDMapRef osdmap) const { bool crashed = false; - for (map::const_reverse_iterator p = past_intervals.rbegin(); + for (map::const_reverse_iterator p = past_intervals.rbegin(); p != past_intervals.rend(); p++) { - const Interval &interval = p->second; + const pg_interval_t &interval = p->second; dout(10) << "may_need_replay " << interval << dendl; if (interval.last < info.history.last_epoch_started) @@ -3512,7 +3559,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap, dirty_info = true; } else if (acting != oldacting || up != oldup) { // remember past interval - PG::Interval& i = past_intervals[info.history.same_interval_since]; + pg_interval_t& i = past_intervals[info.history.same_interval_since]; i.first = info.history.same_interval_since; i.last = osdmap->get_epoch() - 1; i.acting = oldacting; @@ -3632,11 +3679,14 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo) assert(!is_primary()); assert(is_stray() || is_active()); - if (info.last_backfill.is_max()) + if (info.last_backfill.is_max()) { info.stats = oinfo.stats; + dirty_info = true; + } osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); - info.history.merge(oinfo.history); + if (info.history.merge(oinfo.history)) + dirty_info = true; osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp); // Handle changes to purged_snaps ONLY IF we have caught up @@ -3651,7 +3701,7 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo) << " removed " << p << dendl; adjust_local_snaps(); } - write_info(t); + dirty_info = true; } } @@ -3664,6 +3714,11 @@ ostream& operator<<(ostream& out, const PG& pg) out << " r=" << pg.get_role(); out << " lpr=" << pg.get_last_peering_reset(); + if (pg.past_intervals.size()) { + out << " pi=" << pg.past_intervals.begin()->first << "-" << pg.past_intervals.rbegin()->second.last + << "/" << pg.past_intervals.size(); + } + if (pg.is_active() && pg.last_update_ondisk != pg.info.last_update) out << " luod=" << pg.last_update_ondisk; @@ -3835,6 +3890,11 @@ boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap) { PG *pg = context< RecoveryMachine >().pg; dout(10) << "Reset advmap" << dendl; + + // make sure we have past_intervals filled in. hopefully this will happen + // _before_ we are active. + pg->generate_past_intervals(); + pg->remove_down_peer_info(advmap.osdmap); if (pg->acting_up_affected(advmap.newup, advmap.newacting)) { dout(10) << "up or acting affected, calling start_peering_interval again" @@ -3849,7 +3909,7 @@ boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&) PG *pg = context< RecoveryMachine >().pg; if (pg->is_stray() && pg->get_primary() >= 0) { context< RecoveryMachine >().send_notify(pg->get_primary(), - pg->info); + pg->info, pg->past_intervals); } pg->update_heartbeat_peers(); @@ -4237,7 +4297,7 @@ boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&) PG *pg = context< RecoveryMachine >().pg; if (pg->is_stray() && pg->get_primary() >= 0) { context< RecoveryMachine >().send_notify(pg->get_primary(), - pg->info); + pg->info, pg->past_intervals); } return discard_event(); } @@ -4322,9 +4382,11 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query) { PG *pg = context< RecoveryMachine >().pg; if (query.query.type == pg_query_t::INFO) { - pair notify_info; - pg->fulfill_info(query.from, query.query, notify_info); - context< RecoveryMachine >().send_notify(notify_info.first, notify_info.second); + dout(10) << "sending info to osd." << query.from << dendl; + assert(!pg->acting.empty()); + assert(query.from == pg->acting[0]); + assert(query.query.type == pg_query_t::INFO); + context< RecoveryMachine >().send_notify(query.from, pg->info, pg->past_intervals); } else { pg->fulfill_log(query.from, query.query, query.query_epoch); } @@ -4336,7 +4398,7 @@ boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&) PG *pg = context< RecoveryMachine >().pg; if (pg->is_stray() && pg->get_primary() >= 0) { context< RecoveryMachine >().send_notify(pg->get_primary(), - pg->info); + pg->info, pg->past_intervals); } return discard_event(); } @@ -4439,14 +4501,14 @@ boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& in * that interval. */ if (pg->info.history.last_epoch_started) { - for (map::reverse_iterator p = pg->past_intervals.rbegin(); + for (map::reverse_iterator p = pg->past_intervals.rbegin(); p != pg->past_intervals.rend(); ++p) { if (p->first < pg->info.history.last_epoch_started) break; if (!p->second.maybe_went_rw) continue; - Interval& interval = p->second; + pg_interval_t& interval = p->second; dout(10) << " last maybe_went_rw interval was " << interval << dendl; OSDMapRef osdmap = pg->get_osdmap(); @@ -4969,7 +5031,7 @@ void PG::RecoveryState::handle_query_state(Formatter *f) #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ") PG::PriorSet::PriorSet(const OSDMap &osdmap, - const map &past_intervals, + const map &past_intervals, const vector &up, const vector &acting, const pg_info_t &info, @@ -5030,10 +5092,10 @@ PG::PriorSet::PriorSet(const OSDMap &osdmap, for (unsigned i=0; i::const_reverse_iterator p = past_intervals.rbegin(); + for (map::const_reverse_iterator p = past_intervals.rbegin(); p != past_intervals.rend(); p++) { - const Interval &interval = p->second; + const pg_interval_t &interval = p->second; dout(10) << "build_prior " << interval << dendl; if (interval.last < info.history.last_epoch_started) diff --git a/src/osd/PG.h b/src/osd/PG.h index e8a31bbf982e6..bb7491199864b 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -358,16 +358,15 @@ class PG { bool deleting; // true while RemoveWQ should be chewing on us void lock(bool no_lockdep = false); - void unlock() { - //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl; - osdmap_ref.reset(); - _lock.Unlock(); - } + void unlock(); /* During handle_osd_map, the osd holds a write lock to the osdmap. * *_with_map_lock_held assume that the map_lock is already held */ void lock_with_map_lock_held(bool no_lockdep = false); + // assert we still have lock held, and update our map ref + void reassert_lock_with_map_lock_held(); + void assert_locked() { assert(_lock.is_locked()); } @@ -400,56 +399,6 @@ class PG { bool dirty_info, dirty_log; public: - struct Interval { - vector up, acting; - epoch_t first, last; - bool maybe_went_rw; - - Interval() : first(0), last(0), maybe_went_rw(false) {} - - void encode(bufferlist& bl) const { - ENCODE_START(2, 2, bl); - ::encode(first, bl); - ::encode(last, bl); - ::encode(up, bl); - ::encode(acting, bl); - ::encode(maybe_went_rw, bl); - ENCODE_FINISH(bl); - } - void decode(bufferlist::iterator& bl) { - DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); - ::decode(first, bl); - ::decode(last, bl); - ::decode(up, bl); - ::decode(acting, bl); - ::decode(maybe_went_rw, bl); - DECODE_FINISH(bl); - } - void dump(Formatter *f) const { - f->dump_unsigned("first", first); - f->dump_unsigned("last", last); - f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0); - f->open_array_section("up"); - for (vector::const_iterator p = up.begin(); p != up.end(); ++p) - f->dump_int("osd", *p); - f->close_section(); - f->open_array_section("acting"); - for (vector::const_iterator p = acting.begin(); p != acting.end(); ++p) - f->dump_int("osd", *p); - f->close_section(); - } - static void generate_test_instances(list& o) { - o.push_back(new Interval); - o.push_back(new Interval); - o.back()->up.push_back(1); - o.back()->acting.push_back(2); - o.back()->acting.push_back(3); - o.back()->first = 4; - o.back()->last = 5; - o.back()->maybe_went_rw = true; - } - }; - WRITE_CLASS_ENCODER(Interval) // pg state pg_info_t info; @@ -463,7 +412,7 @@ class PG { set missing_loc_sources; // superset of missing_loc locations interval_set snap_collections; - map past_intervals; + map past_intervals; interval_set snap_trimq; @@ -505,7 +454,7 @@ class PG { bool pg_down; /// some down osds are included in @a cur; the DOWN pg state bit should be set. PriorSet(const OSDMap &osdmap, - const map &past_intervals, + const map &past_intervals, const vector &up, const vector &acting, const pg_info_t &info, @@ -525,14 +474,14 @@ class PG { utime_t start_time; map< int, map > *query_map; map< int, MOSDPGInfo* > *info_map; - map< int, vector > *notify_list; + map< int, vector > > *notify_list; list< Context* > *context_list; ObjectStore::Transaction *transaction; RecoveryCtx() : query_map(0), info_map(0), notify_list(0), context_list(0), transaction(0) {} RecoveryCtx(map< int, map > *query_map, map< int, MOSDPGInfo* > *info_map, - map< int, vector > *notify_list, + map< int, vector > > *notify_list, list< Context* > *context_list, ObjectStore::Transaction *transaction) : query_map(query_map), info_map(info_map), @@ -953,9 +902,9 @@ class PG { return state->rctx->context_list; } - void send_notify(int to, const pg_info_t &info) { + void send_notify(int to, const pg_info_t& info, const pg_interval_map_t& pi) { assert(state->rctx->notify_list); - (*state->rctx->notify_list)[to].push_back(info); + (*state->rctx->notify_list)[to].push_back(make_pair(info, pi)); } }; friend class RecoveryMachine; @@ -1341,7 +1290,7 @@ class PG { bool is_empty() const { return info.last_update == eversion_t(0,0); } void init(int role, vector& up, vector& acting, pg_history_t& history, - ObjectStore::Transaction *t); + pg_interval_map_t& pim, ObjectStore::Transaction *t); // pg on-disk state void do_pending_flush(); @@ -1349,6 +1298,8 @@ class PG { void write_info(ObjectStore::Transaction& t); void write_log(ObjectStore::Transaction& t); + void write_if_dirty(ObjectStore::Transaction& t); + void add_log_entry(pg_log_entry_t& e, bufferlist& log_bl); void append_log(vector& logv, eversion_t trim_to, ObjectStore::Transaction &t); @@ -1381,8 +1332,6 @@ class PG { const vector& newacting); void set_last_peering_reset(); - void fulfill_info(int from, const pg_query_t &query, - pair ¬ify_info); void fulfill_log(int from, const pg_query_t &query, epoch_t query_epoch); bool acting_up_affected(const vector& newup, const vector& newacting); bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch); @@ -1458,18 +1407,8 @@ class PG { utime_t expire) = 0; }; -WRITE_CLASS_ENCODER(PG::Interval) WRITE_CLASS_ENCODER(PG::OndiskLog) -inline ostream& operator<<(ostream& out, const PG::Interval& i) -{ - out << "interval(" << i.first << "-" << i.last << " " << i.up << "/" << i.acting; - if (i.maybe_went_rw) - out << " maybe_went_rw"; - out << ")"; - return out; -} - ostream& operator<<(ostream& out, const PG& pg); #endif diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 5b398af85094f..f713a6da7fc73 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -6479,7 +6479,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed) // tell replicas for (unsigned i=1; iget_epoch()); - m->pg_info.push_back(info); + m->pg_list.push_back(make_pair(info, pg_interval_map_t())); osd->cluster_messenger->send_message(m, get_osdmap()->get_cluster_inst(acting[i])); } } diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 050dfe15a6028..b9bc50c8cf4db 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -1302,6 +1302,68 @@ void pg_info_t::generate_test_instances(list& o) } +// -- pg_interval_t -- + +void pg_interval_t::encode(bufferlist& bl) const +{ + ENCODE_START(2, 2, bl); + ::encode(first, bl); + ::encode(last, bl); + ::encode(up, bl); + ::encode(acting, bl); + ::encode(maybe_went_rw, bl); + ENCODE_FINISH(bl); +} + +void pg_interval_t::decode(bufferlist::iterator& bl) +{ + DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl); + ::decode(first, bl); + ::decode(last, bl); + ::decode(up, bl); + ::decode(acting, bl); + ::decode(maybe_went_rw, bl); + DECODE_FINISH(bl); +} + +void pg_interval_t::dump(Formatter *f) const +{ + f->dump_unsigned("first", first); + f->dump_unsigned("last", last); + f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0); + f->open_array_section("up"); + for (vector::const_iterator p = up.begin(); p != up.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); + f->open_array_section("acting"); + for (vector::const_iterator p = acting.begin(); p != acting.end(); ++p) + f->dump_int("osd", *p); + f->close_section(); +} + +void pg_interval_t::generate_test_instances(list& o) +{ + o.push_back(new pg_interval_t); + o.push_back(new pg_interval_t); + o.back()->up.push_back(1); + o.back()->acting.push_back(2); + o.back()->acting.push_back(3); + o.back()->first = 4; + o.back()->last = 5; + o.back()->maybe_went_rw = true; +} + +ostream& operator<<(ostream& out, const pg_interval_t& i) +{ + out << "interval(" << i.first << "-" << i.last << " " << i.up << "/" << i.acting; + if (i.maybe_went_rw) + out << " maybe_went_rw"; + out << ")"; + return out; +} + + + // -- pg_query_t -- void pg_query_t::dump(Formatter *f) const diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 23833153b4547..493219cb9f650 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -926,20 +926,34 @@ struct pg_history_t { last_epoch_started(0), last_epoch_clean(0), last_epoch_split(0), same_up_since(0), same_interval_since(0), same_primary_since(0) {} - void merge(const pg_history_t &other) { + bool merge(const pg_history_t &other) { // Here, we only update the fields which cannot be calculated from the OSDmap. - if (epoch_created < other.epoch_created) + bool modified = false; + if (epoch_created < other.epoch_created) { epoch_created = other.epoch_created; - if (last_epoch_started < other.last_epoch_started) + modified = true; + } + if (last_epoch_started < other.last_epoch_started) { last_epoch_started = other.last_epoch_started; - if (last_epoch_clean < other.last_epoch_clean) + modified = true; + } + if (last_epoch_clean < other.last_epoch_clean) { last_epoch_clean = other.last_epoch_clean; - if (last_epoch_split < other.last_epoch_started) - last_epoch_split = other.last_epoch_started; - if (other.last_scrub > last_scrub) + modified = true; + } + if (last_epoch_split < other.last_epoch_started) { + last_epoch_split = other.last_epoch_started; + modified = true; + } + if (other.last_scrub > last_scrub) { last_scrub = other.last_scrub; - if (other.last_scrub_stamp > last_scrub_stamp) + modified = true; + } + if (other.last_scrub_stamp > last_scrub_stamp) { last_scrub_stamp = other.last_scrub_stamp; + modified = true; + } + return modified; } void encode(bufferlist& bl) const; @@ -1023,6 +1037,28 @@ inline ostream& operator<<(ostream& out, const pg_info_t& pgi) } +/** + * pg_interval_t - information about a past interval + */ +struct pg_interval_t { + vector up, acting; + epoch_t first, last; + bool maybe_went_rw; + + pg_interval_t() : first(0), last(0), maybe_went_rw(false) {} + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(pg_interval_t) + +ostream& operator<<(ostream& out, const pg_interval_t& i); + +typedef map pg_interval_map_t; + + /** * pg_query_t - used to ask a peer for information about a pg. * diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index 245d4fe0a0555..5b50147c52070 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -26,7 +26,6 @@ TYPE(OSDMap::Incremental) TYPE(CrushWrapper) #include "osd/PG.h" -TYPE(PG::Interval) TYPE(PG::OndiskLog) #include "osd/osd_types.h" @@ -44,6 +43,7 @@ TYPE(pg_stat_t) TYPE(pool_stat_t) TYPE(pg_history_t) TYPE(pg_info_t) +TYPE(pg_interval_t) TYPE(pg_query_t) TYPE(pg_log_entry_t) TYPE(pg_log_t)