Skip to content

Commit

Permalink
mds: use 128 bits for waiters on MDSCacheObject
Browse files Browse the repository at this point in the history
Adding a new inode lock will overflow inode wait bits into the MDSCacheObject
wait bits. Make space for the quiescelock.

This includes a minor refactor to no longer attempt scoping the set of masks we
test in MDSCacheObject::waiting when calling MDSCacheObject::is_waiter_for.
This optimization wasn't worth the overhead and would be awkard to keep as
std::bitset cannot be used as a key for a std::multimap (easily).  Instead, we
use the sequence number as a key which helps us to avoid allocating another map
whenever we call MDSCacheObject::take_waiting.

Signed-off-by: Patrick Donnelly <[email protected]>
  • Loading branch information
batrick committed Mar 20, 2024
1 parent a9964a7 commit 84b33ea
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 72 deletions.
3 changes: 2 additions & 1 deletion src/mds/CDir.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,10 +182,11 @@ class CDir : public MDSCacheObject, public Counter<CDir> {
static const uint64_t WAIT_COMPLETE = (1<<1); // wait for complete dir contents
static const uint64_t WAIT_FROZEN = (1<<2); // auth pins removed
static const uint64_t WAIT_CREATED = (1<<3); // new dirfrag is logged
static const uint64_t WAIT_BITS = 4;

static const int WAIT_DNLOCK_OFFSET = 4;

static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1);
static const uint64_t WAIT_ANY_MASK = ((1ul << WAIT_BITS) - 1);
static const uint64_t WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH);

// -- dump flags --
Expand Down
3 changes: 2 additions & 1 deletion src/mds/CInode.h
Original file line number Diff line number Diff line change
Expand Up @@ -398,8 +398,9 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CIno
static const uint64_t WAIT_FROZEN = (1<<1);
static const uint64_t WAIT_TRUNC = (1<<2);
static const uint64_t WAIT_FLOCK = (1<<3);
static const uint64_t WAIT_BITS = 4;

static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1);
static const uint64_t WAIT_ANY_MASK = ((1ul << WAIT_BITS) - 1);

// misc
static const unsigned EXPORT_NONCE = 1; // nonce given to replicas created by export
Expand Down
53 changes: 16 additions & 37 deletions src/mds/MDSCacheObject.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ std::string_view MDSCacheObject::generic_pin_name(int p) const {
}
}

void MDSCacheObject::finish_waiting(uint64_t mask, int result) {
void MDSCacheObject::finish_waiting(waitmask_t mask, int result) {
MDSContext::vec finished;
take_waiting(mask, finished);
finish_contexts(g_ceph_context, finished, result);
Expand Down Expand Up @@ -85,52 +85,31 @@ void MDSCacheObject::dump_states(ceph::Formatter *f) const
f->dump_string("state", "rejoinundef");
}

bool MDSCacheObject::is_waiter_for(uint64_t mask, uint64_t min) {
if (!min) {
min = mask;
while (min & (min-1)) // if more than one bit is set
min &= min-1; // clear LSB
}
for (auto p = waiting.lower_bound(min); p != waiting.end(); ++p) {
if (p->first & mask) return true;
if (p->first > mask) return false;
bool MDSCacheObject::is_waiter_for(waitmask_t mask) {
for ([[maybe_unused]] auto& [seq, waiter] : waiting) {
if ((waiter.mask & mask).any()) {
return true;
}
}
return false;
}

void MDSCacheObject::take_waiting(uint64_t mask, MDSContext::vec& ls) {
if (waiting.empty()) return;

// process ordered waiters in the same order that they were added.
std::map<uint64_t, MDSContext*> ordered_waiters;

void MDSCacheObject::take_waiting(waitmask_t mask, MDSContext::vec& ls) {
if (waiting.empty()) {
return;
}
for (auto it = waiting.begin(); it != waiting.end(); ) {
if (it->first & mask) {
if (it->second.first > 0) {
ordered_waiters.insert(it->second);
} else {
ls.push_back(it->second.second);
}
// pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this))
// << "take_waiting mask " << hex << mask << dec << " took " << it->second
// << " tag " << hex << it->first << dec
// << " on " << *this
// << dendl;
waiting.erase(it++);
auto& waiter = it->second;
if ((waiter.mask & mask).any()) {
ls.push_back(waiter.c);
it = waiting.erase(it);
} else {
// pdout(10,g_conf()->debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second
// << " tag " << hex << it->first << dec
// << " on " << *this
// << dendl;
++it;
++it;
}
}
for (auto it = ordered_waiters.begin(); it != ordered_waiters.end(); ++it) {
ls.push_back(it->second);
}
if (waiting.empty()) {
put(PIN_WAITER);
waiting.clear();
waiting.clear(); // free internal map
}
}

Expand Down
59 changes: 43 additions & 16 deletions src/mds/MDSCacheObject.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,16 @@ class MDSCacheObject {
public:
typedef mempool::mds_co::compact_map<mds_rank_t,unsigned> replica_map_type;

/* Mask for waiters. It is 128 bits to accomodate lock waiters. Its layout:
*
* Most-significant Least significant
* [ SimpleLock 64 bits | MDSCacheObject 64 bits ]
*
* It is organized this way for simplicity not for compactness and because
* the total bits will be >64 bits.
*/
using waitmask_t = std::bitset<128>;

struct ptr_lt {
bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const {
return l->is_lt(r);
Expand Down Expand Up @@ -261,27 +271,39 @@ class MDSCacheObject {
unsigned get_replica_nonce() const { return replica_nonce; }
void set_replica_nonce(unsigned n) { replica_nonce = n; }

bool is_waiter_for(uint64_t mask, uint64_t min=0);
/* A uint64_t mask is accepted to accomodate existing code that expects the
* mask to actually be a 64 bit integer.
*/
bool is_waiter_for(uint64_t mask) {
return is_waiter_for(waitmask_t(mask));
}
bool is_waiter_for(waitmask_t mask);

virtual void add_waiter(uint64_t mask, MDSContext *c) {
add_waiter(waitmask_t(mask), c);
}
void add_waiter(waitmask_t mask, MDSContext *c) {
if (waiting.empty())
get(PIN_WAITER);

uint64_t seq = 0;
if (mask & WAIT_ORDERED) {
waiter_seq_t seq = 0;
if ((mask & waitmask_t(WAIT_ORDERED)).any()) {
seq = ++last_wait_seq;
mask &= ~WAIT_ORDERED;
mask &= ~waitmask_t(WAIT_ORDERED);
} else {
/* always at the front */
seq = 0;
}
waiting.insert(std::pair<uint64_t, std::pair<uint64_t, MDSContext*> >(
mask,
std::pair<uint64_t, MDSContext*>(seq, c)));
// pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this))
// << "add_waiter " << hex << mask << dec << " " << c
// << " on " << *this
// << dendl;

waiting.insert(std::pair<waiter_seq_t, waiter>(seq, waiter(mask, c)));
}
virtual void take_waiting(uint64_t mask, MDSContext::vec& ls) {
take_waiting(waitmask_t(mask), ls);
}
virtual void take_waiting(uint64_t mask, MDSContext::vec& ls);
void finish_waiting(uint64_t mask, int result = 0);
void take_waiting(waitmask_t mask, MDSContext::vec& ls);
void finish_waiting(uint64_t mask, int result = 0) {
finish_waiting(waitmask_t(mask), result);
}
void finish_waiting(waitmask_t mask, int result = 0);

// ---------------------------------------------
// locking
Expand Down Expand Up @@ -322,8 +344,13 @@ class MDSCacheObject {
// ---------------------------------------------
// waiting
private:
mempool::mds_co::compact_multimap<uint64_t, std::pair<uint64_t, MDSContext*>> waiting;
static uint64_t last_wait_seq;
using waiter_seq_t = uint64_t;
struct waiter {
waitmask_t mask;
MDSContext* c;
};
mempool::mds_co::compact_multimap<waiter_seq_t, struct waiter> waiting;
static waiter_seq_t last_wait_seq;
};

inline std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o) {
Expand Down
24 changes: 12 additions & 12 deletions src/mds/SimpleLock.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,18 @@ void SimpleLock::dump(ceph::Formatter *f) const {

int SimpleLock::get_wait_shift() const {
switch (get_type()) {
case CEPH_LOCK_DN: return 8;
case CEPH_LOCK_DVERSION: return 8 + 1*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IAUTH: return 8 + 2*SimpleLock::WAIT_BITS;
case CEPH_LOCK_ILINK: return 8 + 3*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IDFT: return 8 + 4*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IFILE: return 8 + 5*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IVERSION: return 8 + 6*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IXATTR: return 8 + 7*SimpleLock::WAIT_BITS;
case CEPH_LOCK_ISNAP: return 8 + 8*SimpleLock::WAIT_BITS;
case CEPH_LOCK_INEST: return 8 + 9*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IFLOCK: return 8 +10*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IPOLICY: return 8 +11*SimpleLock::WAIT_BITS;
case CEPH_LOCK_DN: return 0;
case CEPH_LOCK_DVERSION: return 1*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IAUTH: return 2*SimpleLock::WAIT_BITS;
case CEPH_LOCK_ILINK: return 3*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IDFT: return 4*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IFILE: return 5*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IVERSION: return 6*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IXATTR: return 7*SimpleLock::WAIT_BITS;
case CEPH_LOCK_ISNAP: return 8*SimpleLock::WAIT_BITS;
case CEPH_LOCK_INEST: return 9*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IFLOCK: return 10*SimpleLock::WAIT_BITS;
case CEPH_LOCK_IPOLICY: return 11*SimpleLock::WAIT_BITS;
default:
ceph_abort();
}
Expand Down
25 changes: 20 additions & 5 deletions src/mds/SimpleLock.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,6 @@ class SimpleLock {
int get_type() const { return type->type; }
const sm_t* get_sm() const { return type->sm; }

int get_wait_shift() const;
int get_cap_shift() const;
int get_cap_mask() const;

Expand All @@ -210,17 +209,33 @@ class SimpleLock {
void encode_locked_state(ceph::buffer::list& bl) {
parent->encode_lock_state(type->type, bl);
}

using waitmask_t = MDSCacheObject::waitmask_t;
static constexpr auto WAIT_ORDERED = waitmask_t(MDSCacheObject::WAIT_ORDERED);
int get_wait_shift() const;
MDSCacheObject::waitmask_t getmask(uint64_t mask) const {
/* See definition of MDSCacheObject::waitmask_t for waiter bits reserved
* for SimpleLock.
*/
static constexpr int simplelock_shift = 64;
auto waitmask = waitmask_t(mask);
int shift = get_wait_shift();
ceph_assert(shift < 64);
shift += simplelock_shift;
waitmask <<= shift;
return waitmask;
}
void finish_waiters(uint64_t mask, int r=0) {
parent->finish_waiting(mask << get_wait_shift(), r);
parent->finish_waiting(getmask(mask), r);
}
void take_waiting(uint64_t mask, MDSContext::vec& ls) {
parent->take_waiting(mask << get_wait_shift(), ls);
parent->take_waiting(getmask(mask), ls);
}
void add_waiter(uint64_t mask, MDSContext *c) {
parent->add_waiter((mask << get_wait_shift()) | MDSCacheObject::WAIT_ORDERED, c);
parent->add_waiter(getmask(mask) | WAIT_ORDERED, c);
}
bool is_waiter_for(uint64_t mask) const {
return parent->is_waiter_for(mask << get_wait_shift());
return parent->is_waiter_for(getmask(mask));
}

bool is_cached() const {
Expand Down

0 comments on commit 84b33ea

Please sign in to comment.