Skip to content

Commit

Permalink
librbd: add more robust retry handling to maintenance ops
Browse files Browse the repository at this point in the history
When image locking is enabled, snapshot create, resize, and
flatten are coordinated with the lock owner.  Previously, if the
lock owner changed during one of these operations, the
operation would fail.  Now librbd will attempt to restart the
operation with the new lock owner (or become the owner itself).

Signed-off-by: Jason Dillaman <[email protected]>
  • Loading branch information
Jason Dillaman authored and jdurgin committed Jan 24, 2015
1 parent 1b6467b commit cd9d8eb
Show file tree
Hide file tree
Showing 4 changed files with 185 additions and 110 deletions.
62 changes: 52 additions & 10 deletions src/librbd/ImageWatcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -267,12 +267,21 @@ int ImageWatcher::request_lock(
}

bool ImageWatcher::try_request_lock() {
RWLock::WLocker l(m_image_ctx.owner_lock);
assert(m_image_ctx.owner_lock.is_locked());
if (is_lock_owner()) {
return true;
}

int r = try_lock();
int r = 0;
m_image_ctx.owner_lock.put_read();
{
RWLock::WLocker l(m_image_ctx.owner_lock);
if (!is_lock_owner()) {
r = try_lock();
}
}
m_image_ctx.owner_lock.get_read();

if (r < 0) {
ldout(m_image_ctx.cct, 5) << "failed to acquire exclusive lock:"
<< cpp_strerror(r) << dendl;
Expand All @@ -292,8 +301,14 @@ bool ImageWatcher::try_request_lock() {
void ImageWatcher::finalize_request_lock() {
cancel_retry_aio_requests();

if (try_request_lock()) {
bool owned_lock;
{
RWLock::RLocker l(m_image_ctx.owner_lock);
owned_lock = try_request_lock();
}
if (owned_lock) {
retry_aio_requests();

} else {
schedule_retry_aio_requests();
}
Expand Down Expand Up @@ -450,6 +465,9 @@ int ImageWatcher::notify_async_complete(const RemoteAsyncRequest &request,
}

int ImageWatcher::notify_flatten(ProgressContext &prog_ctx) {
assert(m_image_ctx.owner_lock.is_locked());
assert(!is_lock_owner());

bufferlist bl;
uint64_t async_request_id;
ENCODE_START(NOTIFY_VERSION, NOTIFY_VERSION, bl);
Expand All @@ -461,6 +479,9 @@ int ImageWatcher::notify_flatten(ProgressContext &prog_ctx) {
}

int ImageWatcher::notify_resize(uint64_t size, ProgressContext &prog_ctx) {
assert(m_image_ctx.owner_lock.is_locked());
assert(!is_lock_owner());

bufferlist bl;
uint64_t async_request_id;
ENCODE_START(NOTIFY_VERSION, NOTIFY_VERSION, bl);
Expand All @@ -473,6 +494,9 @@ int ImageWatcher::notify_resize(uint64_t size, ProgressContext &prog_ctx) {
}

int ImageWatcher::notify_snap_create(const std::string &snap_name) {
assert(m_image_ctx.owner_lock.is_locked());
assert(!is_lock_owner());

bufferlist bl;
ENCODE_START(NOTIFY_VERSION, NOTIFY_VERSION, bl);
::encode(NOTIFY_OP_SNAP_CREATE, bl);
Expand Down Expand Up @@ -599,10 +623,14 @@ uint64_t ImageWatcher::encode_async_request(bufferlist &bl) {

int ImageWatcher::decode_response_code(bufferlist &bl) {
int r;
bufferlist::iterator iter = bl.begin();
DECODE_START(NOTIFY_VERSION, iter);
::decode(r, iter);
DECODE_FINISH(iter);
try {
bufferlist::iterator iter = bl.begin();
DECODE_START(NOTIFY_VERSION, iter);
::decode(r, iter);
DECODE_FINISH(iter);
} catch (const buffer::error &err) {
r = -EINVAL;
}
return r;
}

Expand All @@ -617,7 +645,9 @@ void ImageWatcher::notify_released_lock() {
void ImageWatcher::notify_request_lock() {
cancel_retry_aio_requests();

m_image_ctx.owner_lock.get_read();
if (try_request_lock()) {
m_image_ctx.owner_lock.put_read();
retry_aio_requests();
return;
}
Expand All @@ -629,6 +659,8 @@ void ImageWatcher::notify_request_lock() {

bufferlist response;
int r = notify_lock_owner(bl, response);
m_image_ctx.owner_lock.put_read();

if (r == -ETIMEDOUT) {
ldout(m_image_ctx.cct, 5) << "timed out requesting lock: retrying" << dendl;
retry_aio_requests();
Expand All @@ -640,9 +672,15 @@ void ImageWatcher::notify_request_lock() {
}

int ImageWatcher::notify_lock_owner(bufferlist &bl, bufferlist& response) {
assert(m_image_ctx.owner_lock.is_locked());

// since we need to ack our own notifications, release the owner lock just in
// case another notification occurs before this one and it requires the lock
bufferlist response_bl;
m_image_ctx.owner_lock.put_read();
int r = m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT,
&response_bl);
m_image_ctx.owner_lock.get_read();
if (r < 0 && r != -ETIMEDOUT) {
lderr(m_image_ctx.cct) << "lock owner notification failed: "
<< cpp_strerror(r) << dendl;
Expand Down Expand Up @@ -683,6 +721,8 @@ int ImageWatcher::notify_lock_owner(bufferlist &bl, bufferlist& response) {
int ImageWatcher::notify_async_request(uint64_t async_request_id,
bufferlist &in,
ProgressContext& prog_ctx) {
assert(m_image_ctx.owner_lock.is_locked());

Mutex my_lock("librbd::ImageWatcher::notify_async_request::my_lock");
Cond cond;
bool done = false;
Expand Down Expand Up @@ -734,13 +774,16 @@ void ImageWatcher::handle_acquired_lock() {

void ImageWatcher::handle_released_lock() {
ldout(m_image_ctx.cct, 20) << "exclusive lock released" << dendl;
FunctionContext *ctx = new FunctionContext(
boost::bind(&ImageWatcher::cancel_async_requests, this, -ERESTART));
m_finisher->queue(ctx);

Mutex::Locker l(m_aio_request_lock);
if (!m_aio_requests.empty()) {
ldout(m_image_ctx.cct, 20) << "queuing lock request" << dendl;
FunctionContext *ctx = new FunctionContext(
FunctionContext *req_ctx = new FunctionContext(
boost::bind(&ImageWatcher::finalize_request_lock, this));
m_finisher->queue(ctx);
m_finisher->queue(req_ctx);
}
}

Expand Down Expand Up @@ -867,7 +910,6 @@ void ImageWatcher::handle_snap_create(bufferlist::iterator iter, bufferlist *out
::decode(snap_name, iter);

ldout(m_image_ctx.cct, 20) << "remote snap_create request: " << snap_name << dendl;

int r = librbd::snap_create(&m_image_ctx, snap_name.c_str(), false);
ENCODE_START(NOTIFY_VERSION, NOTIFY_VERSION, *out);
::encode(r, *out);
Expand Down
1 change: 0 additions & 1 deletion src/librbd/ImageWatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@ namespace librbd {
int notify_async_request(uint64_t async_request_id, bufferlist &in,
ProgressContext& prog_ctx);
void notify_request_leadership();
int notify_leader(bufferlist &bl, bufferlist &response);

void handle_header_update();
void handle_acquired_lock();
Expand Down
Loading

0 comments on commit cd9d8eb

Please sign in to comment.