Skip to content

Commit

Permalink
[yugabyte#9781] Mark snapshot as deleted if tablet was removed
Browse files Browse the repository at this point in the history
Summary:
During restoration we could delete tablets that was created after restoration time.
When we are trying to cleanup snapshots containing those tablets delete snapshot could fails.
It happens because master cannot find such tablet.

Handle this scenario and mark snapshot as deleted on such tablet.

Test Plan: YbAdminSnapshotScheduleTest.DeleteIndexOnRestore

Reviewers: bogdan, skedia

Reviewed By: skedia

Subscribers: ybase

Differential Revision: https://phabricator.dev.yugabyte.com/D12784
  • Loading branch information
spolitov committed Aug 30, 2021
1 parent f57d285 commit a43cde5
Show file tree
Hide file tree
Showing 10 changed files with 132 additions and 47 deletions.
20 changes: 14 additions & 6 deletions ent/src/yb/tools/yb-admin_client_ent.cc
Original file line number Diff line number Diff line change
Expand Up @@ -113,16 +113,13 @@ Status ClusterAdminClient::ListSnapshots(const ListSnapshotsFlags& flags) {
AddStringField("current_snapshot_id",
SnapshotIdToString(resp.current_snapshot_id()),
&document, &document.GetAllocator());

} else {
cout << "Current snapshot id: " << SnapshotIdToString(resp.current_snapshot_id()) << endl;
}
}

rapidjson::Value json_snapshots(rapidjson::kArrayType);
if (json) {
document.AddMember("snapshots", json_snapshots, document.GetAllocator());
} else {
if (!json) {
if (resp.snapshots_size()) {
cout << RightPadToUuidWidth("Snapshot UUID") << kColumnSep << "State" << endl;
} else {
Expand All @@ -135,9 +132,17 @@ Status ClusterAdminClient::ListSnapshots(const ListSnapshotsFlags& flags) {
if (json) {
AddStringField(
"id", SnapshotIdToString(snapshot.id()), &json_snapshot, &document.GetAllocator());
const auto& entry = snapshot.entry();
AddStringField(
"state", SysSnapshotEntryPB::State_Name(snapshot.entry().state()), &json_snapshot,
"state", SysSnapshotEntryPB::State_Name(entry.state()), &json_snapshot,
&document.GetAllocator());
AddStringField(
"snapshot_time", HybridTimeToString(HybridTime::FromPB(entry.snapshot_hybrid_time())),
&json_snapshot, &document.GetAllocator());
AddStringField(
"previous_snapshot_time",
HybridTimeToString(HybridTime::FromPB(entry.previous_snapshot_hybrid_time())),
&json_snapshot, &document.GetAllocator());
} else {
cout << SnapshotIdToString(snapshot.id()) << kColumnSep << snapshot.entry().state() << endl;
}
Expand Down Expand Up @@ -186,6 +191,7 @@ Status ClusterAdminClient::ListSnapshots(const ListSnapshotsFlags& flags) {
}));

if (json) {
document.AddMember("snapshots", json_snapshots, document.GetAllocator());
std::cout << common::PrettyWriteRapidJsonToString(document) << std::endl;
return Status::OK();
}
Expand Down Expand Up @@ -436,7 +442,9 @@ Result<rapidjson::Document> ClusterAdminClient::DeleteSnapshotSchedule(
}

bool SnapshotSuitableForRestoreAt(const SysSnapshotEntryPB& entry, HybridTime restore_at) {
return HybridTime::FromPB(entry.snapshot_hybrid_time()) >= restore_at &&
return (entry.state() == master::SysSnapshotEntryPB::COMPLETE ||
entry.state() == master::SysSnapshotEntryPB::CREATING) &&
HybridTime::FromPB(entry.snapshot_hybrid_time()) >= restore_at &&
HybridTime::FromPB(entry.previous_snapshot_hybrid_time()) < restore_at;
}

Expand Down
3 changes: 2 additions & 1 deletion src/yb/master/catalog_entity_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ void TabletInfo::SetReplicaLocations(

CHECKED_STATUS TabletInfo::CheckRunning() const {
if (!table()->is_running()) {
return STATUS_FORMAT(Expired, "Table is not running: $0", table()->ToStringWithState());
return STATUS_EC_FORMAT(Expired, MasterError(MasterErrorPB::TABLE_NOT_RUNNING),
"Table is not running: $0", table()->ToStringWithState());
}

return Status::OK();
Expand Down
2 changes: 2 additions & 0 deletions src/yb/master/master.proto
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ message MasterErrorPB {

// Error in case a tablet-level operation was attempted on a tablet which is not running.
TABLET_NOT_RUNNING = 38;

TABLE_NOT_RUNNING = 39;
}

// The error code.
Expand Down
33 changes: 18 additions & 15 deletions src/yb/master/master_snapshot_coordinator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ class MasterSnapshotCoordinator::Impl {
const TxnSnapshotId& snapshot_id, bool list_deleted, ListSnapshotsResponsePB* resp) {
std::lock_guard<std::mutex> lock(mutex_);
if (snapshot_id.IsNil()) {
for (const auto& p : snapshots_) {
for (const auto& p : snapshots_.get<ScheduleTag>()) {
if (!list_deleted) {
auto aggreaged_state = p->AggregatedState();
if (aggreaged_state.ok() && *aggreaged_state == SysSnapshotEntryPB::DELETED) {
Expand Down Expand Up @@ -677,7 +677,8 @@ class MasterSnapshotCoordinator::Impl {
&mutex_, snapshots_, operation.snapshot_id, operation.tablet_id,
std::bind(&Impl::UpdateSnapshot, this, _1, leader_term, _2));
if (!tablet_info) {
callback(STATUS_FORMAT(NotFound, "Tablet info not found for $0", operation.tablet_id));
callback(STATUS_EC_FORMAT(NotFound, MasterError(MasterErrorPB::TABLET_NOT_RUNNING),
"Tablet info not found for $0", operation.tablet_id));
return;
}
auto snapshot_id_str = operation.snapshot_id.AsSlice().ToBuffer();
Expand Down Expand Up @@ -754,22 +755,24 @@ class MasterSnapshotCoordinator::Impl {
TryDeleteSnapshot(snapshot.get(), data);
}
} else {
auto* first_snapshot = BoundingSnapshot(p->id(), Bound::kFirst);
auto* last_snapshot = BoundingSnapshot(p->id(), Bound::kLast);
if (first_snapshot) {
if (first_snapshot != last_snapshot) {
auto gc_limit = now.AddSeconds(-p->options().retention_duration_sec());
if (first_snapshot->snapshot_hybrid_time() < gc_limit) {
TryDeleteSnapshot(first_snapshot, data);
auto& index = snapshots_.get<ScheduleTag>();
auto range = index.equal_range(p->id());
if (range.first != range.second) {
--range.second;
auto& first_snapshot = **range.first;
data->schedule_min_restore_time[p->id()] =
first_snapshot.previous_snapshot_hybrid_time()
? first_snapshot.previous_snapshot_hybrid_time()
: first_snapshot.snapshot_hybrid_time();
auto gc_limit = now.AddSeconds(-p->options().retention_duration_sec());
for (; range.first != range.second; ++range.first) {
if ((**range.first).snapshot_hybrid_time() >= gc_limit) {
break;
}
TryDeleteSnapshot(range.first->get(), data);
}
data->schedule_min_restore_time[p->id()] =
first_snapshot->previous_snapshot_hybrid_time()
? first_snapshot->previous_snapshot_hybrid_time()
: first_snapshot->snapshot_hybrid_time();
last_snapshot_time = (**range.second).snapshot_hybrid_time();
}
last_snapshot_time = last_snapshot ? last_snapshot->snapshot_hybrid_time()
: HybridTime::kInvalid;
}
p->PrepareOperations(last_snapshot_time, now, &data->schedule_operations);
}
Expand Down
16 changes: 15 additions & 1 deletion src/yb/master/restoration_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,24 @@
namespace yb {
namespace master {

namespace {

std::string MakeRestorationStateLogPrefix(
const TxnSnapshotRestorationId& restoration_id, SnapshotState* snapshot) {
if (snapshot->schedule_id()) {
return Format("Restoration[$0/$1/$2]: ",
restoration_id, snapshot->id(), snapshot->schedule_id());
}
return Format("Restoration[$0/$1]: ", restoration_id, snapshot->id());
}

} // namespace

RestorationState::RestorationState(
SnapshotCoordinatorContext* context, const TxnSnapshotRestorationId& restoration_id,
SnapshotState* snapshot)
: StateWithTablets(context, SysSnapshotEntryPB::RESTORING),
: StateWithTablets(context, SysSnapshotEntryPB::RESTORING,
MakeRestorationStateLogPrefix(restoration_id, snapshot)),
restoration_id_(restoration_id), snapshot_id_(snapshot->id()) {
InitTabletIds(snapshot->TabletIdsInState(SysSnapshotEntryPB::COMPLETE));
}
Expand Down
30 changes: 28 additions & 2 deletions src/yb/master/snapshot_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,24 @@ Result<docdb::KeyBytes> EncodedSnapshotKey(
return EncodedKey(SysRowEntry::SNAPSHOT, id.AsSlice(), context);
}

namespace {

std::string MakeSnapshotStateLogPrefix(
const TxnSnapshotId& id, const std::string& schedule_id_str) {
auto schedule_id = TryFullyDecodeSnapshotScheduleId(schedule_id_str);
if (schedule_id) {
return Format("Snapshot[$0/$1]: ", id, schedule_id);
}
return Format("Snapshot[$0]: ", id);
}

} // namespace

SnapshotState::SnapshotState(
SnapshotCoordinatorContext* context, const TxnSnapshotId& id,
const tserver::TabletSnapshotOpRequestPB& request)
: StateWithTablets(context, SysSnapshotEntryPB::CREATING),
: StateWithTablets(context, SysSnapshotEntryPB::CREATING,
MakeSnapshotStateLogPrefix(id, request.schedule_id())),
id_(id), snapshot_hybrid_time_(request.snapshot_hybrid_time()),
previous_snapshot_hybrid_time_(HybridTime::FromPB(request.previous_snapshot_hybrid_time())),
schedule_id_(TryFullyDecodeSnapshotScheduleId(request.schedule_id())), version_(1) {
Expand All @@ -55,7 +69,8 @@ SnapshotState::SnapshotState(
SnapshotState::SnapshotState(
SnapshotCoordinatorContext* context, const TxnSnapshotId& id,
const SysSnapshotEntryPB& entry)
: StateWithTablets(context, entry.state()),
: StateWithTablets(context, entry.state(),
MakeSnapshotStateLogPrefix(id, entry.schedule_id())),
id_(id), snapshot_hybrid_time_(entry.snapshot_hybrid_time()),
previous_snapshot_hybrid_time_(HybridTime::FromPB(entry.previous_snapshot_hybrid_time())),
schedule_id_(TryFullyDecodeSnapshotScheduleId(entry.schedule_id())),
Expand Down Expand Up @@ -186,5 +201,16 @@ Result<tablet::CreateSnapshotData> SnapshotState::SysCatalogSnapshotData(
};
}

Status SnapshotState::CheckDoneStatus(const Status& status) {
if (initial_state() != SysSnapshotEntryPB::DELETING) {
return status;
}
MasterError error(status);
if (error == MasterErrorPB::TABLET_NOT_RUNNING || error == MasterErrorPB::TABLE_NOT_RUNNING) {
return Status::OK();
}
return status;
}

} // namespace master
} // namespace yb
3 changes: 2 additions & 1 deletion src/yb/master/snapshot_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ class SnapshotState : public StateWithTablets {

private:
bool IsTerminalFailure(const Status& status) override;
CHECKED_STATUS CheckDoneStatus(const Status& status) override;

TxnSnapshotId id_;
HybridTime snapshot_hybrid_time_;
Expand All @@ -98,7 +99,7 @@ class SnapshotState : public StateWithTablets {
// When snapshot is taken as a part of snapshot schedule schedule_id_ will contain this
// schedule id. Otherwise it will be nil.
SnapshotScheduleId schedule_id_;
int version_;
int64_t version_;
bool delete_started_ = false;
};

Expand Down
43 changes: 27 additions & 16 deletions src/yb/master/state_with_tablets.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,9 @@ SysSnapshotEntryPB::State InitialStateToTerminalState(SysSnapshotEntryPB::State
} // namespace

StateWithTablets::StateWithTablets(
SnapshotCoordinatorContext* context, SysSnapshotEntryPB::State initial_state)
: context_(*context), initial_state_(initial_state) {
SnapshotCoordinatorContext* context, SysSnapshotEntryPB::State initial_state,
std::string log_prefix)
: context_(*context), initial_state_(initial_state), log_prefix_(std::move(log_prefix)) {
}

void StateWithTablets::InitTablets(
Expand Down Expand Up @@ -102,7 +103,8 @@ bool StateWithTablets::PassedSinceCompletion(const MonoDelta& duration) const {
}

if (complete_at_ == CoarseTimePoint()) {
YB_LOG_EVERY_N_SECS(DFATAL, 30) << "All tablets done but complete done was not set";
YB_LOG_EVERY_N_SECS(DFATAL, 30)
<< LogPrefix() << "All tablets done but complete done was not set";
return false;
}

Expand All @@ -120,31 +122,34 @@ std::vector<TabletId> StateWithTablets::TabletIdsInState(SysSnapshotEntryPB::Sta
return result;
}

void StateWithTablets::Done(const TabletId& tablet_id, const Status& status) {
VLOG(4) << __func__ << "(" << tablet_id << ", " << status << ")";
void StateWithTablets::Done(const TabletId& tablet_id, Status status) {
VLOG_WITH_PREFIX_AND_FUNC(4) << tablet_id << ", " << status;

auto it = tablets_.find(tablet_id);
if (it == tablets_.end()) {
LOG(DFATAL) << "Finished " << InitialStateName() << " snapshot at unknown tablet "
<< tablet_id << ": " << status;
LOG_WITH_PREFIX(DFATAL)
<< "Finished " << InitialStateName() << " at unknown tablet "
<< tablet_id << ": " << status;
return;
}
if (!it->running) {
LOG(DFATAL) << "Finished " << InitialStateName() << " snapshot at " << tablet_id
<< " that is not running and in state "
<< SysSnapshotEntryPB::State_Name(it->state) << ": " << status;
LOG_WITH_PREFIX(DFATAL)
<< "Finished " << InitialStateName() << " at " << tablet_id
<< " that is not running and in state " << SysSnapshotEntryPB::State_Name(it->state)
<< ": " << status;
return;
}
tablets_.modify(it, [](TabletData& data) { data.running = false; });
const auto& state = it->state;
if (state == initial_state_) {
status = CheckDoneStatus(status);
if (status.ok()) {
tablets_.modify(
it, [terminal_state = InitialStateToTerminalState(initial_state_)](TabletData& data) {
data.state = terminal_state;
});
LOG(INFO) << "Finished " << InitialStateName() << " snapshot at " << tablet_id << ", "
<< num_tablets_in_initial_state_ << " was running";
LOG_WITH_PREFIX(INFO) << "Finished " << InitialStateName() << " at " << tablet_id << ", "
<< num_tablets_in_initial_state_ << " was running";
} else {
auto full_status = status.CloneAndPrepend(
Format("Failed to $0 snapshot at $1", InitialStateName(), tablet_id));
Expand All @@ -155,17 +160,19 @@ void StateWithTablets::Done(const TabletId& tablet_id, const Status& status) {
}
data.last_error = full_status;
});
LOG(WARNING) << full_status << ", terminal: " << terminal << ", "
<< num_tablets_in_initial_state_ << " was running";
LOG_WITH_PREFIX(WARNING)
<< full_status << ", terminal: " << terminal << ", " << num_tablets_in_initial_state_
<< " was running";
if (!terminal) {
return;
}
}
--num_tablets_in_initial_state_;
CheckCompleteness();
} else {
LOG(DFATAL) << "Finished " << InitialStateName() << " snapshot at tablet " << tablet_id
<< " in a wrong state " << state << ": " << status;
LOG_WITH_PREFIX(DFATAL)
<< "Finished " << InitialStateName() << " at tablet " << tablet_id << " in a wrong state "
<< state << ": " << status;
}
}

Expand Down Expand Up @@ -215,5 +222,9 @@ void StateWithTablets::RemoveTablets(const std::vector<std::string>& tablet_ids)
}
}

const std::string& StateWithTablets::LogPrefix() const {
return log_prefix_;
}

} // namespace master
} // namespace yb
16 changes: 12 additions & 4 deletions src/yb/master/state_with_tablets.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ namespace master {
class StateWithTablets {
public:
StateWithTablets(
SnapshotCoordinatorContext* context, SysSnapshotEntryPB::State initial_state);
SnapshotCoordinatorContext* context, SysSnapshotEntryPB::State initial_state,
std::string log_prefix);

virtual ~StateWithTablets() = default;

Expand All @@ -60,7 +61,7 @@ class StateWithTablets {
bool AllTabletsDone() const;
bool PassedSinceCompletion(const MonoDelta& duration) const;
std::vector<TabletId> TabletIdsInState(SysSnapshotEntryPB::State state);
void Done(const TabletId& tablet_id, const Status& status);
void Done(const TabletId& tablet_id, Status status);
bool AllInState(SysSnapshotEntryPB::State state);
bool HasInState(SysSnapshotEntryPB::State state);
void SetInitialTabletsState(SysSnapshotEntryPB::State state);
Expand Down Expand Up @@ -124,15 +125,21 @@ class StateWithTablets {

void RemoveTablets(const std::vector<std::string>& tablet_ids);

virtual bool IsTerminalFailure(const Status& status) = 0;

auto tablet_ids() const {
auto lambda = [](const TabletData& data) { return data.id; };
return boost::make_iterator_range(
boost::make_transform_iterator(tablets_.begin(), lambda),
boost::make_transform_iterator(tablets_.end(), lambda));
}

const std::string& LogPrefix() const;

virtual bool IsTerminalFailure(const Status& status) = 0;

virtual CHECKED_STATUS CheckDoneStatus(const Status& status) {
return status;
}

protected:
struct TabletData {
TabletId id;
Expand Down Expand Up @@ -175,6 +182,7 @@ class StateWithTablets {

SnapshotCoordinatorContext& context_;
SysSnapshotEntryPB::State initial_state_;
const std::string log_prefix_;

Tablets tablets_;

Expand Down
Loading

0 comments on commit a43cde5

Please sign in to comment.