Skip to content

Commit

Permalink
[#25391] DocDB: Enable TSAN for RWCLock and fix potential deadlocks
Browse files Browse the repository at this point in the history
Summary:
Thread sanitizer does not understand `std::timed_mutex`, so it cannot detect related issue, such as lock order reversal.
Could use `std::mutex` when building for TSAN to turn on detection of related issues.

Also fixed all TSAN issues found after this change.
There are two most common patterns for lock order reversion:
1) Usually we acquire object write lock while holding catalog manager mutex_. But sometimes we acquire catalog manager mutex_ while holding object write lock.
Fixed by changing place where we acquire catalog manager mutex_. Doing it before acquiring object write lock. Or after object write lock is released.

2) Different write lock order for catalog objects.
Fixed by adjusting code to take locks in the object id order.
Jira: DB-14620

Test Plan: Jenkins

Reviewers: zdrudi, hsunder, xCluster, asrivastava, stiwary

Reviewed By: zdrudi, hsunder, asrivastava

Subscribers: stiwary, yql, esheng, ybase

Tags: #jenkins-ready

Differential Revision: https://phorge.dev.yugabyte.com/D40697
  • Loading branch information
spolitov committed Jan 10, 2025
1 parent 8a03731 commit 113fd2b
Show file tree
Hide file tree
Showing 45 changed files with 809 additions and 575 deletions.
6 changes: 3 additions & 3 deletions src/yb/client/client-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -817,11 +817,11 @@ TEST_F(ClientTest, TestKeyRangeUpperBoundFiltering) {
TEST_F(ClientTest, TestListTables) {
auto tables = ASSERT_RESULT(client_->ListTables("", true));
std::sort(tables.begin(), tables.end(), [](const YBTableName& n1, const YBTableName& n2) {
return n1.ToString() < n2.ToString();
return n1.ToString(/* include_id= */ false) < n2.ToString(/* include_id= */ false);
});
ASSERT_EQ(2 + master::kNumSystemTablesWithTxn, tables.size());
ASSERT_EQ(kTableName, tables[0]) << "Tables:" << AsString(tables);
ASSERT_EQ(kTable2Name, tables[1]) << "Tables:" << AsString(tables);
ASSERT_EQ(kTableName, tables[0]) << "Tables: " << AsString(tables);
ASSERT_EQ(kTable2Name, tables[1]) << "Tables: " << AsString(tables);
tables.clear();
tables = ASSERT_RESULT(client_->ListTables("testtb2"));
ASSERT_EQ(1, tables.size());
Expand Down
11 changes: 8 additions & 3 deletions src/yb/client/table.cc
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,14 @@ const boost::optional<ReplicationInfoPB>& YBTable::replication_info() const {
}

std::string YBTable::ToString() const {
return Format(
"$0 $1 IndexInfo: $2 IndexMap $3", (IsIndex() ? "Index Table" : "Normal Table"), id(),
yb::ToString(index_info()), yb::ToString(index_map()));
std::string result = Format("$0 $1", (IsIndex() ? "Index" : "Table"), name());
if (info_->index_info) {
result += " index info: " + AsString(*info_->index_info);
}
if (!info_->index_map.empty()) {
result += " index map: " + AsString(info_->index_map);
}
return result;
}

const dockv::PartitionSchema& YBTable::partition_schema() const {
Expand Down
22 changes: 18 additions & 4 deletions src/yb/client/yb_table_name.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@
#include "yb/master/master_util.h"
#include "yb/util/flags.h"

namespace yb {
namespace client {
namespace yb::client {

DEFINE_RUNTIME_bool(yb_system_namespace_readonly, true, "Set system keyspace read-only.");

Expand Down Expand Up @@ -131,5 +130,20 @@ void YBTableName::set_pgschema_name(const std::string& pgschema_name) {
pgschema_name_ = pgschema_name;
}

} // namespace client
} // namespace yb
std::string YBTableName::ToString(bool include_id) const {
std::string result;
if (has_namespace()) {
result += namespace_name_;
result += ".";
}
result += table_name_;
if (!pgschema_name_.empty()) {
result += Format(" [ysql_schema=$0]", pgschema_name_);
}
if (include_id && !table_id_.empty()) {
result += Format(" [$0]", table_id_);
}
return result;
}

} // namespace yb::client
4 changes: 1 addition & 3 deletions src/yb/client/yb_table_name.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,7 @@ class YBTableName {

bool is_redis_table() const;

std::string ToString() const {
return has_namespace() ? (namespace_name_ + '.' + table_name_) : table_name_;
}
std::string ToString(bool include_id = true) const;

void set_namespace_id(const std::string& namespace_id);
void set_namespace_name(const std::string& namespace_name);
Expand Down
4 changes: 2 additions & 2 deletions src/yb/integration-tests/minicluster-snapshot-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1031,7 +1031,7 @@ TEST_F(PgCloneTest, YB_DISABLE_TEST_IN_SANITIZERS(TabletSplitting)) {
// Wait for the split to complete on master.
// The parent should still be running because we have cleanup is still disabled.
ASSERT_OK(WaitFor([&]() -> Result<bool> {
return VERIFY_RESULT(source_table->GetTablets(IncludeInactive::kTrue)).size() == 5;
return VERIFY_RESULT(source_table->GetTabletsIncludeInactive()).size() == 5;
}, 30s, "Wait for master split."));
auto after_master_split_timestamp = ASSERT_RESULT(GetCurrentTime()).ToInt64();

Expand All @@ -1042,7 +1042,7 @@ TEST_F(PgCloneTest, YB_DISABLE_TEST_IN_SANITIZERS(TabletSplitting)) {
// Enable cleanup of split parents and wait for the split parent to be deleted.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_skip_deleting_split_tablets) = false;
ASSERT_OK(WaitFor([&]() -> Result<bool> {
auto tablets = VERIFY_RESULT(source_table->GetTablets(IncludeInactive::kTrue));
auto tablets = VERIFY_RESULT(source_table->GetTabletsIncludeInactive());
for (auto& tablet : tablets) {
if (tablet->id() == split_tablet_id) {
return tablet->LockForRead()->is_hidden();
Expand Down
2 changes: 1 addition & 1 deletion src/yb/integration-tests/snapshot-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ class SnapshotTest : public YBMiniClusterTestBase<MiniCluster> {
const TxnSnapshotId& snapshot_id, const TableId& table_id) {
auto master_leader = VERIFY_RESULT(cluster_->GetLeaderMiniMaster());
auto table = VERIFY_RESULT(master_leader->catalog_manager_impl().GetTableById(table_id));
auto tablets = VERIFY_RESULT(table->GetTablets(master::IncludeInactive::kTrue));
auto tablets = VERIFY_RESULT(table->GetTabletsIncludeInactive());
auto* coordinator = down_cast<master::MasterSnapshotCoordinator*>(
&master_leader->master()->snapshot_coordinator());
for (const auto& tablet : tablets) {
Expand Down
11 changes: 9 additions & 2 deletions src/yb/integration-tests/xcluster/xcluster_ysql_test_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -404,14 +404,21 @@ Result<YBTableName> XClusterYsqlTestBase::GetYsqlTable(
return STATUS(IllegalState, "Failed listing tables");
}

if (!verify_schema_name) {
// There are could be multiple tables with the same name.
// So reverse tables to find the latest one.
// Since we are dealing with postgres tables, their id reflects postgres table ids, that
// are assigned in increasing order.
std::reverse(resp.mutable_tables()->begin(), resp.mutable_tables()->end());
}

// Now need to find the table and return it.
for (const auto& table : resp.tables()) {
// If !verify_table_name, just return the first table.
if (!verify_table_name ||
(table.name() == table_name && table.namespace_().name() == namespace_name)) {
// In case of a match, further check for match in schema_name.
if (!verify_schema_name || (!table.has_pgschema_name() && schema_name.empty()) ||
(table.has_pgschema_name() && table.pgschema_name() == schema_name)) {
if (!verify_schema_name || (table.pgschema_name() == schema_name)) {
YBTableName yb_table;
yb_table.set_table_id(table.id());
yb_table.set_table_name(table_name);
Expand Down
3 changes: 2 additions & 1 deletion src/yb/master/async_rpc_tasks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,7 @@ AsyncCreateReplica::AsyncCreateReplica(Master *master,
ThreadPool *callback_pool,
const string& permanent_uuid,
const TabletInfoPtr& tablet,
const TabletInfo::ReadLock& tablet_lock,
const std::vector<SnapshotScheduleId>& snapshot_schedules,
LeaderEpoch epoch,
CDCSDKSetRetentionBarriers cdc_sdk_set_retention_barriers)
Expand All @@ -729,7 +730,7 @@ AsyncCreateReplica::AsyncCreateReplica(Master *master,

auto table_lock = tablet->table()->LockForRead();
const auto& table_pb = table_lock->pb;
const auto& tablet_pb = tablet->metadata().dirty().pb;
const auto& tablet_pb = tablet_lock->pb;

req_.set_dest_uuid(permanent_uuid);
req_.set_table_id(tablet->table()->id());
Expand Down
1 change: 1 addition & 0 deletions src/yb/master/async_rpc_tasks.h
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ class AsyncCreateReplica : public RetrySpecificTSRpcTaskWithTable {
ThreadPool *callback_pool,
const std::string& permanent_uuid,
const TabletInfoPtr& tablet,
const TabletInfo::ReadLock& tablet_lock,
const std::vector<SnapshotScheduleId>& snapshot_schedules,
LeaderEpoch epoch,
CDCSDKSetRetentionBarriers cdc_sdk_set_retention_barriers);
Expand Down
2 changes: 1 addition & 1 deletion src/yb/master/backfill_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1116,7 +1116,7 @@ Status BackfillTable::UpdateIndexPermissionsForIndexes() {
}

Status BackfillTable::ClearCheckpointStateInTablets() {
auto tablets = VERIFY_RESULT(indexed_table_->GetTablets());
auto tablets = VERIFY_RESULT(indexed_table_->GetTablets(GetTabletsMode::kOrderByTabletId));
for (const auto& tablet : tablets) {
tablet->mutable_metadata()->StartMutation();
auto& pb = tablet->mutable_metadata()->mutable_dirty()->pb;
Expand Down
133 changes: 97 additions & 36 deletions src/yb/master/catalog_entity_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,28 +35,34 @@
#include <string>

#include "yb/cdc/xcluster_types.h"

#include "yb/common/colocated_util.h"
#include "yb/common/common_consensus_util.h"
#include "yb/common/doc_hybrid_time.h"
#include "yb/common/schema.h"
#include "yb/dockv/partition.h"
#include "yb/common/schema_pbutil.h"
#include "yb/common/wire_protocol.h"

#include "yb/master/xcluster/master_xcluster_util.h"
#include "yb/master/xcluster_rpc_tasks.h"
#include "yb/consensus/opid_util.h"

#include "yb/dockv/partition.h"

#include "yb/gutil/map-util.h"

#include "yb/master/master_client.pb.h"
#include "yb/master/master_defaults.h"
#include "yb/master/master_error.h"
#include "yb/master/master_util.h"
#include "yb/master/ts_descriptor.h"
#include "yb/master/xcluster_rpc_tasks.h"
#include "yb/master/xcluster/master_xcluster_util.h"

#include "yb/gutil/map-util.h"
#include "yb/util/atomic.h"
#include "yb/util/flags/auto_flags.h"
#include "yb/util/format.h"
#include "yb/util/oid_generator.h"
#include "yb/util/status_format.h"
#include "yb/util/string_util.h"
#include "yb/util/flags/auto_flags.h"

using std::string;

Expand All @@ -70,8 +76,22 @@ DEFINE_RUNTIME_AUTO_bool(
"Whether to use the new schema for colocated tables based on the parent_table_id field.");
TAG_FLAG(use_parent_table_id_field, advanced);

namespace yb {
namespace master {
namespace yb::master {

namespace {

Result<TabletInfoPtr> PromoteTabletPointer(const std::weak_ptr<TabletInfo>& tablet) {
if (auto p = tablet.lock()) {
return p;
} else {
return STATUS(
IllegalState,
"Tablet objects backing this table have been freed. This table object is possibly stale, "
"from a previous load of the sys catalog.");
}
}

} // namespace

// ================================================================================================
// TabletReplica
Expand Down Expand Up @@ -183,18 +203,14 @@ class TabletInfo::LeaderChangeReporter {
Result<TSDescriptorPtr> old_leader_;
};

TabletInfo::TabletInfo(const scoped_refptr<TableInfo>& table, TabletId tablet_id)
TabletInfo::TabletInfo(const TableInfoPtr& table, TabletId tablet_id)
: tablet_id_(std::move(tablet_id)),
table_(table),
last_update_time_(MonoTime::Now()),
last_time_with_valid_leader_(MonoTime::Now()),
reported_schema_version_({}) {
// Have to pre-initialize to an empty map, in case of access before the first setter is called.
replica_locations_ = std::make_shared<TabletReplicaMap>();
last_time_with_valid_leader_(last_update_time_) {
}

TabletInfo::~TabletInfo() {
}
TabletInfo::~TabletInfo() = default;

void TabletInfo::SetReplicaLocations(
std::shared_ptr<TabletReplicaMap> replica_locations) {
Expand Down Expand Up @@ -608,8 +624,6 @@ Result<TabletWithSplitPartitions> TableInfo::FindSplittableHashPartitionForStatu

void TableInfo::AddStatusTabletViaSplitPartition(
TabletInfoPtr old_tablet, const dockv::Partition& partition, const TabletInfoPtr& new_tablet) {
std::lock_guard l(lock_);

const auto& new_dirty = new_tablet->metadata().dirty();
if (new_dirty.is_deleted()) {
return;
Expand All @@ -620,6 +634,7 @@ void TableInfo::AddStatusTabletViaSplitPartition(
partition.ToPB(old_partition);
old_lock.Commit();

std::lock_guard l(lock_);
tablets_.emplace(new_tablet->id(), new_tablet);

if (!new_dirty.is_hidden()) {
Expand Down Expand Up @@ -911,20 +926,30 @@ bool TableInfo::HasPartitions(const std::vector<PartitionKey> other) const {
return true;
}

Result<TabletInfos> TableInfo::GetTablets(IncludeInactive include_inactive) const {
Result<TabletInfos> TableInfo::GetTabletsIncludeInactive() const {
TabletInfos result;
SharedLock<decltype(lock_)> l(lock_);
if (include_inactive) {
result.reserve(tablets_.size());
for (const auto& [_, tablet_weak_ptr] : tablets_) {
result.push_back(VERIFY_RESULT(PromoteTabletPointer(tablet_weak_ptr)));
}
} else {
result.reserve(tablets_.size());
for (const auto& [_, tablet_weak_ptr] : tablets_) {
result.push_back(VERIFY_RESULT(PromoteTabletPointer(tablet_weak_ptr)));
}
return result;
}

Result<TabletInfos> TableInfo::GetTablets(GetTabletsMode mode) const {
TabletInfos result;
{
SharedLock<decltype(lock_)> l(lock_);
result.reserve(partitions_.size());
for (const auto& [_, tablet_weak_ptr] : partitions_) {
result.push_back(VERIFY_RESULT(PromoteTabletPointer(tablet_weak_ptr)));
}
}
if (mode == GetTabletsMode::kOrderByTabletId) {
std::sort(result.begin(), result.end(), [](const auto& lhs, const auto& rhs) {
return lhs->id() < rhs->id();
});
}
return result;
}

Expand Down Expand Up @@ -1130,17 +1155,6 @@ std::vector<TransactionId> TableInfo::EraseDdlTxnsWaitingForSchemaVersion(int sc
return txns;
}

Result<TabletInfoPtr> TableInfo::PromoteTabletPointer(const std::weak_ptr<TabletInfo>& tablet) {
if (auto p = tablet.lock()) {
return p;
} else {
return STATUS(
IllegalState,
"Tablet objects backing this table have been freed. This table object is possibly stale, "
"from a previous load of the sys catalog.");
}
}

bool TableInfo::IsUserCreated() const {
return IsUserCreated(LockForRead());
}
Expand Down Expand Up @@ -1405,6 +1419,9 @@ std::string CDCStreamInfo::ToString() const {
return Format(
"$0 [namespace=$1] {metadata=$2} ", id(), l->pb.namespace_id(), l->pb.ShortDebugString());
}
if (l->pb.table_id().empty()) {
return Format("$0 {metadata=$2} ", id(), l->pb.ShortDebugString());
}
return Format("$0 [table=$1] {metadata=$2} ", id(), l->pb.table_id(0), l->pb.ShortDebugString());
}

Expand Down Expand Up @@ -1608,5 +1625,49 @@ bool SnapshotInfo::IsDeleteInProgress() const {
return LockForRead()->is_deleting();
}

} // namespace master
} // namespace yb
TabletInfoPtr MakeTabletInfo(
const TableInfoPtr& table,
const TabletId& tablet_id) {
auto tablet = std::make_shared<TabletInfo>(
table, tablet_id.empty() ? GenerateObjectId() : tablet_id);

VLOG_WITH_FUNC(2)
<< "Table: " << table->ToString() << ", tablet: " << tablet->ToString();

tablet->mutable_metadata()->StartMutation();

return tablet;
}

void SetupTabletInfo(
TabletInfo& tablet,
const TableInfo& table,
const PartitionPB& partition,
SysTabletsEntryPB::State state) {
auto& metadata = tablet.mutable_metadata()->mutable_dirty()->pb;
metadata.set_state(state);
metadata.mutable_partition()->CopyFrom(partition);
metadata.set_table_id(table.id());
if (FLAGS_use_parent_table_id_field && !table.is_system()) {
tablet.SetTableIds({table.id()});
metadata.set_hosted_tables_mapped_by_parent_id(true);
} else {
// This is important: we are setting the first table id in the table_ids list
// to be the id of the original table that creates the tablet.
metadata.add_table_ids(table.id());
}

auto& cstate = *metadata.mutable_committed_consensus_state();
cstate.set_current_term(consensus::kMinimumTerm);
cstate.mutable_config()->set_opid_index(consensus::kInvalidOpIdIndex);
}

TabletInfoPtr CreateTabletInfo(
const TableInfoPtr& table, const PartitionPB& partition, SysTabletsEntryPB::State state,
const TabletId& tablet_id) {
auto tablet = MakeTabletInfo(table, tablet_id);
SetupTabletInfo(*tablet, *table, partition, state);
return tablet;
}

} // namespace yb::master
Loading

0 comments on commit 113fd2b

Please sign in to comment.