Merge pull request ceph#44913 from benhanokh/safe_shutdown_v2
OSD::Modify OSD Fast-Shutdown to work safely i.e. quiesce all activit…

Reviewed-by: Josh Durgin <[email protected]>
Reviewed-by: Adam Kupczyk <[email protected]>
yuriw authored Mar 9, 2022
2 parents 36b0dc0 + 8d05255 commit c6da0c5
Showing 7 changed files with 151 additions and 36 deletions.
13 changes: 13 additions & 0 deletions src/common/options/global.yaml.in
@@ -3266,6 +3266,13 @@ options:
slow shutdown is primarily useful for doing memory leak checking with valgrind.
default: true
with_legacy: true
- name: osd_fast_shutdown_timeout
type: int
level: advanced
desc: timeout in seconds for osd fast-shutdown (0 is unlimited)
default: 15
with_legacy: true
min: 0
- name: osd_fast_shutdown_notify_mon
type: bool
level: advanced
@@ -4937,6 +4944,12 @@ options:
This setting is used only when OSD is doing ``--mkfs``.
Next runs of OSD retrieve sharding from disk.
default: m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P
- name: bluestore_qfsck_on_mount
type: bool
level: dev
desc: Run quick-fsck at mount comparing allocation-file to RocksDB allocation state
default: true
with_legacy: true
- name: bluestore_fsck_on_mount
type: bool
level: dev
3 changes: 2 additions & 1 deletion src/os/ObjectStore.h
@@ -288,7 +288,8 @@ class ObjectStore {
virtual bool needs_journal() = 0; //< requires a journal
virtual bool wants_journal() = 0; //< prefers a journal
virtual bool allows_journal() = 0; //< allows a journal

virtual void prepare_for_fast_shutdown() {}
virtual bool has_null_manager() { return false; }
// return store min allocation size, if applicable
virtual uint64_t get_min_alloc_size() const {
return 0;
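The two hooks added above form a small contract between OSD shutdown and the object store: prepare_for_fast_shutdown() only records intent, the store consults that flag later (for example in umount()) to skip non-essential teardown, and has_null_manager() reports whether the store is running with the NULL freelist manager. A minimal, self-contained sketch of that contract follows; the class and method bodies are illustrative stand-ins, not Ceph code.

```cpp
#include <iostream>

// Simplified stand-in for the ObjectStore interface, reduced to the two hooks
// added by this commit; the real class has many more pure-virtual methods.
struct StoreBase {
  virtual ~StoreBase() = default;
  virtual void prepare_for_fast_shutdown() {}        // default: no-op
  virtual bool has_null_manager() { return false; }  // default: no NULL freelist manager
};

// Hypothetical backend mirroring what BlueStore does later in this diff:
// remember the request, then take shortcuts during umount().
struct DemoStore : StoreBase {
  bool m_fast_shutdown = false;

  void prepare_for_fast_shutdown() override { m_fast_shutdown = true; }
  bool has_null_manager() override { return true; }

  int umount() {
    if (m_fast_shutdown) {
      std::cout << "fast path: skip cache shutdown and fsck-on-umount\n";
    } else {
      std::cout << "normal path: full cache shutdown plus optional fsck\n";
    }
    return 0;
  }
};

int main() {
  DemoStore store;
  store.prepare_for_fast_shutdown();
  return store.umount();  // prints the fast-path message
}
```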
3 changes: 3 additions & 0 deletions src/os/bluestore/BlueFS.cc
@@ -2507,6 +2507,9 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
}
#endif
_flush_bdev();
++log.seq_live;
dirty.seq_live = log.seq_live;
log.t.seq = log.seq_live;

super.memorized_layout = layout;
super.log_fnode = log_file->fnode;
58 changes: 32 additions & 26 deletions src/os/bluestore/BlueStore.cc
@@ -7565,9 +7565,16 @@ void BlueStore::set_cache_shards(unsigned num)
}
}

//---------------------------------------------
bool BlueStore::has_null_manager()
{
return (fm && fm->is_null_manager());
}

int BlueStore::_mount()
{
dout(5) << __func__ << "NCB:: path " << path << dendl;

_kv_only = false;
if (cct->_conf->bluestore_fsck_on_mount) {
dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
@@ -7681,12 +7688,15 @@ int BlueStore::umount()
#endif
dout(20) << __func__ << " stopping kv thread" << dendl;
_kv_stop();
_shutdown_cache();
// skip cache cleanup step on fast shutdown
if (likely(!m_fast_shutdown)) {
_shutdown_cache();
}
dout(20) << __func__ << " closing" << dendl;
}

_close_db_and_around();
if (cct->_conf->bluestore_fsck_on_umount) {
// disable fsck on fast-shutdown
if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) {
int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
if (rc < 0)
return rc;
@@ -10305,6 +10315,11 @@ int BlueStore::get_numa_node(
return 0;
}

void BlueStore::prepare_for_fast_shutdown()
{
m_fast_shutdown = true;
}

int BlueStore::get_devices(set<string> *ls)
{
if (bdev) {
@@ -10432,7 +10447,8 @@ int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
string key_prefix;
_key_encode_u64(pool_id, &key_prefix);
*out_per_pool_omap = per_pool_omap != OMAP_BULK;
if (*out_per_pool_omap) {
// stop calls after db was closed
if (*out_per_pool_omap && db) {
auto prefix = per_pool_omap == OMAP_PER_POOL ?
PREFIX_PERPOOL_OMAP :
PREFIX_PERPG_OMAP;
@@ -18344,11 +18360,10 @@ int BlueStore::store_allocator(Allocator* src_allocator)
return -1;
}
}

bluefs->compact_log();
// reuse previous file-allocation if exists
ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
bool overwrite_file = (ret == 0);
//derr << __func__ << "bluefs->open_for_write(" << overwrite_file << ")" << dendl;
BlueFS::FileWriter *p_handle = nullptr;
ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file);
if (ret != 0) {
@@ -18358,8 +18373,9 @@ int BlueStore::store_allocator(Allocator* src_allocator)

uint64_t file_size = p_handle->file->fnode.size;
uint64_t allocated = p_handle->file->fnode.get_allocated();
dout(5) << "file_size=" << file_size << ", allocated=" << allocated << dendl;
dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl;

bluefs->sync_metadata(false);
unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(src_allocator));
if (!allocator) {
bluefs->close_writer(p_handle);
@@ -18431,12 +18447,11 @@ int BlueStore::store_allocator(Allocator* src_allocator)
bluefs->fsync(p_handle);

utime_t duration = ceph_clock_now() - start_time;
dout(5) <<"WRITE-extent_count=" << extent_count << ", file_size=" << p_handle->file->fnode.size << dendl;
dout(5) <<"WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl;
dout(5) <<"p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl;

bluefs->close_writer(p_handle);
need_to_destage_allocation_file = false;
dout(10) << "need_to_destage_allocation_file was clear" << dendl;
return 0;
}

@@ -18628,7 +18643,7 @@ int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t
utime_t duration = ceph_clock_now() - start_time;
dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size= "
<< read_alloc_size << ", file_size=" << file_size << dendl;
dout(5) << "READ duration=" << duration << " seconds, s_serial=" << s_serial << dendl;
dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl;
*num = extent_count;
*bytes = read_alloc_size;
return 0;
@@ -18923,7 +18938,7 @@ int BlueStore::read_allocation_from_drive_on_startup()

utime_t start = ceph_clock_now();
read_alloc_stats_t stats = {};
SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size));
SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
ret = reconstruct_allocations(&sbmap, stats);
if (ret != 0) {
return ret;
@@ -19025,15 +19040,6 @@ int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t
return 0;
} else {
derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
std::cout << "===================================================================" << std::endl;
for (uint64_t i = 0; i < idx1; i++) {
std::cout << "arr1[" << i << "]<" << arr1[i].offset << "," << arr1[i].length << "> " << std::endl;
}

std::cout << "===================================================================" << std::endl;
for (uint64_t i = 0; i < idx2; i++) {
std::cout << "arr2[" << i << "]<" << arr2[i].offset << "," << arr2[i].length << "> " << std::endl;
}
return -1;
}
}
@@ -19081,9 +19087,9 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool()
utime_t start = ceph_clock_now();

auto shutdown_cache = make_scope_guard([&] {
std::cout << "Allocation Recovery was completed in " << duration
<< " seconds; insert_count=" << stats.insert_count
<< "; extent_count=" << stats.extent_count << std::endl;
dout(1) << "Allocation Recovery was completed in " << duration
<< " seconds; insert_count=" << stats.insert_count
<< "; extent_count=" << stats.extent_count << dendl;
_shutdown_cache();
_close_db_and_around();
});
@@ -19092,7 +19098,7 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool()
auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
//reconstruct allocations into a temp simple-bitmap and copy into allocator
{
SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size));
SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
ret = reconstruct_allocations(&sbmap, stats);
if (ret != 0) {
return ret;
@@ -19113,14 +19119,14 @@
};
allocator->dump(count_entries);
ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
if (ret != 0) {
if (ret == 0) {
dout(5) << "Allocator drive - file integrity check OK" << dendl;
} else {
derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
}
}

std::cout << stats << std::endl;
dout(1) << stats << dendl;
return ret;
}

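The BlueStore hunks above fit together as follows: store_allocator() persists the allocator state to an allocation file on a clean (fast) shutdown, __restore_allocator() reads it back on the next start, and compare_allocators() is what the new bluestore_qfsck_on_mount option drives to cross-check that file against the RocksDB-derived state. A rough, self-contained sketch of the comparison idea only; the types and function names below are stand-ins, not the BlueStore API.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Free extent as (offset, length), the unit both allocators are dumped into.
struct Extent { uint64_t offset; uint64_t length; };

// Walk two sorted free-extent lists (one rebuilt from the allocation file,
// one from RocksDB metadata) and report the first index where they disagree,
// in the spirit of compare_allocators() above.
int compare_extent_lists(const std::vector<Extent>& a, const std::vector<Extent>& b)
{
  size_t n = std::min(a.size(), b.size());
  for (size_t i = 0; i < n; ++i) {
    if (a[i].offset != b[i].offset || a[i].length != b[i].length) {
      std::cerr << "mismatch at idx=" << i << "\n";
      return -1;
    }
  }
  if (a.size() != b.size()) {
    std::cerr << "mismatch: " << a.size() << " extents vs " << b.size() << "\n";
    return -1;
  }
  return 0;  // identical -> the quick-fsck style check passes
}

int main()
{
  std::vector<Extent> from_file = {{0, 4096}, {8192, 4096}};
  std::vector<Extent> from_db   = {{0, 4096}, {8192, 4096}};
  std::cout << compare_extent_lists(from_file, from_db) << "\n";  // prints 0
}
```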
5 changes: 4 additions & 1 deletion src/os/bluestore/BlueStore.h
@@ -2764,7 +2764,7 @@ class BlueStore : public ObjectStore,

private:
int32_t ondisk_format = 0; ///< value detected on mount

bool m_fast_shutdown = false;
int _upgrade_super(); ///< upgrade (called during open_super)
uint64_t _get_ondisk_reserved() const;
void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
Expand All @@ -2783,6 +2783,9 @@ class BlueStore : public ObjectStore,
bool wants_journal() override { return false; };
bool allows_journal() override { return false; };

void prepare_for_fast_shutdown() override;
virtual bool has_null_manager();

uint64_t get_min_alloc_size() const override {
return min_alloc_size;
}
101 changes: 94 additions & 7 deletions src/osd/OSD.cc
@@ -4258,27 +4258,44 @@ PerfCounters* OSD::create_recoverystate_perf()

int OSD::shutdown()
{
// vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
//cct->_conf->osd_fast_shutdown = true;

dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
<< cct->_conf->osd_fast_shutdown
<< ", null-fm = " << store->has_null_manager() << dendl;

utime_t start_time_func = ceph_clock_now();

if (cct->_conf->osd_fast_shutdown) {
derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
if (cct->_conf->osd_fast_shutdown_notify_mon)
service.prepare_to_stop();
cct->_log->flush();
_exit(0);
}

if (!service.prepare_to_stop())
// There is no state we need to keep when running in NULL-FM mode
if (!store->has_null_manager()) {
cct->_log->flush();
_exit(0);
}
} else if (!service.prepare_to_stop()) {
return 0; // already shutting down
}

osd_lock.lock();
if (is_stopping()) {
osd_lock.unlock();
return 0;
}
dout(0) << "shutdown" << dendl;

if (!cct->_conf->osd_fast_shutdown) {
dout(0) << "shutdown" << dendl;
}

// don't accept new task for this OSD
set_state(STATE_STOPPING);

// Debugging
if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
// Disabled debugging during fast-shutdown
if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
cct->_conf.set_val("debug_osd", "100");
cct->_conf.set_val("debug_journal", "100");
cct->_conf.set_val("debug_filestore", "100");
@@ -4287,6 +4304,45 @@ int OSD::shutdown()
cct->_conf.apply_changes(nullptr);
}

if (cct->_conf->osd_fast_shutdown) {
// first, stop new task from being taken from op_shardedwq
// and clear all pending tasks
op_shardedwq.stop_for_fast_shutdown();

utime_t start_time_timer = ceph_clock_now();
tick_timer.shutdown();
{
std::lock_guard l(tick_timer_lock);
tick_timer_without_osd_lock.shutdown();
}

osd_lock.unlock();
utime_t start_time_osd_drain = ceph_clock_now();

// then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
osd_op_tp.drain();
osd_op_tp.stop();

utime_t start_time_umount = ceph_clock_now();
store->prepare_for_fast_shutdown();
std::lock_guard lock(osd_lock);
// TBD: assert in allocator that nothing is being added
store->umount();

utime_t end_time = ceph_clock_now();
if (cct->_conf->osd_fast_shutdown_timeout) {
ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
}
dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl;
dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl;
dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl;
dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl;
cct->_log->flush();

// now it is safe to exit
_exit(0);
}

// stop MgrClient earlier as it's more like an internal consumer of OSD
mgrc.shutdown();

@@ -4448,6 +4504,9 @@ int OSD::shutdown()
hb_front_server_messenger->shutdown();
hb_back_server_messenger->shutdown();

utime_t duration = ceph_clock_now() - start_time_func;
dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;

tracing::osd::tracer.shutdown();

return r;
@@ -11072,6 +11131,11 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
}

void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
if (unlikely(m_fast_shutdown) ) {
// stop enqueuing when we are in the middle of a fast shutdown
return;
}

uint32_t shard_index =
item.get_ordering_token().hash_to_shard(osd->shards.size());

Expand Down Expand Up @@ -11102,6 +11166,11 @@ void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {

void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
if (unlikely(m_fast_shutdown) ) {
// stop enqueuing when we are in the middle of a fast shutdown
return;
}

auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
auto& sdata = osd->shards[shard_index];
ceph_assert(sdata);
@@ -11128,6 +11197,24 @@ void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
sdata->sdata_cond.notify_one();
}

void OSD::ShardedOpWQ::stop_for_fast_shutdown()
{
uint32_t shard_index = 0;
m_fast_shutdown = true;

for (; shard_index < osd->num_shards; shard_index++) {
auto& sdata = osd->shards[shard_index];
ceph_assert(sdata);
sdata->shard_lock.lock();
int work_count = 0;
while(! sdata->scheduler->empty() ) {
auto work_item = sdata->scheduler->dequeue();
work_count++;
}
sdata->shard_lock.unlock();
}
}

namespace ceph::osd_cmds {

int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
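Two behaviours in the OSD.cc hunks above are worth spelling out: the fast path first stops op_shardedwq and drains osd_op_tp before calling store->umount(), and the new osd_fast_shutdown_timeout option bounds the whole sequence, with 0 meaning no limit. A small sketch of just the timeout check, under the assumption that durations are compared in whole seconds (the real code uses utime_t and ceph_assert):

```cpp
#include <cassert>
#include <chrono>
#include <iostream>

// Sketch of the osd_fast_shutdown_timeout semantics: a non-zero value bounds
// the fast-shutdown duration, zero disables the check ("0 is unlimited").
void check_fast_shutdown_duration(std::chrono::seconds elapsed,
                                  int osd_fast_shutdown_timeout_sec)
{
  if (osd_fast_shutdown_timeout_sec) {
    assert(elapsed < std::chrono::seconds(osd_fast_shutdown_timeout_sec));
  }
}

int main()
{
  check_fast_shutdown_duration(std::chrono::seconds(3), 15);  // within the 15 s default: ok
  check_fast_shutdown_duration(std::chrono::seconds(30), 0);  // 0 => unlimited: never asserts
  std::cout << "timeout checks passed\n";
  return 0;
}
```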