// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
* Ceph - scalable distributed file system
*
* Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
* Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
*
* Author: Loic Dachary <loic@dachary.org>
*
* This is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2.1, as published by the Free Software
* Foundation. See file COPYING.
*
*/
/* Object Store Device (OSD) Monitor
*/
#ifndef CEPH_OSDMONITOR_H
#define CEPH_OSDMONITOR_H
#include <map>
#include <set>
#include "include/types.h"
#include "common/simple_cache.hpp"
#include "msg/Messenger.h"
#include "osd/OSDMap.h"
#include "osd/OSDMapMapping.h"
#include "CreatingPGs.h"
#include "PaxosService.h"
class Monitor;
class PGMap;
class MonSession;
class MOSDMap;
#include "erasure-code/ErasureCodeInterface.h"
#include "mon/MonOpRequest.h"
/// One peer's failure report against a single osd.
struct failure_reporter_t {
  utime_t failed_since;  ///< time at which the reporter believes the osd failed
  MonOpRequestRef op;    ///< op request that carried the failure report

  /// Build an empty record: default-constructed timestamp, no op attached.
  failure_reporter_t() = default;

  /// Build a record for a reporter that says the osd failed at @p s.
  explicit failure_reporter_t(utime_t s)
    : failed_since(s)
  {}

  ~failure_reporter_t() = default;
};
/// information about all failure reports for one osd
struct failure_info_t {
map<int, failure_reporter_t> reporters; ///< reporter -> failed_since etc
utime_t max_failed_since; ///< most recent failed_since
failure_info_t() {}
utime_t get_failed_since() {
if (max_failed_since == utime_t() && !reporters.empty()) {
// the old max must have canceled; recalculate.
for (map<int, failure_reporter_t>::iterator p = reporters.begin();
p != reporters.end();
++p)
if (p->second.failed_since > max_failed_since)
max_failed_since = p->second.failed_since;
}
return max_failed_since;
}
// set the message for the latest report. return any old op request we had,
// if any, so we can discard it.
MonOpRequestRef add_report(int who, utime_t failed_since,
MonOpRequestRef op) {
map<int, failure_reporter_t>::iterator p = reporters.find(who);
if (p == reporters.end()) {
if (max_failed_since < failed_since)
max_failed_since = failed_since;
p = reporters.insert(map<int, failure_reporter_t>::value_type(who, failure_reporter_t(failed_since))).first;
}
MonOpRequestRef ret = p->second.op;
p->second.op = op;
return ret;
}
void take_report_messages(list<MonOpRequestRef>& ls) {
for (map<int, failure_reporter_t>::iterator p = reporters.begin();
p != reporters.end();
++p) {
if (p->second.op) {
ls.push_back(p->second.op);
p->second.op.reset();
}
}
}
MonOpRequestRef cancel_report(int who) {
map<int, failure_reporter_t>::iterator p = reporters.find(who);
if (p == reporters.end())
return MonOpRequestRef();
MonOpRequestRef ret = p->second.op;
reporters.erase(p);
return ret;
}
};
/// Bookkeeping for the last_epoch_clean values passed to report(), stored
/// per pool in report_by_pool.  Only declarations are visible in this header;
/// the member functions are defined elsewhere.
class LastEpochClean {
/// Per-pool record; this is the mapped type of report_by_pool.
struct Lec {
// presumably one epoch per PG of the pool, indexed by ps -- TODO confirm
vector<epoch_t> epoch_by_pg;
// starts at 0; NOTE(review): looks like the first ps without a report -- confirm
ps_t next_missing = 0;
// starts at the largest representable epoch_t
epoch_t floor = std::numeric_limits<epoch_t>::max();
void report(ps_t pg, epoch_t last_epoch_clean);
};
// keyed by a uint64_t; presumably the pool id (cf. remove_pool) -- confirm
std::map<uint64_t, Lec> report_by_pool;
public:
/// Record @p last_epoch_clean for @p pg.
void report(const pg_t& pg, epoch_t last_epoch_clean);
/// Presumably drops the record kept for @p pool -- verify against the definition.
void remove_pool(uint64_t pool);
/// Returns an epoch computed against @p latest; by its name a lower bound
/// over the reported values -- verify against the definition.
epoch_t get_lower_bound(const OSDMap& latest) const;
};
class OSDMonitor : public PaxosService {
CephContext *cct;
public:
OSDMap osdmap;
// [leader]
OSDMap::Incremental pending_inc;
map<int, bufferlist> pending_metadata;
set<int> pending_metadata_rm;
map<int, failure_info_t> failure_info;
map<int,utime_t> down_pending_out; // osd down -> out
map<int,double> osd_weight;
SimpleLRU<version_t, bufferlist> inc_osd_cache;
SimpleLRU<version_t, bufferlist> full_osd_cache;
bool check_failures(utime_t now);
bool check_failure(utime_t now, int target_osd, failure_info_t& fi);
void force_failure(int target_osd, int by);
// the time of last msg(MSG_ALIVE and MSG_PGTEMP) proposed without delay
utime_t last_attempted_minwait_time;
bool _have_pending_crush();
CrushWrapper &_get_stable_crush();
void _get_pending_crush(CrushWrapper& newcrush);
/// Tri-state selector; it is the type of the `fast_read` parameter of
/// prepare_new_pool().
/// NOTE(review): FAST_READ_DEFAULT presumably means "no explicit choice,
/// use the configured default" -- confirm against the implementation.
enum FastReadType {
FAST_READ_OFF,
FAST_READ_ON,
FAST_READ_DEFAULT
};
// svc
public:
void create_initial() override;
void get_store_prefixes(std::set<string>& s) const override;
private:
void update_from_paxos(bool *need_bootstrap) override;
void create_pending() override; // prepare a new pending
void encode_pending(MonitorDBStore::TransactionRef t) override;
void on_active() override;
void on_restart() override;
void on_shutdown() override;
/**
* Intentionally empty override: stashing of full versions has not been
* delegated to PaxosService for some time, so this hook has nothing to
* encode in the current context.
*/
void encode_full(MonitorDBStore::TransactionRef t) override { }
/**
* Always returns false so that PaxosService does not periodically stash full
* osdmaps, which would break our locally-managed full maps.
* (update_from_paxos loads the latest full map and writes them out going
* forward from there, but if we have just synced that may mean we skip some.)
*/
bool should_stash_full() override {
return false;
}
/**
* hook into trim to include the oldest full map in the trim transaction
*
* This ensures that anyone post-sync will have enough to rebuild their
* full osdmaps.
*/
void encode_trim_extra(MonitorDBStore::TransactionRef tx, version_t first) override;
void update_msgr_features();
int check_cluster_features(uint64_t features, stringstream &ss);
/**
* check if the cluster supports the features required by the
* given crush map. Outputs the daemons which don't support it
* to the stringstream.
*
* @returns true if the map is passable, false otherwise
*/
bool validate_crush_against_features(const CrushWrapper *newcrush,
stringstream &ss);
void check_osdmap_subs();
void share_map_with_random_osd();
Mutex prime_pg_temp_lock = {"OSDMonitor::prime_pg_temp_lock"};
/// ParallelPGMapper job that calls OSDMonitor::prime_pg_temp() for every PG
/// in the ranges handed to process().
struct PrimeTempJob : public ParallelPGMapper::Job {
  OSDMonitor *osdmon;  ///< monitor whose prime_pg_temp() is invoked per PG

  PrimeTempJob(const OSDMap& om, OSDMonitor *m)
    : ParallelPGMapper::Job(&om),
      osdmon(m)
  {}

  /// Visit each placement seed in [ps_begin, ps_end) of @p pool and pass the
  /// resulting pg_t, together with the job's osdmap, to prime_pg_temp().
  void process(int64_t pool, unsigned ps_begin, unsigned ps_end) override {
    unsigned seed = ps_begin;
    while (seed < ps_end) {
      osdmon->prime_pg_temp(*osdmap, pg_t(seed, pool));
      ++seed;
    }
  }

  /// No work is done on completion.
  void complete() override {}
};
void maybe_prime_pg_temp();
void prime_pg_temp(const OSDMap& next, pg_t pgid);
ParallelPGMapper mapper; ///< for background pg work
OSDMapMapping mapping; ///< pg <-> osd mappings
unique_ptr<ParallelPGMapper::Job> mapping_job; ///< background mapping job
void start_mapping();
void update_logger();
void handle_query(PaxosServiceMessage *m);
bool preprocess_query(MonOpRequestRef op) override; // true if processed.
bool prepare_update(MonOpRequestRef op) override;
bool should_propose(double &delay) override;
version_t get_trim_to() const override;
bool can_mark_down(int o);
bool can_mark_up(int o);
bool can_mark_out(int o);
bool can_mark_in(int o);
// ...
MOSDMap *build_latest_full();
MOSDMap *build_incremental(epoch_t first, epoch_t last);
void send_full(MonOpRequestRef op);
void send_incremental(MonOpRequestRef op, epoch_t first);
public:
// @param req an optional op request, if the osdmaps are replies to it. so
// @c Monitor::send_reply() can mark_event with it.
void send_incremental(epoch_t first, MonSession *session, bool onetime,
MonOpRequestRef req = MonOpRequestRef());
private:
void print_utilization(ostream &out, Formatter *f, bool tree) const;
bool check_source(PaxosServiceMessage *m, uuid_d fsid);
bool preprocess_get_osdmap(MonOpRequestRef op);
bool preprocess_mark_me_down(MonOpRequestRef op);
friend class C_AckMarkedDown;
bool preprocess_failure(MonOpRequestRef op);
bool prepare_failure(MonOpRequestRef op);
bool prepare_mark_me_down(MonOpRequestRef op);
void process_failures();
void take_all_failures(list<MonOpRequestRef>& ls);
bool preprocess_full(MonOpRequestRef op);
bool prepare_full(MonOpRequestRef op);
bool preprocess_boot(MonOpRequestRef op);
bool prepare_boot(MonOpRequestRef op);
void _booted(MonOpRequestRef op, bool logit);
void update_up_thru(int from, epoch_t up_thru);
bool preprocess_alive(MonOpRequestRef op);
bool prepare_alive(MonOpRequestRef op);
void _reply_map(MonOpRequestRef op, epoch_t e);
bool preprocess_pgtemp(MonOpRequestRef op);
bool prepare_pgtemp(MonOpRequestRef op);
bool preprocess_pg_created(MonOpRequestRef op);
bool prepare_pg_created(MonOpRequestRef op);
int _check_remove_pool(int64_t pool_id, const pg_pool_t &pool, ostream *ss);
bool _check_become_tier(
int64_t tier_pool_id, const pg_pool_t *tier_pool,
int64_t base_pool_id, const pg_pool_t *base_pool,
int *err, ostream *ss) const;
bool _check_remove_tier(
int64_t base_pool_id, const pg_pool_t *base_pool, const pg_pool_t *tier_pool,
int *err, ostream *ss) const;
int _prepare_remove_pool(int64_t pool, ostream *ss, bool no_fake);
int _prepare_rename_pool(int64_t pool, string newname);
bool preprocess_pool_op (MonOpRequestRef op);
bool preprocess_pool_op_create (MonOpRequestRef op);
bool prepare_pool_op (MonOpRequestRef op);
bool prepare_pool_op_create (MonOpRequestRef op);
bool prepare_pool_op_delete(MonOpRequestRef op);
int crush_rename_bucket(const string& srcname,
const string& dstname,
ostream *ss);
void check_legacy_ec_plugin(const string& plugin,
const string& profile) const;
int normalize_profile(const string& profilename,
ErasureCodeProfile &profile,
bool force,
ostream *ss);
int crush_rule_create_erasure(const string &name,
const string &profile,
int *rule,
ostream *ss);
int get_crush_rule(const string &rule_name,
int *crush_rule,
ostream *ss);
int get_erasure_code(const string &erasure_code_profile,
ErasureCodeInterfaceRef *erasure_code,
ostream *ss) const;
int prepare_pool_crush_rule(const unsigned pool_type,
const string &erasure_code_profile,
const string &rule_name,
int *crush_rule,
ostream *ss);
bool erasure_code_profile_in_use(
const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
const string &profile,
ostream *ss);
int parse_erasure_code_profile(const vector<string> &erasure_code_profile,
map<string,string> *erasure_code_profile_map,
ostream *ss);
int prepare_pool_size(const unsigned pool_type,
const string &erasure_code_profile,
unsigned *size, unsigned *min_size,
ostream *ss);
int prepare_pool_stripe_width(const unsigned pool_type,
const string &erasure_code_profile,
unsigned *stripe_width,
ostream *ss);
int check_pg_num(int64_t pool, int pg_num, int size, ostream* ss);
int prepare_new_pool(string& name, uint64_t auid,
int crush_rule,
const string &crush_rule_name,
unsigned pg_num, unsigned pgp_num,
const string &erasure_code_profile,
const unsigned pool_type,
const uint64_t expected_num_objects,
FastReadType fast_read,
ostream *ss);
int prepare_new_pool(MonOpRequestRef op);
void set_pool_flags(int64_t pool_id, uint64_t flags);
void clear_pool_flags(int64_t pool_id, uint64_t flags);
bool update_pools_status();
string make_snap_epoch_key(int64_t pool, epoch_t epoch);
string make_snap_key(int64_t pool, snapid_t snap);
string make_snap_key_value(int64_t pool, snapid_t snap, snapid_t num,
epoch_t epoch, bufferlist *v);
string make_snap_purged_key(int64_t pool, snapid_t snap);
string make_snap_purged_key_value(int64_t pool, snapid_t snap, snapid_t num,
epoch_t epoch, bufferlist *v);
bool try_prune_purged_snaps();
int lookup_pruned_snap(int64_t pool, snapid_t snap,
snapid_t *begin, snapid_t *end);
bool prepare_set_flag(MonOpRequestRef op, int flag);
bool prepare_unset_flag(MonOpRequestRef op, int flag);
void _pool_op_reply(MonOpRequestRef op,
int ret, epoch_t epoch, bufferlist *blp=NULL);
/// Completion callback that finishes a boot op via OSDMonitor::_booted().
struct C_Booted : public C_MonOp {
  OSDMonitor *cmon;  ///< monitor that handles the completion
  bool logit;        ///< forwarded to _booted()

  C_Booted(OSDMonitor *cm, MonOpRequestRef op_, bool l=true)
    : C_MonOp(op_),
      cmon(cm),
      logit(l)
  {}

  /// Dispatch on the completion code @p r:
  ///  - r >= 0       : call _booted() with the op
  ///  - -ECANCELED   : do nothing
  ///  - -EAGAIN      : hand the op back to dispatch()
  ///  - anything else: assertion failure
  void _finish(int r) override {
    if (r >= 0) {
      cmon->_booted(op, logit);
      return;
    }
    switch (r) {
    case -ECANCELED:
      break;
    case -EAGAIN:
      cmon->dispatch(op);
      break;
    default:
      assert(0 == "bad C_Booted return value");
    }
  }
};
/// Completion callback that answers an op via OSDMonitor::_reply_map().
struct C_ReplyMap : public C_MonOp {
  OSDMonitor *osdmon;  ///< monitor that handles the completion
  epoch_t e;           ///< epoch forwarded to _reply_map()

  C_ReplyMap(OSDMonitor *o, MonOpRequestRef op_, epoch_t ee)
    : C_MonOp(op_),
      osdmon(o),
      e(ee)
  {}

  /// Dispatch on the completion code @p r:
  ///  - r >= 0       : call _reply_map() with the op and stored epoch
  ///  - -ECANCELED   : do nothing
  ///  - -EAGAIN      : hand the op back to dispatch()
  ///  - anything else: assertion failure
  void _finish(int r) override {
    if (r >= 0) {
      osdmon->_reply_map(op, e);
      return;
    }
    switch (r) {
    case -ECANCELED:
      break;
    case -EAGAIN:
      osdmon->dispatch(op);
      break;
    default:
      assert(0 == "bad C_ReplyMap return value");
    }
  }
};
/// Completion callback that answers a pool op via OSDMonitor::_pool_op_reply().
struct C_PoolOp : public C_MonOp {
  OSDMonitor *osdmon;     ///< monitor that handles the completion
  int replyCode;          ///< return code forwarded to _pool_op_reply()
  int epoch;              ///< epoch forwarded to _pool_op_reply()
  bufferlist reply_data;  ///< copy of the caller's payload; empty if none given

  /// @param rd optional payload; when non-null its contents are copied into
  ///           reply_data, so the caller's buffer need not outlive this object.
  C_PoolOp(OSDMonitor * osd, MonOpRequestRef op_, int rc, int e, bufferlist *rd=NULL)
    : C_MonOp(op_),
      osdmon(osd),
      replyCode(rc),
      epoch(e)
  {
    if (rd != NULL) {
      reply_data = *rd;
    }
  }

  /// Dispatch on the completion code @p r:
  ///  - r >= 0       : call _pool_op_reply() with the stored code, epoch, data
  ///  - -ECANCELED   : do nothing
  ///  - -EAGAIN      : hand the op back to dispatch()
  ///  - anything else: assertion failure
  void _finish(int r) override {
    if (r >= 0) {
      osdmon->_pool_op_reply(op, replyCode, epoch, &reply_data);
      return;
    }
    switch (r) {
    case -ECANCELED:
      break;
    case -EAGAIN:
      osdmon->dispatch(op);
      break;
    default:
      assert(0 == "bad C_PoolOp return value");
    }
  }
};
bool preprocess_remove_snaps(MonOpRequestRef op);
bool prepare_remove_snaps(MonOpRequestRef op);
OpTracker op_tracker;
int load_metadata(int osd, map<string, string>& m, ostream *err);
void count_metadata(const string& field, Formatter *f);
public:
void count_metadata(const string& field, map<string,int> *out);
protected:
int get_osd_objectstore_type(int osd, std::string *type);
bool is_pool_currently_all_bluestore(int64_t pool_id, const pg_pool_t &pool,
ostream *err);
// when we last received PG stats from each osd
map<int,utime_t> last_osd_report;
// TODO: use last_osd_report to store the osd report epochs, once we don't
// need to upgrade from pre-luminous releases.
map<int,epoch_t> osd_epochs;
LastEpochClean last_epoch_clean;
bool preprocess_beacon(MonOpRequestRef op);
bool prepare_beacon(MonOpRequestRef op);
epoch_t get_min_last_epoch_clean() const;
friend class C_UpdateCreatingPGs;
std::map<int, std::map<epoch_t, std::set<pg_t>>> creating_pgs_by_osd_epoch;
std::vector<pg_t> pending_created_pgs;
// the epoch when the pg mapping was calculated
epoch_t creating_pgs_epoch = 0;
creating_pgs_t creating_pgs;
mutable std::mutex creating_pgs_lock;
creating_pgs_t update_pending_pgs(const OSDMap::Incremental& inc,
const OSDMap& nextmap);
unsigned scan_for_creating_pgs(
const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
const mempool::osdmap::set<int64_t>& removed_pools,
utime_t modified,
creating_pgs_t* creating_pgs) const;
pair<int32_t, pg_t> get_parent_pg(pg_t pgid) const;
void update_creating_pgs();
void check_pg_creates_subs();
epoch_t send_pg_creates(int osd, Connection *con, epoch_t next) const;
int32_t _allocate_osd_id(int32_t* existing_id);
public:
OSDMonitor(CephContext *cct, Monitor *mn, Paxos *p, const string& service_name);
void tick() override; // check state, take actions
bool preprocess_command(MonOpRequestRef op);
bool prepare_command(MonOpRequestRef op);
bool prepare_command_impl(MonOpRequestRef op, const cmdmap_t& cmdmap);
int validate_osd_create(
const int32_t id,
const uuid_d& uuid,
const bool check_osd_exists,
int32_t* existing_id,
stringstream& ss);
int prepare_command_osd_create(
const int32_t id,
const uuid_d& uuid,
int32_t* existing_id,
stringstream& ss);
void do_osd_create(const int32_t id, const uuid_d& uuid,
const string& device_class,
int32_t* new_id);
int prepare_command_osd_purge(int32_t id, stringstream& ss);
int prepare_command_osd_destroy(int32_t id, stringstream& ss);
int _prepare_command_osd_crush_remove(
CrushWrapper &newcrush,
int32_t id,
int32_t ancestor,
bool has_ancestor,
bool unlink_only);
void do_osd_crush_remove(CrushWrapper& newcrush);
int prepare_command_osd_crush_remove(
CrushWrapper &newcrush,
int32_t id,
int32_t ancestor,
bool has_ancestor,
bool unlink_only);
int prepare_command_osd_remove(int32_t id);
int prepare_command_osd_new(
MonOpRequestRef op,
const cmdmap_t& cmdmap,
const map<string,string>& secrets,
stringstream &ss,
Formatter *f);
int prepare_command_pool_set(const cmdmap_t& cmdmap,
stringstream& ss);
int prepare_command_pool_application(const string &prefix,
const cmdmap_t& cmdmap,
stringstream& ss);
bool handle_osd_timeouts(const utime_t &now,
std::map<int,utime_t> &last_osd_report);
void send_latest(MonOpRequestRef op, epoch_t start=0);
/// Call op->mark_osdmon_event() with this function's name, then forward to
/// send_incremental(op, start).
/// @param op    the op request being answered
/// @param start first epoch passed on to send_incremental(); defaults to 0
void send_latest_now_nodelete(MonOpRequestRef op, epoch_t start=0) {
// __func__ is this function's own name, so it must be evaluated here
op->mark_osdmon_event(__func__);
send_incremental(op, start);
}
void get_removed_snaps_range(
epoch_t start, epoch_t end,
mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps);
int get_version(version_t ver, bufferlist& bl) override;
int get_version_full(version_t ver, bufferlist& bl) override;
epoch_t blacklist(const entity_addr_t& a, utime_t until);
void dump_info(Formatter *f);
int dump_osd_metadata(int osd, Formatter *f, ostream *err);
void print_nodes(Formatter *f);
void check_osdmap_sub(Subscription *sub);
void check_pg_creates_sub(Subscription *sub);
void do_application_enable(int64_t pool_id, const std::string &app_name,
const std::string &app_key="",
const std::string &app_value="");
/// Stage @p flag to be set in pending_inc.new_flags.
///
/// Nothing happens when osdmap.flags already has any bit of @p flag set.
/// A negative pending_inc.new_flags is first seeded from osdmap.flags
/// (NOTE(review): negative appears to mean "no pending flag change" -- confirm).
void add_flag(int flag) {
  if (osdmap.flags & flag)
    return;
  if (pending_inc.new_flags < 0)
    pending_inc.new_flags = osdmap.flags;
  pending_inc.new_flags |= flag;
}
/// Stage @p flag to be cleared in pending_inc.new_flags.
///
/// Nothing happens when osdmap.flags has no bit of @p flag set.
/// A negative pending_inc.new_flags is first seeded from osdmap.flags
/// (NOTE(review): negative appears to mean "no pending flag change" -- confirm).
void remove_flag(int flag) {
  if (!(osdmap.flags & flag))
    return;
  if (pending_inc.new_flags < 0)
    pending_inc.new_flags = osdmap.flags;
  pending_inc.new_flags &= ~flag;
}
};
#endif