From a5ce9c3863f98992193a9913344a046112c28dea Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 26 Jan 2024 12:23:03 -0800 Subject: [PATCH] Revert "crush: add multistep retry rules" This PR was merged by accident before it was ready. Let's revert for now and open a new PR. Signed-off-by: Samuel Just --- doc/rados/operations/crush-map-edits.rst | 30 +- doc/rados/operations/crush-map.rst | 22 - .../ec-rados-plugin=jerasure-k=4-m=2.yaml | 4 +- .../dashboard/test_erasure_code_profile.py | 2 +- src/crush/CrushCompiler.cc | 68 +- src/crush/CrushWrapper.cc | 190 +-- src/crush/CrushWrapper.h | 113 +- src/crush/crush.h | 23 +- src/crush/grammar.h | 16 +- src/crush/mapper.c | 1070 +---------------- src/crush/mapper.h | 14 +- src/erasure-code/ErasureCode.cc | 46 +- src/erasure-code/ErasureCode.h | 2 - src/include/ceph_features.h | 6 +- src/mon/OSDMonitor.cc | 12 +- src/osd/OSDMap.cc | 12 +- src/test/cli/crushtool/choose-args.t | 5 +- src/test/cli/osdmaptool/crush.t | 2 +- src/test/crush/crush.cc | 1044 ++-------------- src/vstart.sh | 34 - 20 files changed, 192 insertions(+), 2523 deletions(-) diff --git a/doc/rados/operations/crush-map-edits.rst b/doc/rados/operations/crush-map-edits.rst index 22e7e2f3772f3..46a4a4f74e873 100644 --- a/doc/rados/operations/crush-map-edits.rst +++ b/doc/rados/operations/crush-map-edits.rst @@ -419,7 +419,7 @@ centers for three-way replication, and yet another rule for erasure coding acros six storage devices. For a detailed discussion of CRUSH rules, see **Section 3.2** of `CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_. -A normal CRUSH rule takes the following form:: +A rule takes the following form:: rule { @@ -430,18 +430,6 @@ A normal CRUSH rule takes the following form:: step emit } -CRUSH MSR rules are a distinct type of CRUSH rule which supports retrying steps -and provides better support for configurations that require multiple OSDs within -each failure domain. MSR rules take the following form:: - - rule { - - id [a unique integer ID] - type [msr_indep|msr_firsn] - step take [class ] - step choosemsr type - step emit - } ``id`` :Description: A unique integer that identifies the rule. @@ -453,14 +441,12 @@ each failure domain. MSR rules take the following form:: ``type`` :Description: Denotes the type of replication strategy to be enforced by the - rule. msr_firstn and msr_indep are a distinct descent algorithm - which supports retrying steps within the rule and therefore - multiple OSDs per failure domain. + rule. :Purpose: A component of the rule mask. :Type: String :Required: Yes :Default: ``replicated`` - :Valid Values: ``replicated``, ``erasure``, ``msr_firstn``, ``msr_indep`` + :Valid Values: ``replicated`` or ``erasure`` ``step take [class ]`` @@ -539,16 +525,6 @@ each failure domain. MSR rules take the following form:: final CRUSH mapping transformation is therefore 1, 2, 3, 4, 5 → 1, 2, 6, 4, 5. -``step choosemsr {num} type {bucket-type}`` - :Description: Selects a num buckets of type bucket-type. msr_firstn and msr_indep - must use choosemsr rather than choose or chooseleaf. - - - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (as many buckets as are available). - - If ``pool-num-replicas > {num} > 0``, choose that many buckets. - :Purpose: Choose step required for msr_firstn and msr_indep rules. - :Prerequisite: Follows ``step take`` and precedes ``step emit`` - :Example: ``step choosemsr 3 type host`` - .. _crush-reclassify: Migrating from a legacy SSD rule to device classes diff --git a/doc/rados/operations/crush-map.rst b/doc/rados/operations/crush-map.rst index e18d593253d9d..39151e6d4a766 100644 --- a/doc/rados/operations/crush-map.rst +++ b/doc/rados/operations/crush-map.rst @@ -709,13 +709,6 @@ The relevant erasure-code profile properties are as follows: [default: ``default``]. * **crush-failure-domain**: the CRUSH bucket type used in the distribution of erasure-coded shards [default: ``host``]. - * **crush-osds-per-failure-domain**: Maximum number of OSDs to place in each - failure domain -- defaults to 1. Using a value greater than one will - cause a CRUSH MSR rule to be created, see below. Must be specified if - crush-num-failure-domains is specified. - * **crush-num-failure-domains**: Number of failure domains to map. Must be - specified if crush-osds-per-failure-domain is specified. Results in - a CRUSH MSR rule being created. * **crush-device-class**: the device class on which to place data [default: none, which means that all devices are used]. * **k** and **m** (and, for the ``lrc`` plugin, **l**): these determine the @@ -733,21 +726,6 @@ The relevant erasure-code profile properties are as follows: argument is omitted, then Ceph will create the CRUSH rule automatically. -CRUSH MSR Rules ---------------- - -Creating an erasure-code profile with a crush-osds-per-failure-domain -value greater than one will cause a CRUSH MSR rule type to be created -instead of a normal CRUSH rule. Normal crush rules cannot retry prior -steps when an out OSD is encountered and rely on CHOOSELEAF steps to -permit moving OSDs to new hosts. However, CHOOSELEAF rules don't -support more than a single OSD per failure domain. MSR rules, new in -squid, support multiple OSDs per failure domain by retrying all prior -steps when an out OSD is encountered. Using MSR rules requires that -OSDs and clients be required to support the CRUSH_MSR feature bit -(squid or newer). - - Deleting rules -------------- diff --git a/qa/erasure-code/ec-rados-plugin=jerasure-k=4-m=2.yaml b/qa/erasure-code/ec-rados-plugin=jerasure-k=4-m=2.yaml index a0cd68a55f534..dfcc61607a7d0 100644 --- a/qa/erasure-code/ec-rados-plugin=jerasure-k=4-m=2.yaml +++ b/qa/erasure-code/ec-rados-plugin=jerasure-k=4-m=2.yaml @@ -11,9 +11,7 @@ tasks: k: 4 m: 2 technique: reed_sol_van - crush-failure-domain: host - crush-osds-per-failure-domain: 2 - crush-num-failure-domains: 3 + crush-failure-domain: osd op_weights: read: 100 write: 0 diff --git a/qa/tasks/mgr/dashboard/test_erasure_code_profile.py b/qa/tasks/mgr/dashboard/test_erasure_code_profile.py index a509140089343..7fb7c1c8270fa 100644 --- a/qa/tasks/mgr/dashboard/test_erasure_code_profile.py +++ b/qa/tasks/mgr/dashboard/test_erasure_code_profile.py @@ -79,7 +79,7 @@ def test_create_plugin(self): self.assertStatus(201) self._get('/api/erasure_code_profile/lrc') - self.assertJsonSubset({ + self.assertJsonBody({ 'crush-device-class': '', 'crush-failure-domain': 'host', 'crush-root': 'default', diff --git a/src/crush/CrushCompiler.cc b/src/crush/CrushCompiler.cc index c884caed00e6b..5e51aad8dba49 100644 --- a/src/crush/CrushCompiler.cc +++ b/src/crush/CrushCompiler.cc @@ -321,13 +321,6 @@ int CrushCompiler::decompile(ostream &out) if (crush.get_allowed_bucket_algs() != CRUSH_LEGACY_ALLOWED_BUCKET_ALGS) out << "tunable allowed_bucket_algs " << crush.get_allowed_bucket_algs() << "\n"; - if (crush.has_nondefault_tunables_msr()) { - out << "tunable msr_descents " << crush.get_msr_descents() - << "\n"; - out << "tunable msr_collision_tries " - << crush.get_msr_collision_tries() - << "\n"; - } out << "\n# devices\n"; for (int i=0; ichildren[start+2]); int type; if (tname == "replicated") - type = CRUSH_RULE_TYPE_REPLICATED; + type = CEPH_PG_TYPE_REPLICATED; else if (tname == "erasure") - type = CRUSH_RULE_TYPE_ERASURE; - else if (tname == "msr_firstn") - type = CRUSH_RULE_TYPE_MSR_FIRSTN; - else if (tname == "msr_indep") - type = CRUSH_RULE_TYPE_MSR_INDEP; + type = CEPH_PG_TYPE_ERASURE; else ceph_abort(); @@ -942,18 +905,6 @@ int CrushCompiler::parse_rule(iter_t const& i) crush.set_rule_step_set_chooseleaf_stable(ruleno, step++, val); } break; - case crush_grammar::_step_set_msr_descents: - { - int val = int_node(s->children[1]); - crush.set_rule_step_set_msr_descents(ruleno, step++, val); - } - break; - case crush_grammar::_step_set_msr_collision_tries: - { - int val = int_node(s->children[1]); - crush.set_rule_step_set_msr_collision_tries(ruleno, step++, val); - } - break; case crush_grammar::_step_choose: case crush_grammar::_step_chooseleaf: @@ -981,17 +932,6 @@ int CrushCompiler::parse_rule(iter_t const& i) } break; - case crush_grammar::_step_choose_msr: - { - string type = string_node(s->children[3]); - if (!type_id.count(type)) { - err << "in rule '" << rname << "' type '" << type << "' not defined" << std::endl; - return -1; - } - crush.set_rule_step_choose_msr(ruleno, step++, int_node(s->children[1]), type_id[type]); - } - break; - case crush_grammar::_step_emit: crush.set_rule_step_emit(ruleno, step++); break; diff --git a/src/crush/CrushWrapper.cc b/src/crush/CrushWrapper.cc index 4850e36f9b5cb..0f40e6875e1be 100644 --- a/src/crush/CrushWrapper.cc +++ b/src/crush/CrushWrapper.cc @@ -135,29 +135,6 @@ bool CrushWrapper::is_v5_rule(unsigned ruleid) const return false; } -bool CrushWrapper::has_msr_rules() const -{ - for (unsigned i=0; imax_rules; i++) { - if (is_msr_rule(i)) { - return true; - } - } - return false; -} - -bool CrushWrapper::is_msr_rule(unsigned ruleid) const -{ - if (ruleid >= crush->max_rules) - return false; - - crush_rule *r = crush->rules[ruleid]; - if (!r) - return false; - - return r->type == CRUSH_RULE_TYPE_MSR_INDEP || - r->type == CRUSH_RULE_TYPE_MSR_FIRSTN; -} - bool CrushWrapper::has_choose_args() const { return !choose_args.empty(); @@ -2261,7 +2238,6 @@ void CrushWrapper::reweight_bucket( int CrushWrapper::add_simple_rule_at( string name, string root_name, string failure_domain_name, - int num_failure_domains, string device_class, string mode, int rule_type, int rno, @@ -2333,19 +2309,17 @@ int CrushWrapper::add_simple_rule_at( } crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0); if (type) - crush_rule_set_step( - rule, step++, - mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN : - CRUSH_RULE_CHOOSELEAF_INDEP, - num_failure_domains <= 0 ? CRUSH_CHOOSE_N : num_failure_domains, - type); + crush_rule_set_step(rule, step++, + mode == "firstn" ? CRUSH_RULE_CHOOSELEAF_FIRSTN : + CRUSH_RULE_CHOOSELEAF_INDEP, + CRUSH_CHOOSE_N, + type); else - crush_rule_set_step( - rule, step++, - mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN : - CRUSH_RULE_CHOOSE_INDEP, - num_failure_domains <= 0 ? CRUSH_CHOOSE_N : num_failure_domains, - 0); + crush_rule_set_step(rule, step++, + mode == "firstn" ? CRUSH_RULE_CHOOSE_FIRSTN : + CRUSH_RULE_CHOOSE_INDEP, + CRUSH_CHOOSE_N, + 0); crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); int ret = crush_add_rule(crush, rule, rno); @@ -2361,125 +2335,13 @@ int CrushWrapper::add_simple_rule_at( int CrushWrapper::add_simple_rule( string name, string root_name, string failure_domain_name, - int num_failure_domains, string device_class, string mode, int rule_type, ostream *err) { - return add_simple_rule_at( - name, root_name, failure_domain_name, num_failure_domains, - device_class, - mode, - rule_type, -1, err); -} - -int CrushWrapper::add_multi_osd_per_failure_domain_rule_at( - string name, string root_name, string failure_domain_name, - int num_failure_domains, - int osds_per_failure_domain, - string device_class, - crush_rule_type rule_type, - int rno, - ostream *err) -{ - if (rule_exists(name)) { - if (err) - *err << "rule " << name << " exists"; - return -EEXIST; - } - if (rno >= 0) { - if (rule_exists(rno)) { - if (err) - *err << "rule with ruleno " << rno << " exists"; - return -EEXIST; - } - } else { - for (rno = 0; rno < get_max_rules(); rno++) { - if (!rule_exists(rno)) - break; - } - } - if (!name_exists(root_name)) { - if (err) - *err << "root item " << root_name << " does not exist"; - return -ENOENT; - } - int root = get_item_id(root_name); - int type = 0; - if (failure_domain_name.length()) { - type = get_type_id(failure_domain_name); - if (type < 0) { - if (err) - *err << "unknown type " << failure_domain_name; - return -EINVAL; - } - } - if (device_class.size()) { - if (!class_exists(device_class)) { - if (err) - *err << "device class " << device_class << " does not exist"; - return -EINVAL; - } - int c = get_class_id(device_class); - if (class_bucket.count(root) == 0 || - class_bucket[root].count(c) == 0) { - if (err) - *err << "root " << root_name << " has no devices with class " - << device_class; - return -EINVAL; - } - root = class_bucket[root][c]; - } - if (rule_type != CRUSH_RULE_TYPE_MSR_INDEP && - rule_type != CRUSH_RULE_TYPE_MSR_FIRSTN) { - if (err) - *err << "unknown rule_type " << rule_type; - return -EINVAL; - } - - int steps = 4; - crush_rule *rule = crush_make_rule(steps, rule_type); - ceph_assert(rule); - int step = 0; - crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0); - crush_rule_set_step(rule, step++, - CRUSH_RULE_CHOOSE_MSR, - num_failure_domains, - type); - crush_rule_set_step(rule, step++, - CRUSH_RULE_CHOOSE_MSR, - osds_per_failure_domain, - 0); - crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0); - - int ret = crush_add_rule(crush, rule, rno); - if(ret < 0) { - *err << "failed to add rule " << rno << " because " << cpp_strerror(ret); - return ret; - } - set_rule_name(rno, name); - have_rmaps = false; - return rno; -} - - -int CrushWrapper::add_indep_multi_osd_per_failure_domain_rule( - string name, string root_name, - string failure_domain_name, - int num_failure_domains, - int osds_per_failure_domain, - string device_class, - ostream *err) -{ - return add_multi_osd_per_failure_domain_rule_at( - name, root_name, - failure_domain_name, - num_failure_domains, - osds_per_failure_domain, - device_class, - CRUSH_RULE_TYPE_MSR_INDEP, - -1, - err); + return add_simple_rule_at(name, root_name, failure_domain_name, device_class, + mode, + rule_type, -1, err); } float CrushWrapper::_get_take_weight_osd_map(int root, @@ -3218,10 +3080,6 @@ void CrushWrapper::encode(bufferlist& bl, uint64_t features) const } } } - if (HAVE_FEATURE(features, CRUSH_MSR)) { - encode(crush->msr_descents, bl); - encode(crush->msr_collision_tries, bl); - } } static void decode_32_or_64_string_map(map& m, bufferlist::const_iterator& blp) @@ -3372,12 +3230,6 @@ void CrushWrapper::decode(bufferlist::const_iterator& blp) choose_args[choose_args_index] = arg_map; } } - if (!blp.end()) { - decode(crush->msr_descents, blp); - decode(crush->msr_collision_tries, blp); - } else { - set_default_msr_tunables(); - } update_choose_args(nullptr); // in case we decode a legacy "corrupted" map finalize(); } @@ -3633,8 +3485,6 @@ void CrushWrapper::dump_tunables(Formatter *f) const f->dump_int("chooseleaf_descend_once", get_chooseleaf_descend_once()); f->dump_int("chooseleaf_vary_r", get_chooseleaf_vary_r()); f->dump_int("chooseleaf_stable", get_chooseleaf_stable()); - f->dump_int("msr_descents", get_msr_descents()); - f->dump_int("msr_collision_tries", get_msr_collision_tries()); f->dump_int("straw_calc_version", get_straw_calc_version()); f->dump_int("allowed_bucket_algs", get_allowed_bucket_algs()); @@ -3665,7 +3515,6 @@ void CrushWrapper::dump_tunables(Formatter *f) const f->dump_int("has_v4_buckets", (int)has_v4_buckets()); f->dump_int("require_feature_tunables5", (int)has_nondefault_tunables5()); f->dump_int("has_v5_rules", (int)has_v5_rules()); - f->dump_int("has_msr_rules", (int)has_msr_rules()); } void CrushWrapper::dump_choose_args(Formatter *f) const @@ -3764,11 +3613,6 @@ void CrushWrapper::dump_rule(int rule_id, Formatter *f) const f->dump_int("num", get_rule_arg1(rule_id, j)); f->dump_string("type", get_type_name(get_rule_arg2(rule_id, j))); break; - case CRUSH_RULE_CHOOSE_MSR: - f->dump_string("op", "choosemsr"); - f->dump_int("num", get_rule_arg1(rule_id, j)); - f->dump_string("type", get_type_name(get_rule_arg2(rule_id, j))); - break; case CRUSH_RULE_SET_CHOOSE_TRIES: f->dump_string("op", "set_choose_tries"); f->dump_int("num", get_rule_arg1(rule_id, j)); @@ -3777,14 +3621,6 @@ void CrushWrapper::dump_rule(int rule_id, Formatter *f) const f->dump_string("op", "set_chooseleaf_tries"); f->dump_int("num", get_rule_arg1(rule_id, j)); break; - case CRUSH_RULE_SET_MSR_DESCENTS: - f->dump_string("op", "set_msr_descents"); - f->dump_int("num", get_rule_arg1(rule_id, j)); - break; - case CRUSH_RULE_SET_MSR_COLLISION_TRIES: - f->dump_string("op", "set_msr_collision_tries"); - f->dump_int("num", get_rule_arg1(rule_id, j)); - break; default: f->dump_int("opcode", get_rule_op(rule_id, j)); f->dump_int("arg1", get_rule_arg1(rule_id, j)); diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h index 317f4c28bdd6f..b8caa24ce621c 100644 --- a/src/crush/CrushWrapper.h +++ b/src/crush/CrushWrapper.h @@ -125,7 +125,6 @@ class CrushWrapper { crush->chooseleaf_vary_r = 0; crush->chooseleaf_stable = 0; crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS; - set_default_msr_tunables(); } void set_tunables_bobtail() { crush->choose_local_tries = 0; @@ -135,7 +134,6 @@ class CrushWrapper { crush->chooseleaf_vary_r = 0; crush->chooseleaf_stable = 0; crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS; - set_default_msr_tunables(); } void set_tunables_firefly() { crush->choose_local_tries = 0; @@ -145,7 +143,6 @@ class CrushWrapper { crush->chooseleaf_vary_r = 1; crush->chooseleaf_stable = 0; crush->allowed_bucket_algs = CRUSH_LEGACY_ALLOWED_BUCKET_ALGS; - set_default_msr_tunables(); } void set_tunables_hammer() { crush->choose_local_tries = 0; @@ -159,7 +156,6 @@ class CrushWrapper { (1 << CRUSH_BUCKET_LIST) | (1 << CRUSH_BUCKET_STRAW) | (1 << CRUSH_BUCKET_STRAW2); - set_default_msr_tunables(); } void set_tunables_jewel() { crush->choose_local_tries = 0; @@ -173,7 +169,6 @@ class CrushWrapper { (1 << CRUSH_BUCKET_LIST) | (1 << CRUSH_BUCKET_STRAW) | (1 << CRUSH_BUCKET_STRAW2); - set_default_msr_tunables(); } void set_tunables_legacy() { @@ -238,24 +233,6 @@ class CrushWrapper { crush->straw_calc_version = n; } - int get_msr_descents() const { - return crush->msr_descents; - } - void set_msr_descents(int n) { - crush->msr_descents = n; - } - - int get_msr_collision_tries() const { - return crush->msr_collision_tries; - } - void set_msr_collision_tries(int n) { - crush->msr_collision_tries = n; - } - void set_default_msr_tunables() { - set_msr_descents(100); - set_msr_collision_tries(100); - } - unsigned get_allowed_bucket_algs() const { return crush->allowed_bucket_algs; } @@ -271,8 +248,7 @@ class CrushWrapper { crush->chooseleaf_descend_once == 0 && crush->chooseleaf_vary_r == 0 && crush->chooseleaf_stable == 0 && - crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS && - !has_nondefault_tunables_msr(); + crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS; } bool has_bobtail_tunables() const { return @@ -282,8 +258,7 @@ class CrushWrapper { crush->chooseleaf_descend_once == 1 && crush->chooseleaf_vary_r == 0 && crush->chooseleaf_stable == 0 && - crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS && - !has_nondefault_tunables_msr(); + crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS; } bool has_firefly_tunables() const { return @@ -293,8 +268,7 @@ class CrushWrapper { crush->chooseleaf_descend_once == 1 && crush->chooseleaf_vary_r == 1 && crush->chooseleaf_stable == 0 && - crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS && - !has_nondefault_tunables_msr(); + crush->allowed_bucket_algs == CRUSH_LEGACY_ALLOWED_BUCKET_ALGS; } bool has_hammer_tunables() const { return @@ -307,8 +281,7 @@ class CrushWrapper { crush->allowed_bucket_algs == ((1 << CRUSH_BUCKET_UNIFORM) | (1 << CRUSH_BUCKET_LIST) | (1 << CRUSH_BUCKET_STRAW) | - (1 << CRUSH_BUCKET_STRAW2)) && - !has_nondefault_tunables_msr(); + (1 << CRUSH_BUCKET_STRAW2)); } bool has_jewel_tunables() const { return @@ -321,8 +294,7 @@ class CrushWrapper { crush->allowed_bucket_algs == ((1 << CRUSH_BUCKET_UNIFORM) | (1 << CRUSH_BUCKET_LIST) | (1 << CRUSH_BUCKET_STRAW) | - (1 << CRUSH_BUCKET_STRAW2)) && - !has_nondefault_tunables_msr(); + (1 << CRUSH_BUCKET_STRAW2)); } bool has_optimal_tunables() const { @@ -350,11 +322,6 @@ class CrushWrapper { return crush->chooseleaf_stable != 0; } - bool has_nondefault_tunables_msr() const { - return - crush->msr_descents != 100 || - crush->msr_collision_tries != 100; - } bool has_v2_rules() const; bool has_v3_rules() const; @@ -362,17 +329,13 @@ class CrushWrapper { bool has_v5_rules() const; bool has_choose_args() const; // any choose_args bool has_incompat_choose_args() const; // choose_args that can't be made compat - bool has_msr_rules() const; bool is_v2_rule(unsigned ruleid) const; bool is_v3_rule(unsigned ruleid) const; bool is_v5_rule(unsigned ruleid) const; - bool is_msr_rule(unsigned ruleid) const; std::string get_min_required_version() const { - if (has_msr_rules() || has_nondefault_tunables_msr()) - return "squid"; - else if (has_v5_rules() || has_nondefault_tunables5()) + if (has_v5_rules() || has_nondefault_tunables5()) return "jewel"; else if (has_v4_buckets()) return "hammer"; @@ -602,21 +565,6 @@ class CrushWrapper { if (have_rmaps) rule_name_rmap[name] = i; } - bool rule_valid_for_pool_type(int rule_id, int ptype) const { - auto rule_type = get_rule_type(rule_id); - switch (ptype) { - case CEPH_PG_TYPE_REPLICATED: - return rule_type == CRUSH_RULE_TYPE_REPLICATED || - rule_type == CRUSH_RULE_TYPE_MSR_FIRSTN; - case CEPH_PG_TYPE_ERASURE: - return rule_type == CRUSH_RULE_TYPE_ERASURE || - rule_type == CRUSH_RULE_TYPE_MSR_INDEP; - default: - ceph_assert(0 == "impossible"); - return false; - } - } - bool is_shadow_item(int id) const { const char *name = get_item_name(id); return name && !is_valid_crush_name(name); @@ -1203,14 +1151,6 @@ class CrushWrapper { int set_rule_step_set_chooseleaf_stable(unsigned ruleno, unsigned step, int val) { return set_rule_step(ruleno, step, CRUSH_RULE_SET_CHOOSELEAF_STABLE, val, 0); } - - int set_rule_step_set_msr_descents(unsigned ruleno, unsigned step, int val) { - return set_rule_step(ruleno, step, CRUSH_RULE_SET_MSR_DESCENTS, val, 0); - } - int set_rule_step_set_msr_collision_tries(unsigned ruleno, unsigned step, int val) { - return set_rule_step(ruleno, step, CRUSH_RULE_SET_MSR_COLLISION_TRIES, val, 0); - } - int set_rule_step_choose_firstn(unsigned ruleno, unsigned step, int val, int type) { return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_FIRSTN, val, type); } @@ -1223,61 +1163,22 @@ class CrushWrapper { int set_rule_step_choose_leaf_indep(unsigned ruleno, unsigned step, int val, int type) { return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSELEAF_INDEP, val, type); } - int set_rule_step_choose_msr(unsigned ruleno, unsigned step, int val, int type) { - return set_rule_step(ruleno, step, CRUSH_RULE_CHOOSE_MSR, val, type); - } int set_rule_step_emit(unsigned ruleno, unsigned step) { return set_rule_step(ruleno, step, CRUSH_RULE_EMIT, 0, 0); } int add_simple_rule( std::string name, std::string root_name, std::string failure_domain_type, - int num_failure_domains, std::string device_class, std::string mode, int rule_type, std::ostream *err = 0); - int add_simple_rule( - std::string name, std::string root_name, std::string failure_domain_type, - std::string device_class, std::string mode, int rule_type, - std::ostream *err = 0) { - return add_simple_rule( - name, root_name, failure_domain_type, -1, - device_class, mode, rule_type, err); - } - - int add_indep_multi_osd_per_failure_domain_rule( - std::string name, std::string root_name, std::string failure_domain_type, - int osds_per_failure_domain, - int num_failure_domains, - std::string device_class, - std::ostream *err = 0); /** * @param rno rule[set] id to use, -1 to pick the lowest available */ int add_simple_rule_at( std::string name, std::string root_name, - std::string failure_domain_type, - int num_failure_domains, - std::string device_class, std::string mode, + std::string failure_domain_type, std::string device_class, std::string mode, int rule_type, int rno, std::ostream *err = 0); - int add_simple_rule_at( - std::string name, std::string root_name, - std::string failure_domain_type, - std::string device_class, std::string mode, - int rule_type, int rno, std::ostream *err = 0) { - return add_simple_rule_at( - name, root_name, failure_domain_type, -1, - device_class, mode, rule_type, rno, err); - } - - int add_multi_osd_per_failure_domain_rule_at( - std::string name, std::string root_name, std::string failure_domain_type, - int osds_per_failure_domain, - int num_failure_domains, - std::string device_class, - crush_rule_type rule_type, - int rno, - std::ostream *err = 0); int remove_rule(int ruleno); diff --git a/src/crush/crush.h b/src/crush/crush.h index bdcdc97eef2b9..fde2df6a8a3ec 100644 --- a/src/crush/crush.h +++ b/src/crush/crush.h @@ -65,15 +65,7 @@ enum crush_opcodes { CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12, - CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13, - - /* set choose_msr_total_tries */ - CRUSH_RULE_SET_MSR_DESCENTS = 14, - /* set choose_msr_local_collision_tries */ - CRUSH_RULE_SET_MSR_COLLISION_TRIES = 15, - - /* choose variant without FIRSTN|INDEP */ - CRUSH_RULE_CHOOSE_MSR = 16 + CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13 }; /* @@ -95,12 +87,7 @@ struct crush_rule { #define crush_rule_size(len) (sizeof(struct crush_rule) + \ (len)*sizeof(struct crush_rule_step)) -enum crush_rule_type { - CRUSH_RULE_TYPE_REPLICATED = 1, - CRUSH_RULE_TYPE_ERASURE = 3, - CRUSH_RULE_TYPE_MSR_FIRSTN = 4, - CRUSH_RULE_TYPE_MSR_INDEP = 5 -}; + /* * A bucket is a named container of other items (either devices or @@ -423,12 +410,6 @@ struct crush_map { */ __u8 chooseleaf_stable; - /*! Sets total descents for MSR rules */ - __u8 msr_descents; - - /*! Sets local collision retries for MSR rules */ - __u8 msr_collision_tries; - /*! @cond INTERNAL */ /* This value is calculated after decode or construction by the builder. It is exposed here (rather than having a diff --git a/src/crush/grammar.h b/src/crush/grammar.h index 0c9a2da7d7706..b5c356a49e17e 100644 --- a/src/crush/grammar.h +++ b/src/crush/grammar.h @@ -50,11 +50,8 @@ struct crush_grammar : public boost::spirit::grammar _step_set_choose_tries, _step_set_choose_local_tries, _step_set_choose_local_fallback_tries, - _step_set_msr_descents, - _step_set_msr_collision_tries, _step_choose, _step_chooseleaf, - _step_choose_msr, _step_emit, _step, _crushrule, @@ -94,11 +91,8 @@ struct crush_grammar : public boost::spirit::grammar boost::spirit::rule, boost::spirit::parser_tag<_step_set_chooseleaf_tries> > step_set_chooseleaf_tries; boost::spirit::rule, boost::spirit::parser_tag<_step_set_chooseleaf_vary_r> > step_set_chooseleaf_vary_r; boost::spirit::rule, boost::spirit::parser_tag<_step_set_chooseleaf_stable> > step_set_chooseleaf_stable; - boost::spirit::rule, boost::spirit::parser_tag<_step_set_msr_descents> > step_set_msr_descents; - boost::spirit::rule, boost::spirit::parser_tag<_step_set_msr_collision_tries> > step_set_msr_collision_tries; boost::spirit::rule, boost::spirit::parser_tag<_step_choose> > step_choose; boost::spirit::rule, boost::spirit::parser_tag<_step_chooseleaf> > step_chooseleaf; - boost::spirit::rule, boost::spirit::parser_tag<_step_choose_msr> > step_choose_msr; boost::spirit::rule, boost::spirit::parser_tag<_step_emit> > step_emit; boost::spirit::rule, boost::spirit::parser_tag<_step> > step; boost::spirit::rule, boost::spirit::parser_tag<_crushrule> > crushrule; @@ -155,8 +149,6 @@ struct crush_grammar : public boost::spirit::grammar step_set_chooseleaf_tries = str_p("set_chooseleaf_tries") >> posint; step_set_chooseleaf_vary_r = str_p("set_chooseleaf_vary_r") >> posint; step_set_chooseleaf_stable = str_p("set_chooseleaf_stable") >> posint; - step_set_msr_descents = str_p("set_msr_descents") >> posint; - step_set_msr_collision_tries = str_p("set_msr_collision_tries") >> posint; step_choose = str_p("choose") >> ( str_p("indep") | str_p("firstn") ) >> integer @@ -165,9 +157,6 @@ struct crush_grammar : public boost::spirit::grammar >> ( str_p("indep") | str_p("firstn") ) >> integer >> str_p("type") >> name; - step_choose_msr = str_p("choosemsr") - >> integer - >> str_p("type") >> name; step_emit = str_p("emit"); step = str_p("step") >> ( step_take | step_set_choose_tries | @@ -176,15 +165,12 @@ struct crush_grammar : public boost::spirit::grammar step_set_chooseleaf_tries | step_set_chooseleaf_vary_r | step_set_chooseleaf_stable | - step_set_msr_descents | - step_set_msr_collision_tries | step_choose | step_chooseleaf | - step_choose_msr | step_emit ); crushrule = str_p("rule") >> !name >> '{' >> (str_p("id") | str_p("ruleset")) >> posint - >> str_p("type") >> ( str_p("replicated") | str_p("erasure") | str_p("msr_firstn") | str_p("msr_indep") ) + >> str_p("type") >> ( str_p("replicated") | str_p("erasure") ) >> !(str_p("min_size") >> posint) >> !(str_p("max_size") >> posint) >> +step diff --git a/src/crush/mapper.c b/src/crush/mapper.c index afeaffc5a8d83..736cc6162c974 100644 --- a/src/crush/mapper.c +++ b/src/crush/mapper.c @@ -27,9 +27,6 @@ #define dprintk(args...) /* printf(args) */ -#define MIN(x, y) ((x) > (y) ? (y) : (x)) -#define MAX(y, x) ((x) < (y) ? (y) : (x)) - /* * Implement the core CRUSH mapping algorithm. */ @@ -823,11 +820,65 @@ static void crush_choose_indep(const struct crush_map *map, #endif } -static int crush_do_rule_no_retry( - const struct crush_map *map, - int ruleno, int x, int *result, int result_max, - const __u32 *weight, int weight_max, - void *cwin, const struct crush_choose_arg *choose_args) + +/* This takes a chunk of memory and sets it up to be a shiny new + working area for a CRUSH placement computation. It must be called + on any newly allocated memory before passing it in to + crush_do_rule. It may be used repeatedly after that, so long as the + map has not changed. If the map /has/ changed, you must make sure + the working size is no smaller than what was allocated and re-run + crush_init_workspace. + + If you do retain the working space between calls to crush, make it + thread-local. If you reinstitute the locking I've spent so much + time getting rid of, I will be very unhappy with you. */ + +void crush_init_workspace(const struct crush_map *m, void *v) { + /* We work by moving through the available space and setting + values and pointers as we go. + + It's a bit like Forth's use of the 'allot' word since we + set the pointer first and then reserve the space for it to + point to by incrementing the point. */ + struct crush_work *w = (struct crush_work *)v; + char *point = (char *)v; + __s32 b; + point += sizeof(struct crush_work); + w->work = (struct crush_work_bucket **)point; + point += m->max_buckets * sizeof(struct crush_work_bucket *); + for (b = 0; b < m->max_buckets; ++b) { + if (m->buckets[b] == 0) + continue; + + w->work[b] = (struct crush_work_bucket *) point; + switch (m->buckets[b]->alg) { + default: + point += sizeof(struct crush_work_bucket); + break; + } + w->work[b]->perm_x = 0; + w->work[b]->perm_n = 0; + w->work[b]->perm = (__u32 *)point; + point += m->buckets[b]->size * sizeof(__u32); + } + BUG_ON((char *)point - (char *)w != m->working_size); +} + +/** + * crush_do_rule - calculate a mapping with the given input and rule + * @map: the crush_map + * @ruleno: the rule id + * @x: hash input + * @result: pointer to result vector + * @result_max: maximum result size + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector + * @cwin: Pointer to at least map->working_size bytes of memory or NULL. + */ +int crush_do_rule(const struct crush_map *map, + int ruleno, int x, int *result, int result_max, + const __u32 *weight, int weight_max, + void *cwin, const struct crush_choose_arg *choose_args) { int result_len; struct crush_work *cw = cwin; @@ -1030,1006 +1081,3 @@ static int crush_do_rule_no_retry( return result_len; } - -/// invariant through crush_msr_do_rule invocation -struct crush_msr_input { - const struct crush_map *map; - const struct crush_rule *rule; - - const unsigned result_max; - - const unsigned weight_len; - const __u32 *weights; - - const int map_input; - const struct crush_choose_arg *choose_args; - - const unsigned msr_descents; - const unsigned msr_collision_tries; -}; - -/// encapsulates work space, invariant within an EMIT block -struct crush_msr_workspace { - const unsigned start_stepno; - const unsigned end_stepno; - - const unsigned result_len; - - const struct crush_work *crush_work; - - // int[end_stepno - start_stepno][result_len] - int **step_vecs; -}; - -/// encapsulates output space, invariant through crush_msr_do_rule invocation -struct crush_msr_output { - const unsigned result_len; - unsigned returned_so_far; - int *out; -}; - -/** - * crush_msr_scan_config_steps - * - * Scans possibly empty sequence of CRUSH_RULE_SET_CHOOSE_MSR_*_TRIES - * steps at the start of the rule. Returns index of next step. - * Populates *msr_descents and *msr_collision_tries (if non-null) with - * last matching rule. - * @steps: steps to scan - * @step_len: length of steps - * @msr_descents: out param for CRUSH_RULE_SET_MSR_DESCENTS - * @msr_collision_tries: out param for CRUSH_RULE_SET_MSR_COLLISION_TRIES - */ -static unsigned crush_msr_scan_config_steps( - const struct crush_rule_step *steps, - unsigned step_len, - unsigned *msr_descents, - unsigned *msr_collision_tries) { - unsigned stepno = 0; - for (; stepno < step_len; ++stepno) { - const struct crush_rule_step *step = &steps[stepno]; - switch (step->op) { - case CRUSH_RULE_SET_MSR_DESCENTS: - if (msr_descents) *msr_descents = step->arg1; - break; - case CRUSH_RULE_SET_MSR_COLLISION_TRIES: - if (msr_collision_tries) *msr_collision_tries = step->arg1; - break; - default: - return stepno; - } - } - return stepno; -} - -/// clear workspace represented by *ws -static void crush_msr_clear_workspace( - struct crush_msr_workspace *ws) -{ - for (unsigned stepno = ws->start_stepno; stepno < ws->end_stepno; - ++stepno) { - for (unsigned i = 0; i < ws->result_len; ++i) { - ws->step_vecs[stepno - ws->start_stepno][i] = - CRUSH_ITEM_UNDEF; - } - } -} - -/** - * crush_msr_scan_next - * - * Validates an EMIT block of the form (TAKE CHOOSE_MSR* EMIT) - * If sequence is valid, populates total_children with the width - * of the mapping from the choose steps and next_emit with the - * index of the next EMIT step. - * - * @rule: rule to scan - * @result_max: max number of results to return - * @max_steps: length of longest string of choosemsr steps - * @return 0 if valid, -1 if there were validation errors - */ -static int crush_msr_scan_next( - const struct crush_rule *rule, - unsigned result_max, - unsigned stepno, - unsigned *total_children, - unsigned *next_emit) -{ - if (stepno + 1 >= rule->len) { - dprintk("stepno too large\n"); - return -1; - } - if (rule->steps[stepno].op != CRUSH_RULE_TAKE) { - dprintk("first rule not CRUSH_RULE_TAKE\n"); - return -1; - } - ++stepno; - - if (total_children) *total_children = 1; - for (; stepno < rule->len; ++stepno) { - const struct crush_rule_step *curstep = - &(rule->steps[stepno]); - if (curstep->op == CRUSH_RULE_EMIT) { - break; - } - if (rule->steps[stepno].op != CRUSH_RULE_CHOOSE_MSR) { - dprintk("found non-choose non-emit step %d\n", stepno); - return -1; - } - if (total_children) { - *total_children *= curstep->arg1 ? curstep->arg1 - : result_max; - } - } - if (stepno >= rule->len) { - dprintk("did not find emit\n"); - return -1; - } - if (next_emit) { - *next_emit = stepno; - } - return 0; -} - -/** - * crush_msr_scan_rule - * - * MSR rules must have the form: - * 1) Possibly empty sequence of CRUSH_RULE_SET_CHOOSE_MSR_.*_TRIES steps - * 2) A sequence of EMIT blocks of the form - * (TAKE CHOOSE_MSR* EMIT)* - * - * crush_msr_scan_rule validates that the form obeys the above form and - * popualtes max_steps with the length of the longest sequence of CHOOSE_MSR - * steps. - * - * crush_msr_scan_rule replicates the scan behavior of crush_msr_do_rule. - * - * @rule: rule to scan - * @result_max: max number of results to return - * @max_steps: length of longest string of choosemsr steps - * @return 0 if valid, -1 otherwise - */ -static int crush_msr_scan_rule( - const struct crush_rule *rule, - unsigned result_max, - unsigned *max_steps) -{ - if (max_steps) *max_steps = 0; - unsigned next_stepno = crush_msr_scan_config_steps( - rule->steps, - rule->len, - NULL, NULL); - while (next_stepno < rule->len) { - unsigned next_emit_stepno; - int r = crush_msr_scan_next( - rule, result_max, next_stepno, - NULL, &next_emit_stepno); - if (r < 0) return r; - - if (max_steps) { - *max_steps = MAX( - *max_steps, - next_emit_stepno - (next_stepno + 1)); - } - next_stepno = next_emit_stepno + 1; - } - return 0; -} - -/// Returns true if all leaf slots in [start, end) are mapped -static int crush_msr_leaf_vec_populated( - const struct crush_msr_workspace *workspace, - const unsigned start, const unsigned end) -{ - BUG_ON(start >= end); - BUG_ON(end > workspace->result_len); - BUG_ON(workspace->end_stepno <= workspace->start_stepno); - // we check the last step vector here because output - // won't be ordered by index for FIRSTN rules - int *leaf_vec = workspace->step_vecs[ - workspace->end_stepno - workspace->start_stepno - 1]; - for (unsigned i = start; i < end; ++i) { - if (leaf_vec[i] == CRUSH_ITEM_UNDEF) { - return 0; - } - } - return 1; -} - -/// Returns try value to pass to crush based on index, tries, and local_tries -static unsigned crush_msr_get_retry_value( - const unsigned result_max, - const unsigned index, - const unsigned msr_descents, - const unsigned msr_collision_tries) -{ - const unsigned total_index = (msr_descents * result_max) + index; - return (total_index << 16) + msr_collision_tries; -} - -/** - * crush_msr_descend - * - * Descend recursively from bucket until we either hit a leaf or an - * interior node of type type. - * @input: crush input information - * @workspace: struct with working space - * @bucket: bucket from which to descend - * @type: target node type - * @tryno: top level try number, incremented with each call into crush_msr_choose - * from crush_msr_do_rule - * @local_tryno: local collision try number, incremented with each call into - * crush_msr_descend from crush_msr_choose after collision - * @index: mapping index - */ -static int crush_msr_descend( - const struct crush_msr_input *input, - const struct crush_msr_workspace *workspace, - const struct crush_bucket *bucket, - const int type, - const unsigned tryno, - const unsigned local_tryno, - const unsigned index) -{ - dprintk(" crush_msr_descend type %d tryno %d local_tryno %d index %d\n", - type, tryno, local_tryno, index); - while (1) { - const int child_bucket_candidate = crush_bucket_choose( - bucket, - workspace->crush_work->work[-1 - bucket->id], - input->map_input, - crush_msr_get_retry_value( - input->result_max, - index, tryno, local_tryno), - (input->choose_args ? - &(input->choose_args[-1 - bucket->id]) : 0), - index); - - if (child_bucket_candidate >= 0) { - return child_bucket_candidate; - } - - bucket = input->map->buckets[-1 - child_bucket_candidate]; - if (bucket->type == type) { - return child_bucket_candidate; - } - } -} - -/** - * crush_msr_valid_candidate - * - * Checks whether candidate is a valid choice given buckets already - * mapped for step stepno. - * - * If candidate has already been mapped for a position in - * [include_start, include_end), candidate is valid. - * - * Else, if candidate has already been mapped for a position in - * [exclude_start, exclude_end), candidate is invalid. - * - * Otherwise, candidate is valid. - * - * @stepno: step to check - * @exclude_start: start of exclusion range - * @exclude_end: end of exlusion range - * @include_start: start of inclusion range - * @include_end: end of inclusion range - * @candidate: bucket to check - * - * Note, [exclude_start, exclude_end) must contain [include_start, include_end). - */ -static int crush_msr_valid_candidate( - const struct crush_msr_workspace *workspace, - unsigned stepno, - unsigned exclude_start, - unsigned exclude_end, - unsigned include_start, - unsigned include_end, - int candidate) -{ - BUG_ON(stepno >= workspace->end_stepno); - BUG_ON(stepno < workspace->start_stepno); - - BUG_ON(exclude_end <= exclude_start); - BUG_ON(include_end <= include_start); - - BUG_ON(exclude_start > include_start); - BUG_ON(exclude_end < include_end); - - BUG_ON(exclude_end > workspace->result_len); - - int *vec = workspace->step_vecs[stepno - workspace->start_stepno]; - for (unsigned i = exclude_start; i < exclude_end; ++i) { - if (vec[i] == candidate) { - if (i >= include_start && i < include_end) { - dprintk(" crush_msr_valid_candidate: " - "candidate %d already chosen for " - "stride\n", - candidate); - return 1; - } else { - dprintk(" crush_msr_valid_candidate: " - "candidate %d collision\n", - candidate); - return 0; - } - } - } - dprintk(" crush_msr_valid_candidate: candidate %d no collision\n", - candidate); - return 1; -} - -/** - * crush_msr_push_used - * - * See crush_msr_choose for details, used to push bucket indicies onto collision - * set for specified stride. User is responsible for ensuring that - * [stride_start, stride_end) never holds more than stride_end - stride_start - * entries. - * @workspace: holds working space information - * @stepno: index of step - * @stride_start: start of stride - * @stride_end: one past end of stride - * @candidate: element to add to set - * @return 1 if added (not already present), 0 if not added due to already - * being present - */ -static int crush_msr_push_used( - const struct crush_msr_workspace *workspace, - unsigned stepno, - unsigned stride_start, - unsigned stride_end, - int candidate) -{ - BUG_ON(stepno >= workspace->end_stepno); - BUG_ON(stepno < workspace->start_stepno); - - BUG_ON(stride_end <= stride_start); - BUG_ON(stride_end > workspace->result_len); - int *vec = workspace->step_vecs[stepno - workspace->start_stepno]; - for (unsigned i = stride_start; i < stride_end; ++i) { - if (vec[i] == candidate) { - return 0; - } else if (vec[i] == CRUSH_ITEM_UNDEF) { - vec[i] = candidate; - return 1; - } - } - BUG_ON("impossible"); - return 0; -} - -/** - * crush_msr_push_used - * - * See crush_msr_choose for details, used to pop bucket indicies from collision - * set for specified stride. If an element is to be popped, crush_msr_pop_used - * must be called prior to pushing another element. - * @workspace: holds working space information - * @stepno: index of step - * @stride_start: start of stride - * @stride_end: one past end of stride - * @candidate: element to pop from set - */ -static void crush_msr_pop_used( - const struct crush_msr_workspace *workspace, - unsigned stepno, - unsigned stride_start, - unsigned stride_end, - int candidate) -{ - BUG_ON(stepno >= workspace->end_stepno); - BUG_ON(stepno < workspace->start_stepno); - - BUG_ON(stride_end <= stride_start); - BUG_ON(stride_end > workspace->result_len); - int *vec = workspace->step_vecs[stepno - workspace->start_stepno]; - for (unsigned i = stride_end; i > stride_start;) { - --i; - if (vec[i] != CRUSH_ITEM_UNDEF) { - BUG_ON(vec[i] != candidate); - vec[i] = CRUSH_ITEM_UNDEF; - return; - } - } - BUG_ON(0 == "impossible"); -} - -/** - * crush_msr_emit_result - * - * Outputs mapping result from specified position. Position in output - * buffer depends on rule type -- FIRSTN outputs in output order, INDEP - * outputs into specified position. - * @output: output buffer - * @rule_type: CRUSH_RULE_TYPE_MSR_FIRSTN or CRUSH_RULE_TYPE_MSR_INDEP - * @position: mapping position - * @result: mapping value to output - */ -static void crush_msr_emit_result( - struct crush_msr_output *output, - int rule_type, - unsigned position, - int result) -{ - BUG_ON(position >= output->result_len); - BUG_ON(output->returned_so_far >= output->result_len); - if (rule_type == CRUSH_RULE_TYPE_MSR_FIRSTN) { - BUG_ON(output->out[output->returned_so_far] != CRUSH_ITEM_NONE); - output->out[(output->returned_so_far)++] = result; - } else { - BUG_ON(output->out[position] != CRUSH_ITEM_NONE); - output->out[position] = result; - ++output->returned_so_far; - } - dprintk(" emit: %d, returned_so_far: %d\n", - result, output->returned_so_far); -} - -/** - * crush_msr_choose - * - * Performs mapping for a single EMIT block containing CHOOSE steps - * [current_stepno, end_stepno) into mapping indices [start_index, end_index). - * - * Like chooseleaf, crush_msr_choose is essentially depth-first -- it chooses - * an item and all of the descendents under that item before moving to the - * next item. Each choose step in the block gets its own workspace for - * collision detection. - * - * crush_msr_choose (and its recursive calls) will locally retry any bucket - * selections that produce a collision (up to msr_collision_tries times), but - * won't retry if it hits an out osd -- that's handled by calling back into - * crush_msr_choose up to msr_descents times. - * - * @input: crush input information - * @workspace: working space for this EMIT block - * @output: crush mapping output buffer specification - * @total_children: total number of children implied by the step sequence, may - * be larger than end_index - start_index. - * @start_index: start mapping index - * @end_index: end mapping index - * @current_stepno: first choose step - * @end_stepno: one past last choose step, must be an EMIT - * @tryno: try number, see crush_msr_do_rule - */ -static unsigned crush_msr_choose( - const struct crush_msr_input *input, - const struct crush_msr_workspace *workspace, - struct crush_msr_output *output, - const struct crush_bucket *bucket, - const unsigned total_descendants, - const unsigned start_index, const unsigned end_index, - const unsigned current_stepno, const unsigned end_stepno, - const unsigned tryno) -{ - dprintk("crush_msr_choose: bucket %d, start_index %d, end_index %d\n", - bucket->id, start_index, end_index); - - BUG_ON(current_stepno >= input->rule->len); - const struct crush_rule_step *curstep = - &(input->rule->steps[current_stepno]); - BUG_ON(curstep->op != CRUSH_RULE_CHOOSE_MSR); - - /* This call into crush_msr_choose is responsible, ultimately, for - * populating indices [start_index, end_index). We do this by - * dividing that range into a set of strides specified in the - * step -- choosemsr 4 host would dictate that the range be divided - * into 4 strides. - * - * If the full rule is - * - * ... - * step take root - * step choosemsr 4 host - * step choosemsr 2 osd - * step emit - * - * total_descendants for the initial call would be 8 (4*2) with - * num_stride=4 (4 hosts) and stride_length = 2 (2 osds per host). - * For the recursive calls, total_descendants would be 2 (8 / 4), - * stride_length would be 1 and num_strides would be 2. - */ - - // choosemsr 0 host should select result_max hosts - const unsigned num_strides = curstep->arg1 ? curstep->arg1 - : input->result_max; - - // total_descendants is the product of the steps in the block - BUG_ON(total_descendants % num_strides != 0); - const unsigned stride_length = total_descendants / num_strides; - - /* MSR steps like - * - * step choosemsr 4 host - * - * guarantee that the output mapping will be divided into at least - * 4 hosts, not exactly 4 hosts. We achieve this by ensuring that - * the sets of hosts for each stride are disjoint -- a host selected - * for stride 0 will not be used for any other stride. - * - * However, a single stride might end up using more than one host. - * If an OSD on a host is marked out, crush_msr_choose will simply - * skip that index when it hits it. crush_msr_do_rule will then - * call back into crush_msr_choose and eventually find another OSD - * either on the same host or on another one not already used in - * another stride. For this reason, a single stride may need to - * remember up to stride_length entries for collision detection - * purposes. - * - * Unfortunately, we only have stride_length entries to work with - * in workspace. Thus, prior to returning from crush_msr_choose, - * we remove entries that didn't actually result in a mapping. We - * use the following undo vector to achieve this -- any strides that - * didn't result in a successful mapping are set in undo to be undone - * immediately prior to returning. - * - * Why prior to returning and not immediately? Selecting a bucket in - * a stride impacts subsequent choices as they may have collided. In - * order to limit the impact of marking an OSD out, we treat it as - * collidable until the next pass. - */ - int undo[num_strides]; - for (unsigned stride = 0; stride < num_strides; ++stride) { - undo[stride] = CRUSH_ITEM_UNDEF; - } - - dprintk("crush_msr_choose: bucket %d, start_index %d, " - "end_index %d, stride_length %d\n", - bucket->id, start_index, end_index, stride_length); - - unsigned mapped = 0; - unsigned stride_index = 0; - for (unsigned stride_start = start_index; - stride_start < end_index; - stride_start += stride_length, ++stride_index) { - const unsigned stride_end = - MIN(stride_start + stride_length, end_index); - - // all descendants for this stride have been mapped already - if (crush_msr_leaf_vec_populated( - workspace, stride_start, stride_end)) { - continue; - } - - int found = 0; - int child_bucket_candidate; - for (unsigned local_tryno = 0; - local_tryno <= input->msr_collision_tries; - ++local_tryno) { - child_bucket_candidate = crush_msr_descend( - input, workspace, bucket, - curstep->arg2, tryno, local_tryno, - stride_index); - - /* candidate is valid if: - * - we already chose it for this stride - * - it hasn't been chosen for any stride */ - if (crush_msr_valid_candidate( - workspace, - current_stepno, - // Collision on elements in [start_index, end_index) - start_index, end_index, - // ...unless in [stride_start, stride_end) - stride_start, stride_end, - child_bucket_candidate)) { - found = 1; - break; - } - } - - /* failed to find non-colliding choice after msr_collision_tries - * attempts */ - if (!found) continue; - - if (curstep->arg2 == 0 /* leaf */) { - if (stride_length != 1 || - (current_stepno + 1 != end_stepno)) { - /* Either condition above implies that there's - * another step after a choosemsr step for the - * leaf type, rule is malformed, bail */ - continue; - } - if (is_out(input->map, input->weights, - input->weight_len, - child_bucket_candidate, input->map_input)) { - dprintk(" crush_msr_choose: item %d out\n", - child_bucket_candidate); - /* crush_msr_do_rule will try again, - * msr_descents permitting */ - continue; - } - // for collision detection - int pushed = crush_msr_push_used( - workspace, current_stepno, stride_start, stride_end, - child_bucket_candidate); - /* stride_length == 1, can't already be there */ - BUG_ON(!pushed); - // final output, ordering depending on input->rule->type - crush_msr_emit_result( - output, input->rule->type, - stride_start, child_bucket_candidate); - mapped++; - } else /* not leaf */ { - if (current_stepno + 1 >= end_stepno) { - /* Type isn't leaf, rule is malformed since there - * isn't another step */ - continue; - } - struct crush_bucket *child_bucket = input->map->buckets[ - -1 - child_bucket_candidate]; - unsigned child_mapped = crush_msr_choose( - input, workspace, output, - child_bucket, - // total_descendants for recursive call - stride_length, - // recursive call populates - // [stride_start, stride_end) - stride_start, stride_end, - // next step - current_stepno + 1, end_stepno, - tryno); - int pushed = crush_msr_push_used( - workspace, - current_stepno, - stride_start, - stride_end, - child_bucket_candidate); - /* pushed may be false if we already chose this bucket - * for this stride. If so, child_mapped must have been - * != 0 at the time, so we still retain it */ - if (pushed && (child_mapped == 0)) { - // no child mapped, and we didn't choose it - // before - undo[stride_index] = child_bucket_candidate; - } else { - mapped += child_mapped; - } - } - } - - // pop unused buckets - stride_index = 0; - for (unsigned stride_start = start_index; - stride_start < end_index; - stride_start += stride_length, ++stride_index) { - if (undo[stride_index] != CRUSH_ITEM_UNDEF) { - unsigned stride_end = - MIN(stride_start + stride_length, end_index); - crush_msr_pop_used( - workspace, - current_stepno, - stride_start, - stride_end, - undo[stride_index]); - } - } - - return mapped; -} - -/** - * crush_msr_do_rule - calculate a mapping with the given input and msr rule - * - * msr_firstn and msr_indep rules are intended to address a limitation of - * conventional crush rules in that they do not retry steps outside of - * a CHOOSELEAF step. In the case of a crush rule like - * - * rule replicated_rule_1 { - * ... - * step take default class hdd - * step chooseleaf firstn 3 type host - * step emit - * } - * - * the chooseleaf step will ensure that if all of the osds on a - * particular host are marked out, mappings including those OSDs would - * end up on another host (provided that there are enough hosts). - * - * However, if the rule used two choose steps instead - * - * rule replicated_rule_1 { - * ... - * step take default class hdd - * step choose firstn 3 type host - * step choose firstn 1 type osd - * step emit - * } - * - * marking an OSD down could cause it to be remapped to another on the same - * host, but not to another host. If all of the OSDs on a host are marked - * down, the PGs will simply be degraded and unable to remap until the host - * is removed from the CRUSH heirarchy or reweighted to 0. - * - * Normally, we can comfortably work around this by using a chooseleaf - * step as in the first example, but there are cases where we want to map - * multiple OSDs to each host (wide EC codes on small clusters, for - * example) which can't be handled with chooseleaf as it currently - * exists. - * - * rule ecpool-86 { - * type msr_indep - * ... - * step choosemsr 4 type host - * step choosemsr 4 type osd - * step emit - * } - * - * With an 8+6 code, this rule can tolerate a host and a single OSD down without - * becoming unavailable on 4 hosts. It relies on ensuring that no more than 4 - * OSDs are mapped to any single host, however, which can't be done with a - * conventional CRUSH rule without the drawback described above. By using - * msr_indep, this rule can deal with an OSD failure by remapping to another - * host. - * - * MSR rules have some structural differences from conventional rules: - * - The rule type determines whether the mapping is FIRSTN or INDEP. Because - * the descent can retry steps, it doesn't really make sense for steps to - * individually specify output order and I'm not really aware of any use cases - * that would benefit from it. - * - MSR rules *must* be structured as a (possibly empty) prefix of config - * steps (CRUSH_RULE_SET_CHOOSE_MSR*) followed by a sequence of EMIT blocks - * each comprised of a TAKE step, a sequence of CHOOSE_MSR steps, and - * ended by an EMIT step. - * - MSR choose steps must be choosemsr. choose and chooseleaf are not permitted. - * - * MSR rules also have different requirements for working space. Conventional CRUSH - * requires 3 vectors of size result_max to use for working space -- two to alternate - * as it processes each rule and one, additionally, for chooseleaf. MSR rules - * need N vectors where N is the number of choosemsr in the longest EMIT block since - * it needs to retain all of the choices made as part of each descent. - * - * See crush_msr_choose for details. - * - * @map: the crush_map - * @ruleno: the rule id - * @x: hash input - * @result: pointer to result vector - * @result_max: maximum result size - * @weight: weight vector (for map leaves) - * @weight_max: size of weight vector - * @cwin: Pointer to at least map->working_size bytes of memory or NULL. - */ -static int crush_msr_do_rule( - const struct crush_map *map, - int ruleno, int map_input, int *result, int result_max, - const __u32 *weight, int weight_max, - void *cwin, const struct crush_choose_arg *choose_args) -{ - unsigned msr_descents = map->msr_descents; - unsigned msr_collision_tries = map->msr_collision_tries; - struct crush_rule *rule = map->rules[ruleno]; - unsigned start_stepno = crush_msr_scan_config_steps( - rule->steps, rule->len, - &msr_descents, &msr_collision_tries); - - struct crush_msr_input input = { - .map = map, - .rule = map->rules[ruleno], - .result_max = result_max, - .weight_len = weight_max, - .weights = weight, - .map_input = map_input, - .choose_args = choose_args, - .msr_descents = msr_descents, - .msr_collision_tries = msr_collision_tries - }; - - struct crush_msr_output output = { - .result_len = result_max, - .returned_so_far = 0, - .out = result - }; - for (unsigned i = 0; i < output.result_len; ++i) { - output.out[i] = CRUSH_ITEM_NONE; - } - - unsigned start_index = 0; - while (start_stepno < input.rule->len) { - unsigned emit_stepno, total_children; - if (crush_msr_scan_next( - input.rule, input.result_max, - start_stepno, &total_children, - &emit_stepno) != 0) { - // invalid rule, return whatever we have - dprintk("crush_msr_scan_returned -1\n"); - return 0; - } - - const struct crush_rule_step *take_step = - &(input.rule->steps[start_stepno]); - BUG_ON(take_step->op != CRUSH_RULE_TAKE); - - if (take_step->arg1 >= 0) { - if (start_stepno + 1 != emit_stepno) { - // invalid rule - dprintk("take step specifies osd, but " - "there are subsequent choose steps\n"); - return 0; - } else { - crush_msr_emit_result( - &output, input.rule->type, - start_index, take_step->arg1); - } - } else { - dprintk("start_stepno %d\n", start_stepno); - dprintk("root bucket: %d\n", - input.rule->steps[start_stepno].arg1); - struct crush_bucket *root_bucket = input.map->buckets[ - -1 - input.rule->steps[start_stepno].arg1]; - dprintk( - "root bucket: %d %p\n", - input.rule->steps[start_stepno].arg1, - root_bucket); - - ++start_stepno; - BUG_ON(emit_stepno >= input.rule->len); - BUG_ON(emit_stepno < start_stepno); - BUG_ON(start_stepno >= input.rule->len); - - struct crush_work *cw = cwin; - int *out_vecs[input.rule->len]; - for (unsigned stepno = 0; stepno < input.rule->len; ++stepno) { - out_vecs[stepno] = (int*)((char*)cw + map->working_size) + - (stepno * result_max); - } - struct crush_msr_workspace workspace = { - .start_stepno = start_stepno, - .end_stepno = emit_stepno, - .result_len = result_max, - .crush_work = cw, - .step_vecs = out_vecs - }; - crush_msr_clear_workspace(&workspace); - - - unsigned tries_so_far = 0; - unsigned end_index = MIN(start_index + total_children, - input.result_max); - while (tries_so_far <= input.msr_descents && - output.returned_so_far < input.result_max) { - crush_msr_choose( - &input, &workspace, &output, - root_bucket, - total_children, - start_index, - end_index, - start_stepno, emit_stepno, - tries_so_far); - dprintk("returned_so_far: %d\n", - output.returned_so_far); - ++tries_so_far; - } - start_index = end_index; - start_stepno = emit_stepno + 1; - } - } - - if (rule->type == CRUSH_RULE_TYPE_MSR_FIRSTN) { - return output.returned_so_far; - } else { - return input.result_max; - } -} - -/// Return 1 if msr, 0 otherwise -static int rule_type_is_msr(int type) -{ - return type == CRUSH_RULE_TYPE_MSR_FIRSTN || - type == CRUSH_RULE_TYPE_MSR_INDEP; -} - -size_t crush_work_size(const struct crush_map *map, - int result_max) -{ - unsigned ruleno; - unsigned out_vecs = 3; /* normal do_rule needs 3 outvecs */ - for (ruleno = 0; ruleno < map->max_rules; ++ruleno) { - const struct crush_rule *rule = map->rules[ruleno]; - if (!rule) continue; - if (!rule_type_is_msr(rule->type)) - continue; - unsigned rule_max_msr_steps; - // we ignore the return value because rule_max_msr_steps will be - // populated with the longest step sequence before hitting - // the error - crush_msr_scan_rule(rule, result_max, &rule_max_msr_steps); - out_vecs = MAX(rule_max_msr_steps, out_vecs); - } - return map->working_size + result_max * out_vecs * sizeof(__u32); -} - -/* This takes a chunk of memory and sets it up to be a shiny new - working area for a CRUSH placement computation. It must be called - on any newly allocated memory before passing it in to - crush_do_rule. It may be used repeatedly after that, so long as the - map has not changed. If the map /has/ changed, you must make sure - the working size is no smaller than what was allocated and re-run - crush_init_workspace. - - If you do retain the working space between calls to crush, make it - thread-local. If you reinstitute the locking I've spent so much - time getting rid of, I will be very unhappy with you. */ - -void crush_init_workspace(const struct crush_map *m, void *v) { - /* We work by moving through the available space and setting - values and pointers as we go. - - It's a bit like Forth's use of the 'allot' word since we - set the pointer first and then reserve the space for it to - point to by incrementing the point. */ - struct crush_work *w = (struct crush_work *)v; - char *point = (char *)v; - __s32 b; - point += sizeof(struct crush_work); - w->work = (struct crush_work_bucket **)point; - point += m->max_buckets * sizeof(struct crush_work_bucket *); - for (b = 0; b < m->max_buckets; ++b) { - if (m->buckets[b] == 0) - continue; - - w->work[b] = (struct crush_work_bucket *) point; - switch (m->buckets[b]->alg) { - default: - point += sizeof(struct crush_work_bucket); - break; - } - w->work[b]->perm_x = 0; - w->work[b]->perm_n = 0; - w->work[b]->perm = (__u32 *)point; - point += m->buckets[b]->size * sizeof(__u32); - } - BUG_ON((char *)point - (char *)w != m->working_size); -} - -/** - * crush_do_rule - calculate a mapping with the given input and rule - * @map: the crush_map - * @ruleno: the rule id - * @x: hash input - * @result: pointer to result vector - * @result_max: maximum result size - * @weight: weight vector (for map leaves) - * @weight_max: size of weight vector - * @cwin: Pointer to at least map->working_size bytes of memory or NULL. - */ -int crush_do_rule(const struct crush_map *map, - int ruleno, int x, int *result, int result_max, - const __u32 *weight, int weight_max, - void *cwin, const struct crush_choose_arg *choose_args) -{ - const struct crush_rule *rule; - - if ((__u32)ruleno >= map->max_rules) { - dprintk(" bad ruleno %d\n", ruleno); - return 0; - } - - rule = map->rules[ruleno]; - if (rule_type_is_msr(rule->type)) { - return crush_msr_do_rule( - map, - ruleno, - x, - result, - result_max, - weight, - weight_max, - cwin, - choose_args); - } else { - return crush_do_rule_no_retry( - map, - ruleno, - x, - result, - result_max, - weight, - weight_max, - cwin, - choose_args); - } -} diff --git a/src/crush/mapper.h b/src/crush/mapper.h index 98c7bf11c0d99..0ec927d9e6162 100644 --- a/src/crush/mapper.h +++ b/src/crush/mapper.h @@ -77,11 +77,15 @@ extern int crush_do_rule(const struct crush_map *map, const __u32 *weights, int weight_max, void *cwin, const struct crush_choose_arg *choose_args); -/* Returns enough workspace for any crush rule within map to generate - result_max outputs. The caller can then allocate this much on its own, - either on the stack, in a per-thread long-lived buffer, or however it likes.*/ -extern size_t crush_work_size(const struct crush_map *map, - int result_max); +/* Returns the exact amount of workspace that will need to be used + for a given combination of crush_map and result_max. The caller can + then allocate this much on its own, either on the stack, in a + per-thread long-lived buffer, or however it likes. */ + +static inline size_t crush_work_size(const struct crush_map *map, + int result_max) { + return map->working_size + result_max * 3 * sizeof(__u32); +} extern void crush_init_workspace(const struct crush_map *m, void *v); diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc index 928d05f2adb0a..5212baee25187 100644 --- a/src/erasure-code/ErasureCode.cc +++ b/src/erasure-code/ErasureCode.cc @@ -52,12 +52,6 @@ int ErasureCode::init( err |= to_string("crush-failure-domain", profile, &rule_failure_domain, DEFAULT_RULE_FAILURE_DOMAIN, ss); - err |= to_int("crush-osds-per-failure-domain", profile, - &rule_osds_per_failure_domain, - "0", ss); - err |= to_int("crush-num-failure-domains", profile, - &rule_num_failure_domains, - "0", ss); err |= to_string("crush-device-class", profile, &rule_device_class, "", ss); @@ -72,33 +66,19 @@ int ErasureCode::create_rule( CrushWrapper &crush, std::ostream *ss) const { - if (rule_osds_per_failure_domain <= 1) { - return crush.add_simple_rule( - name, - rule_root, - rule_failure_domain, - rule_num_failure_domains, - rule_device_class, - "indep", - pg_pool_t::TYPE_ERASURE, - ss); - } else { - if (rule_num_failure_domains < 1) { - if (ss) { - *ss << "crush-num-failure-domains " << rule_num_failure_domains - << " must be >= 1 if crush-osds-per-failure-domain specified"; - return -EINVAL; - } - } - return crush.add_indep_multi_osd_per_failure_domain_rule( - name, - rule_root, - rule_failure_domain, - rule_num_failure_domains, - rule_osds_per_failure_domain, - rule_device_class, - ss); - } + int ruleid = crush.add_simple_rule( + name, + rule_root, + rule_failure_domain, + rule_device_class, + "indep", + pg_pool_t::TYPE_ERASURE, + ss); + + if (ruleid < 0) + return ruleid; + + return ruleid; } int ErasureCode::sanity_check_k_m(int k, int m, ostream *ss) diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h index fd6d1a41f714d..c246d5dc6b67d 100644 --- a/src/erasure-code/ErasureCode.h +++ b/src/erasure-code/ErasureCode.h @@ -37,8 +37,6 @@ namespace ceph { std::string rule_root; std::string rule_failure_domain; std::string rule_device_class; - int rule_osds_per_failure_domain = -1; - int rule_num_failure_domains = -1; ~ErasureCode() override {} diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h index 23175adfa2c80..1937eeb4c6987 100644 --- a/src/include/ceph_features.h +++ b/src/include/ceph_features.h @@ -137,7 +137,7 @@ DEFINE_CEPH_FEATURE(34, 3, RANGE_BLOCKLIST) DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL) // 3.14 DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2) // 3.14 DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER) // 3.14 -DEFINE_CEPH_FEATURE(38, 2, CRUSH_MSR) // X.XX TODOSAM kernel version? +DEFINE_CEPH_FEATURE_RETIRED(38, 1, OSD_ERASURE_CODES, MIMIC, OCTOPUS) // available DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC) // 3.15 DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA) // 3.19 @@ -218,7 +218,6 @@ DEFINE_CEPH_FEATURE_RETIRED(63, 1, RESERVED_BROKEN, LUMINOUS, QUINCY) // client- CEPH_FEATURE_OSD_CACHEPOOL | \ CEPH_FEATURE_CRUSH_V2 | \ CEPH_FEATURE_EXPORT_PEER | \ - CEPH_FEATURE_CRUSH_MSR | \ CEPH_FEATURE_OSDMAP_ENC | \ CEPH_FEATURE_MDS_INLINE_DATA | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \ @@ -266,10 +265,9 @@ DEFINE_CEPH_FEATURE_RETIRED(63, 1, RESERVED_BROKEN, LUMINOUS, QUINCY) // client- CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_CRUSH_TUNABLES5 | \ - CEPH_FEATURE_CRUSH_MSR | \ CEPH_FEATURE_CRUSH_V2 | \ CEPH_FEATURE_CRUSH_V4 | \ - CEPH_FEATUREMASK_CRUSH_MSR) + CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS) /* * make sure we don't try to use the reserved features diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 696d7f3185b36..f8e379326f25f 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -7562,12 +7562,6 @@ bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush, << newmap.require_min_compat_client; return false; } - if (mv > newmap.require_osd_release) { - ss << "new crush map requires client version " << mv - << " but require_osd_release is " - << newmap.require_osd_release; - return false; - } } // osd compat @@ -8078,7 +8072,7 @@ int OSDMonitor::prepare_new_pool(string& name, return r; } - if (!osdmap.crush->rule_valid_for_pool_type(crush_rule, pool_type)) { + if (osdmap.crush->get_rule_type(crush_rule) != (int)pool_type) { *ss << "crush rule " << crush_rule << " type does not match pool"; return -EINVAL; } @@ -8350,7 +8344,7 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap, return -EPERM; } } - if (!osdmap.crush->rule_valid_for_pool_type(p.get_crush_rule(), p.type)) { + if (osdmap.crush->get_rule_type(p.get_crush_rule()) != (int)p.type) { ss << "crush rule " << p.get_crush_rule() << " type does not match pool"; return -EINVAL; } @@ -8583,7 +8577,7 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap, ss << cpp_strerror(id); return -ENOENT; } - if (!osdmap.crush->rule_valid_for_pool_type(id, p.get_type())) { + if (osdmap.crush->get_rule_type(id) != (int)p.get_type()) { ss << "crush rule " << id << " type does not match pool"; return -EINVAL; } diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 76552333dfff0..5773695b77ad3 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -1764,10 +1764,9 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const features |= CEPH_FEATURE_CRUSH_V4; if (crush->has_nondefault_tunables5()) features |= CEPH_FEATURE_CRUSH_TUNABLES5; - if (crush->has_incompat_choose_args()) + if (crush->has_incompat_choose_args()) { features |= CEPH_FEATUREMASK_CRUSH_CHOOSE_ARGS; - if (crush->has_nondefault_tunables_msr()) - features |= CEPH_FEATURE_CRUSH_MSR; + } mask |= CEPH_FEATURES_CRUSH; if (!pg_upmap.empty() || !pg_upmap_items.empty() || !pg_upmap_primaries.empty()) @@ -1790,8 +1789,6 @@ uint64_t OSDMap::get_features(int entity_type, uint64_t *pmask) const features |= CEPH_FEATURE_CRUSH_TUNABLES3; if (crush->is_v5_rule(ruleid)) features |= CEPH_FEATURE_CRUSH_TUNABLES5; - if (crush->is_msr_rule(ruleid)) - features |= CEPH_FEATURE_CRUSH_MSR; } } mask |= CEPH_FEATURE_OSDHASHPSPOOL | CEPH_FEATURE_OSD_CACHEPOOL; @@ -1846,9 +1843,6 @@ ceph_release_t OSDMap::get_min_compat_client() const { uint64_t f = get_features(CEPH_ENTITY_TYPE_CLIENT, nullptr); - if (HAVE_FEATURE(f, CRUSH_MSR)) { // TODOSAM -- add version right before merge - return ceph_release_t::squid; // v19.2.0 - } if (HAVE_FEATURE(f, OSDMAP_PG_UPMAP) || // v12.0.0-1733-g27d6f43 HAVE_FEATURE(f, CRUSH_CHOOSE_ARGS)) { // v12.0.1-2172-gef1ef28 return ceph_release_t::luminous; // v12.2.0 @@ -4530,7 +4524,7 @@ int OSDMap::validate_crush_rules(CrushWrapper *newcrush, << " but it is not present"; return -EINVAL; } - if (!newcrush->rule_valid_for_pool_type(ruleno, pool.get_type())) { + if (newcrush->get_rule_type(ruleno) != (int)pool.get_type()) { *ss << "pool " << i.first << " type does not match rule " << ruleno; return -EINVAL; } diff --git a/src/test/cli/crushtool/choose-args.t b/src/test/cli/crushtool/choose-args.t index 99120f0f211fe..e0956ec0a754d 100644 --- a/src/test/cli/crushtool/choose-args.t +++ b/src/test/cli/crushtool/choose-args.t @@ -159,8 +159,6 @@ "chooseleaf_descend_once": 0, "chooseleaf_vary_r": 0, "chooseleaf_stable": 0, - "msr_descents": 100, - "msr_collision_tries": 100, "straw_calc_version": 0, "allowed_bucket_algs": 22, "profile": "argonaut", @@ -174,8 +172,7 @@ "has_v3_rules": 0, "has_v4_buckets": 1, "require_feature_tunables5": 0, - "has_v5_rules": 0, - "has_msr_rules": 0 + "has_v5_rules": 0 }, "choose_args": { "1": [], diff --git a/src/test/cli/osdmaptool/crush.t b/src/test/cli/osdmaptool/crush.t index 695c4e0c22d9d..520f11e50d5cf 100644 --- a/src/test/cli/osdmaptool/crush.t +++ b/src/test/cli/osdmaptool/crush.t @@ -6,7 +6,7 @@ osdmaptool: exported crush map to oc $ osdmaptool --import-crush oc myosdmap osdmaptool: osdmap file 'myosdmap' - osdmaptool: imported 499 byte crush map from oc + osdmaptool: imported 497 byte crush map from oc osdmaptool: writing epoch 3 to myosdmap $ osdmaptool --adjust-crush-weight 0:5 myosdmap osdmaptool: osdmap file 'myosdmap' diff --git a/src/test/crush/crush.cc b/src/test/crush/crush.cc index 9e2a2c99fd224..1f53084a70afb 100644 --- a/src/test/crush/crush.cc +++ b/src/test/crush/crush.cc @@ -18,11 +18,69 @@ #include "include/stringify.h" #include "crush/CrushWrapper.h" -#include "crush/CrushCompiler.h" #include "osd/osd_types.h" using namespace std; +std::unique_ptr build_indep_map(CephContext *cct, int num_rack, + int num_host, int num_osd) +{ + std::unique_ptr c(new CrushWrapper); + c->create(); + + c->set_type_name(5, "root"); + c->set_type_name(4, "row"); + c->set_type_name(3, "rack"); + c->set_type_name(2, "chasis"); + c->set_type_name(1, "host"); + c->set_type_name(0, "osd"); + + int rootno; + c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1, + 5, 0, NULL, NULL, &rootno); + c->set_item_name(rootno, "default"); + + map loc; + loc["root"] = "default"; + + int osd = 0; + for (int r=0; rinsert_item(cct, osd, 1.0, string("osd.") + stringify(osd), loc); + } + } + } + int ret; + int ruleno = 0; + ret = c->add_rule(ruleno, 4, 123); + ceph_assert(ret == ruleno); + ret = c->set_rule_step(ruleno, 0, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 10, 0); + ceph_assert(ret == 0); + ret = c->set_rule_step(ruleno, 1, CRUSH_RULE_TAKE, rootno, 0); + ceph_assert(ret == 0); + ret = c->set_rule_step(ruleno, 2, CRUSH_RULE_CHOOSELEAF_INDEP, CRUSH_CHOOSE_N, 1); + ceph_assert(ret == 0); + ret = c->set_rule_step(ruleno, 3, CRUSH_RULE_EMIT, 0, 0); + ceph_assert(ret == 0); + c->set_rule_name(ruleno, "data"); + + c->finalize(); + + if (false) { + Formatter *f = Formatter::create("json-pretty"); + f->open_object_section("crush_map"); + c->dump(f); + f->close_section(); + f->flush(cout); + delete f; + } + + return c; +} + int get_num_dups(const vector& v) { std::set s; @@ -36,21 +94,7 @@ int get_num_dups(const vector& v) return dups; } -class RuleType { - bool msr; - -public: - RuleType(bool msr) : msr(msr) {} - - bool is_msr() const { return msr; } - - friend std::ostream &operator<<(std::ostream &, RuleType); -}; -std::ostream &operator<<(std::ostream &lhs, RuleType rhs) { - return lhs << (rhs.msr ? "MSR" : "NORMAL"); -} - -class IndepTest : public ::testing::TestWithParam +class CRUSHTest : public ::testing::Test { public: void SetUp() final @@ -64,91 +108,11 @@ class IndepTest : public ::testing::TestWithParam cct->put(); cct = nullptr; } - - std::unique_ptr build_indep_map( - CephContext *cct, int num_rack, int num_host, int num_osd) - { - std::unique_ptr c(new CrushWrapper); - c->create(); - c->set_tunables_optimal(); - - c->set_type_name(5, "root"); - c->set_type_name(4, "row"); - c->set_type_name(3, "rack"); - c->set_type_name(2, "chasis"); - c->set_type_name(1, "host"); - c->set_type_name(0, "osd"); - - int rootno; - c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1, - 5, 0, NULL, NULL, &rootno); - c->set_item_name(rootno, "default"); - - map loc; - loc["root"] = "default"; - - int osd = 0; - for (int r=0; rinsert_item(cct, osd, 1.0, string("osd.") + stringify(osd), loc); - } - } - } - int ret; - int ruleno = 0; - - if (GetParam().is_msr()) { - unsigned step_id = 0; - ret = c->add_rule(ruleno, 4, CRUSH_RULE_TYPE_MSR_INDEP); - ceph_assert(ret == ruleno); - ret = c->set_rule_step(ruleno, step_id++, CRUSH_RULE_TAKE, rootno, 0); - ceph_assert(ret == 0); - ret = c->set_rule_step( - ruleno, step_id++, CRUSH_RULE_CHOOSE_MSR, CRUSH_CHOOSE_N, 1); - ceph_assert(ret == 0); - ret = c->set_rule_step(ruleno, step_id++, CRUSH_RULE_CHOOSE_MSR, 1, 0); - ceph_assert(ret == 0); - ret = c->set_rule_step(ruleno, step_id++, CRUSH_RULE_EMIT, 0, 0); - ceph_assert(ret == 0); - } else { - unsigned step_id = 0; - ret = c->add_rule(ruleno, 4, CRUSH_RULE_TYPE_ERASURE); - ceph_assert(ret == ruleno); - ret = c->set_rule_step( - ruleno, step_id++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 10, 0); - ceph_assert(ret == 0); - ret = c->set_rule_step(ruleno, step_id++, CRUSH_RULE_TAKE, rootno, 0); - ceph_assert(ret == 0); - ret = c->set_rule_step( - ruleno, step_id++, CRUSH_RULE_CHOOSELEAF_INDEP, CRUSH_CHOOSE_N, 1); - ceph_assert(ret == 0); - ret = c->set_rule_step(ruleno, step_id++, CRUSH_RULE_EMIT, 0, 0); - ceph_assert(ret == 0); - } - - c->set_rule_name(ruleno, "data"); - c->finalize(); - - if (false) { - Formatter *f = Formatter::create("json-pretty"); - f->open_object_section("crush_map"); - c->dump(f); - f->close_section(); - f->flush(cout); - delete f; - } - - return c; - } - protected: CephContext *cct = nullptr; }; -TEST_P(IndepTest, toosmall) { +TEST_F(CRUSHTest, indep_toosmall) { std::unique_ptr c(build_indep_map(cct, 1, 3, 1)); vector<__u32> weight(c->get_max_devices(), 0x10000); c->dump_tree(&cout, NULL); @@ -167,7 +131,7 @@ TEST_P(IndepTest, toosmall) { } } -TEST_P(IndepTest, basic) { +TEST_F(CRUSHTest, indep_basic) { std::unique_ptr c(build_indep_map(cct, 3, 3, 3)); vector<__u32> weight(c->get_max_devices(), 0x10000); c->dump_tree(&cout, NULL); @@ -186,88 +150,7 @@ TEST_P(IndepTest, basic) { } } -TEST_P(IndepTest, single_out_first) { - std::unique_ptr c(build_indep_map(cct, 3, 3, 3)); - c->dump_tree(&cout, NULL); - - for (int x = 0; x < 1000; ++x) { - vector<__u32> weight(c->get_max_devices(), 0x10000); - vector out; - c->do_rule(0, x, out, 5, weight, 0); - - int num_none = 0; - for (unsigned i=0; i out2; - c->do_rule(0, x, out2, 5, weight, 0); - - cout << "input " << x - << " marked out " << out[0] - << " out " << out - << " -> out2 " << out2 - << std::endl; - - // First item should have been remapped - ASSERT_NE(CRUSH_ITEM_NONE, out2[0]); - ASSERT_NE(out[0], out2[0]); - for (unsigned i=1; i c(build_indep_map(cct, 3, 3, 3)); - c->dump_tree(&cout, NULL); - - for (int x = 0; x < 1000; ++x) { - vector<__u32> weight(c->get_max_devices(), 0x10000); - vector out; - c->do_rule(0, x, out, 5, weight, 0); - - int num_none = 0; - for (unsigned i=0; i out2; - c->do_rule(0, x, out2, 5, weight, 0); - - cout << "input " << x - << " marked out " << out[0] - << " out " << out - << " -> out2 " << out2 - << std::endl; - - // Last - ASSERT_NE(CRUSH_ITEM_NONE, out2[last]); - ASSERT_NE(out[last], out2[last]); - for (unsigned i=0; i c(build_indep_map(cct, 3, 3, 3)); vector<__u32> weight(c->get_max_devices(), 0x10000); @@ -293,7 +176,7 @@ TEST_P(IndepTest, out_alt) { } } -TEST_P(IndepTest, out_contig) { +TEST_F(CRUSHTest, indep_out_contig) { std::unique_ptr c(build_indep_map(cct, 3, 3, 3)); vector<__u32> weight(c->get_max_devices(), 0x10000); @@ -318,7 +201,8 @@ TEST_P(IndepTest, out_contig) { } } -TEST_P(IndepTest, out_progressive) { + +TEST_F(CRUSHTest, indep_out_progressive) { std::unique_ptr c(build_indep_map(cct, 3, 3, 3)); c->set_choose_total_tries(100); vector<__u32> tweight(c->get_max_devices(), 0x10000); @@ -333,15 +217,8 @@ TEST_P(IndepTest, out_progressive) { for (unsigned i=0; i out; c->do_rule(0, x, out, 7, weight, 0); - cout << "(" << i << "/" << weight.size() << " out) "; - if (i > 0) cout << "marked out " << i - 1 << " "; - cout << x << " -> " << out << std::endl; - - int num_none = 0; - for (unsigned k=0; k " << out << std::endl; ASSERT_EQ(0, get_num_dups(out)); // make sure nothing moved @@ -361,6 +238,7 @@ TEST_P(IndepTest, out_progressive) { cout << " " << out[j] << " moved from " << pos[out[j]] << " to " << j << std::endl; ++moved; } + //ASSERT_EQ(j, pos[out[j]]); } } if (moved || changed) @@ -382,334 +260,6 @@ TEST_P(IndepTest, out_progressive) { } -INSTANTIATE_TEST_SUITE_P( - IndepTest, - IndepTest, - ::testing::Values(RuleType(true), RuleType(false)), - testing::PrintToStringParamName()); - -class FirstnTest : public ::testing::TestWithParam -{ -public: - void SetUp() final - { - CephInitParameters params(CEPH_ENTITY_TYPE_CLIENT); - cct = common_preinit(params, CODE_ENVIRONMENT_UTILITY, - CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); - } - void TearDown() final - { - cct->put(); - cct = nullptr; - } - - std::unique_ptr build_firstn_map( - CephContext *cct, int num_rack, int num_host, int num_osd) - { - std::unique_ptr c(new CrushWrapper); - c->create(); - c->set_tunables_optimal(); - - c->set_type_name(5, "root"); - c->set_type_name(4, "row"); - c->set_type_name(3, "rack"); - c->set_type_name(2, "chasis"); - c->set_type_name(1, "host"); - c->set_type_name(0, "osd"); - - int rootno; - c->add_bucket(0, CRUSH_BUCKET_STRAW, CRUSH_HASH_RJENKINS1, - 5, 0, NULL, NULL, &rootno); - c->set_item_name(rootno, "default"); - - map loc; - loc["root"] = "default"; - - int osd = 0; - for (int r=0; rinsert_item(cct, osd, 1.0, string("osd.") + stringify(osd), loc); - } - } - } - int ret; - int ruleno = 0; - - if (GetParam().is_msr()) { - unsigned step_id = 0; - ret = c->add_rule(ruleno, 4, CRUSH_RULE_TYPE_MSR_FIRSTN); - ceph_assert(ret == ruleno); - ret = c->set_rule_step(ruleno, step_id++, CRUSH_RULE_TAKE, rootno, 0); - ceph_assert(ret == 0); - ret = c->set_rule_step( - ruleno, step_id++, CRUSH_RULE_CHOOSE_MSR, CRUSH_CHOOSE_N, 1); - ceph_assert(ret == 0); - ret = c->set_rule_step(ruleno, step_id++, CRUSH_RULE_CHOOSE_MSR, 1, 0); - ceph_assert(ret == 0); - ret = c->set_rule_step(ruleno, step_id++, CRUSH_RULE_EMIT, 0, 0); - ceph_assert(ret == 0); - } else { - unsigned step_id = 0; - ret = c->add_rule(ruleno, 4, CRUSH_RULE_TYPE_ERASURE); - ceph_assert(ret == ruleno); - ret = c->set_rule_step( - ruleno, step_id++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 0, 0); - ceph_assert(ret == 0); - ret = c->set_rule_step(ruleno, step_id++, CRUSH_RULE_TAKE, rootno, 0); - ceph_assert(ret == 0); - ret = c->set_rule_step( - ruleno, step_id++, CRUSH_RULE_CHOOSELEAF_FIRSTN, CRUSH_CHOOSE_N, 1); - ceph_assert(ret == 0); - ret = c->set_rule_step(ruleno, step_id++, CRUSH_RULE_EMIT, 0, 0); - ceph_assert(ret == 0); - } - - c->set_rule_name(ruleno, "data"); - c->finalize(); - - if (false) { - Formatter *f = Formatter::create("json-pretty"); - f->open_object_section("crush_map"); - c->dump(f); - f->close_section(); - f->flush(cout); - delete f; - } - - return c; - } - -protected: - CephContext *cct = nullptr; -}; - -TEST_P(FirstnTest, basic) { - std::unique_ptr c(build_firstn_map(cct, 3, 3, 3)); - vector<__u32> weight(c->get_max_devices(), 0x10000); - c->dump_tree(&cout, NULL); - - for (int x = 0; x < 100; ++x) { - vector out; - c->do_rule(0, x, out, 3, weight, 0); - cout << x << " -> " << out << std::endl; - for (unsigned i=0; i c(build_firstn_map(cct, 1, 3, 1)); - vector<__u32> weight(c->get_max_devices(), 0x10000); - c->dump_tree(&cout, NULL); - - for (int x = 0; x < 100; ++x) { - vector out; - c->do_rule(0, x, out, 5, weight, 0); - cout << x << " -> " << out << std::endl; - for (unsigned i=0; i c(build_firstn_map(cct, 3, 3, 3)); - c->dump_tree(&cout, NULL); - - for (int x = 0; x < 1000; ++x) { - vector<__u32> weight(c->get_max_devices(), 0x10000); - vector out; - c->do_rule(0, x, out, 3, weight, 0); - - for (unsigned i=0; i out2; - c->do_rule(0, x, out2, 3, weight, 0); - - cout << "input " << x - << " marked out " << out[0] - << " out " << out - << " -> out2 " << out2 - << std::endl; - - ASSERT_EQ(3, out2.size()); - ASSERT_EQ(0, get_num_dups(out2)); - for (unsigned i=0; i c(build_firstn_map(cct, 3, 3, 3)); - c->dump_tree(&cout, NULL); - - for (int x = 0; x < 1000; ++x) { - vector<__u32> weight(c->get_max_devices(), 0x10000); - vector out; - c->do_rule(0, x, out, 3, weight, 0); - - for (unsigned i=0; i out2; - c->do_rule(0, x, out2, 3, weight, 0); - - cout << "input " << x - << " marked out " << out[0] - << " out " << out - << " -> out2 " << out2 - << std::endl; - - ASSERT_EQ(3, out2.size()); - ASSERT_EQ(0, get_num_dups(out2)); - for (unsigned i=0; i c(build_firstn_map(cct, 3, 3, 3)); - vector<__u32> weight(c->get_max_devices(), 0x10000); - - // mark a bunch of osds out - int num = 3*3*3; - for (int i=0; idump_tree(&cout, NULL); - - // need more retries to get 9/9 hosts for x in 0..99 - if (!GetParam().is_msr()) { - c->set_choose_total_tries(500); - } - for (int x = 0; x < 100; ++x) { - vector out; - c->do_rule(0, x, out, 9, weight, 0); - cout << x << " -> " << out << std::endl; - ASSERT_EQ(9, out.size()); - ASSERT_EQ(0, get_num_dups(out)); - } -} - -TEST_P(FirstnTest, out_contig) { - std::unique_ptr c(build_firstn_map(cct, 3, 3, 3)); - vector<__u32> weight(c->get_max_devices(), 0x10000); - - // mark a bunch of osds out - int num = 3*3*3; - for (int i=0; idump_tree(&cout, NULL); - - // need more retries to get 7/7 hosts for x in 0..99 - if (!GetParam().is_msr()) { - c->set_choose_total_tries(500); - } - for (int x = 0; x < 100; ++x) { - vector out; - c->do_rule(0, x, out, 7, weight, 0); - cout << x << " -> " << out << std::endl; - ASSERT_EQ(6, out.size()); - ASSERT_EQ(0, get_num_dups(out)); - } -} - -TEST_P(FirstnTest, out_progressive) { - std::unique_ptr c(build_firstn_map(cct, 3, 3, 3)); - if (!GetParam().is_msr()) { - c->set_choose_total_tries(500); - } - vector<__u32> tweight(c->get_max_devices(), 0x10000); - c->dump_tree(&cout, NULL); - - int tchanged = 0; - for (int x = 1; x < 5; ++x) { - vector<__u32> weight(c->get_max_devices(), 0x10000); - - std::set prev; - for (unsigned i=0; i out; - c->do_rule(0, x, out, 7, weight, 0); - cout << "(" << i << "/" << weight.size() << " out) "; - if (i > 0) cout << "marked out " << i - 1 << " "; - cout << x << " -> " << out << std::endl; - - ASSERT_EQ(0, get_num_dups(out)); - - int changed = 0; - for (unsigned j=0; j{out.begin(), out.end()}; - } - } - cout << tchanged << " total changed" << std::endl; -} - -INSTANTIATE_TEST_SUITE_P( - FirstnTest, - FirstnTest, - ::testing::Values(RuleType(true), RuleType(false)), - testing::PrintToStringParamName()); - -class CRUSHTest : public ::testing::Test -{ -public: - void SetUp() final - { - CephInitParameters params(CEPH_ENTITY_TYPE_CLIENT); - cct = common_preinit(params, CODE_ENVIRONMENT_UTILITY, - CINIT_FLAG_NO_DEFAULT_CONFIG_FILE); - } - void TearDown() final - { - cct->put(); - cct = nullptr; - } -protected: - CephContext *cct = nullptr; -}; - TEST_F(CRUSHTest, straw_zero) { // zero weight items should have no effect on placement. @@ -1103,459 +653,3 @@ TEST_F(CRUSHTest, straw2_reweight) { cout << " vs " << estddev << std::endl; } } - -struct cluster_test_spec_t { - const int num_osds_per_host; - const int num_hosts; - - const int num_hosts_mapped; - const int num_mapped_per_host; - const int num_mapped_size; - - const int num_osds; - - cluster_test_spec_t( - int num_osds_per_host, int num_hosts, - int num_hosts_mapped, int num_mapped_per_host, int num_mapped_size) - : num_osds_per_host(num_osds_per_host), num_hosts(num_hosts), - num_hosts_mapped(num_hosts_mapped), - num_mapped_per_host(num_mapped_per_host), - num_mapped_size(num_mapped_size), - num_osds(num_osds_per_host * num_hosts) {} - - void validate_osd(int osd) const { - EXPECT_GE(osd, 0); - EXPECT_LT(osd, num_osds); - } - - bool check_osd(int osd) const { - return osd >= 0 && osd < num_osds; - } - - void validate_host(int host) const { - assert(host >= 0); - assert(host < num_hosts); - } - - std::pair host_to_osd_range(int host) const { - validate_host(host); - auto first = host * num_osds_per_host; - return std::make_pair(first, first + num_osds_per_host); - } - - int osd_to_host(int osd) const { - validate_osd(osd); - return osd / num_osds_per_host; - } -}; - -static constexpr int ROOT_TYPE = 2; -static constexpr int HOST_TYPE = 1; -static constexpr int OSD_TYPE = 0; -std::pair> create_crush_heirarchy( - CephContext *cct, - const cluster_test_spec_t &spec) -{ - auto c = std::make_unique(); - c->create(); - c->set_tunables_optimal(); - - - c->set_type_name(ROOT_TYPE, "root"); - c->set_type_name(HOST_TYPE, "host"); - c->set_type_name(OSD_TYPE, "osd"); - - int rootno; - c->add_bucket(0, CRUSH_BUCKET_STRAW2, CRUSH_HASH_RJENKINS1, - ROOT_TYPE, 0, NULL, NULL, &rootno); - c->set_item_name(rootno, "default"); - - for (auto host_id = 0; host_id < spec.num_hosts; ++host_id) { - const std::string host_name = fmt::format("host{}", host_id); - const auto first_host_osd = host_id * spec.num_osds_per_host; - const auto next_first_host_osd = first_host_osd + spec.num_osds_per_host; - for (auto osd_id = first_host_osd; osd_id < next_first_host_osd; ++osd_id) { - const std::string osd_name = fmt::format("osd{}", osd_id); - auto ret = c->insert_item( - cct, osd_id, 1.0, osd_name, - {{ "root", "default"}, {"host", host_name}}); - EXPECT_EQ(ret, 0); - } - } - - c->finalize(); - return std::make_pair(rootno, std::move(c)); -} - -std::vector create_weight_vector( - const cluster_test_spec_t &spec) -{ - return std::vector(spec.num_osds, CEPH_OSD_IN); -} - -std::vector create_weight_vector_first_osd_out( - const cluster_test_spec_t &spec, - const std::vector &mapping) -{ - auto weights = create_weight_vector(spec); - spec.validate_osd(mapping[0]); - weights[mapping[0]] = CEPH_OSD_OUT; - return weights; -} - -std::vector create_weight_vector_first_host_out( - const cluster_test_spec_t &spec, - const std::vector &mapping) -{ - auto weights = create_weight_vector(spec); - const auto [first, end] = spec.host_to_osd_range(spec.osd_to_host(mapping[0])); - for (auto i = first; i < end; ++i) { - weights[i] = CEPH_OSD_OUT; - } - return weights; -} - -enum class mapping_change_t { - SAME, - FAILURE, - SAME_HOST, - NEW_HOST -}; -void compare_mappings( - const cluster_test_spec_t &spec, - const std::vector &before, - const std::vector &after, - mapping_change_t expectation, - const std::pair &range) -{ - const auto &[begin, end] = range; - for (auto i = begin; i < end; ++i) { - switch (expectation) { - case mapping_change_t::SAME: - EXPECT_EQ(before[i], after[i]); - break; - case mapping_change_t::FAILURE: - EXPECT_EQ(CRUSH_ITEM_NONE, after[i]); - break; - case mapping_change_t::SAME_HOST: - EXPECT_NE(before[i], after[i]); - if (!spec.check_osd(after[i])) { - spec.validate_osd(after[i]); - } else { - EXPECT_EQ(spec.osd_to_host(before[i]), spec.osd_to_host(after[i])); - } - break; - case mapping_change_t::NEW_HOST: - EXPECT_NE(before[i], after[i]); - if (!spec.check_osd(after[i])) { - spec.validate_osd(after[i]); - } else { - EXPECT_NE(spec.osd_to_host(before[i]), spec.osd_to_host(after[i])); - } - break; - } - } -} - -std::vector get_mapping( - const cluster_test_spec_t &spec, - CrushWrapper &c, - const std::vector &weights, - int ruleno) -{ - std::vector out; - c.do_rule( - ruleno, 0 /* seed */, out, spec.num_mapped_size, - weights, - 0); - EXPECT_EQ(std::size(out), spec.num_mapped_size); - return out; -} - -unsigned count_mapped(const auto &v) { - unsigned ret = 0; - for (const auto &i : v) ret += (i != CRUSH_ITEM_NONE); - return ret; -} - -TEST_F(CRUSHTest, msr_4_host_2_choose_rule) { - cluster_test_spec_t spec{3, 4, 3, 1, 3}; - auto [rootno, c] = create_crush_heirarchy(cct, spec); - - auto ruleno = c->add_rule(-1, 4, CRUSH_RULE_TYPE_MSR_INDEP); - EXPECT_EQ(0, c->set_rule_step_take(ruleno, 0, rootno)); - EXPECT_EQ( - 0, c->set_rule_step_choose_msr(ruleno, 1, spec.num_hosts_mapped, HOST_TYPE)); - EXPECT_EQ( - 0, - c->set_rule_step_choose_msr( - ruleno, 2, 1, OSD_TYPE)); - EXPECT_EQ(0, c->set_rule_step_emit(ruleno, 3)); - - auto weights_all_in = create_weight_vector(spec); - auto before = get_mapping(spec, *c, weights_all_in, ruleno); - for (auto i : before) { spec.validate_osd(i); } - - /* MSR test case. With normal CRUSH, hitting an out osd won't cause - * a retry of the previous step, so marking all of the osds on a host - * out will not cause positions mapped to that pg to remap. - * However, because the above is an MSR rule type, hitting an out osd - * will cause a retry of the previous steps as well. - * See https://tracker.ceph.com/issues/62214 for the original motivation */ - auto weights_host_out = create_weight_vector_first_host_out(spec, before); - auto after_host_out = get_mapping(spec, *c, weights_host_out, ruleno); - - CrushCompiler cc{*c, std::cout}; - cc.decompile(std::cout); - - fmt::print("weights_all_in: {}\n", fmt::join(weights_all_in, ", ")); - fmt::print("weights_host_out: {}\n", fmt::join(weights_host_out, ", ")); - fmt::print("before : {}\n", fmt::join(before, ", ")); - fmt::print("after_host_out: {}\n", fmt::join(after_host_out, ", ")); - - auto count_mapped = [](const auto &v) { - unsigned ret = 0; - for (const auto &i : v) ret += (i != CRUSH_ITEM_NONE); - return ret; - }; - - EXPECT_EQ(count_mapped(before), count_mapped(after_host_out)); - - auto weights_osd_out = create_weight_vector_first_osd_out(spec, before); - auto after_osd_out = get_mapping(spec, *c, weights_osd_out, ruleno); - EXPECT_EQ(count_mapped(before), count_mapped(after_osd_out)); -} - -TEST_F(CRUSHTest, msr_2_host_2_osd) { - cluster_test_spec_t spec{2, 3, 2, 2, 3}; - auto [rootno, c] = create_crush_heirarchy(cct, spec); - - auto ruleno = c->add_rule(-1, 4, CRUSH_RULE_TYPE_MSR_INDEP); - EXPECT_EQ(0, c->set_rule_step_take(ruleno, 0, rootno)); - EXPECT_EQ( - 0, c->set_rule_step_choose_msr(ruleno, 1, spec.num_hosts_mapped, HOST_TYPE)); - EXPECT_EQ( - 0, - c->set_rule_step_choose_msr( - ruleno, 2, spec.num_mapped_per_host, OSD_TYPE)); - EXPECT_EQ(0, c->set_rule_step_emit(ruleno, 3)); - - auto weights_all_in = create_weight_vector(spec); - auto before = get_mapping(spec, *c, weights_all_in, ruleno); - for (auto i : before) { spec.validate_osd(i); } - - fmt::print("before : {}\n", fmt::join(before, ", ")); - ASSERT_EQ(count_mapped(before), 3); - - /* MSR test case. With normal CRUSH, hitting an out osd won't cause - * a retry of the previous step, so marking all of the osds on a host - * out will not cause positions mapped to that pg to remap. - * However, because the above is an MSR rule type, hitting an out osd - * will cause a retry of the previous steps as well. - * See https://tracker.ceph.com/issues/62214 for the original motivation */ - auto weights_host_out = create_weight_vector_first_host_out(spec, before); - auto after_host_out = get_mapping(spec, *c, weights_host_out, ruleno); - - CrushCompiler cc{*c, std::cout}; - cc.decompile(std::cout); - - fmt::print("weights_all_in: {}\n", fmt::join(weights_all_in, ", ")); - fmt::print("weights_host_out: {}\n", fmt::join(weights_host_out, ", ")); - fmt::print("before : {}\n", fmt::join(before, ", ")); - fmt::print("after_host_out: {}\n", fmt::join(after_host_out, ", ")); - - compare_mappings( - spec, before, after_host_out, mapping_change_t::NEW_HOST, - {0, spec.num_mapped_per_host}); - compare_mappings( - spec, before, after_host_out, mapping_change_t::SAME, - {spec.num_mapped_per_host, spec.num_mapped_size}); -} - -TEST_F(CRUSHTest, msr_5_host_8_6_ec_choose) { - cluster_test_spec_t spec{4, 5, 4, 4, 14}; - auto [rootno, c] = create_crush_heirarchy(cct, spec); - - auto ruleno = c->add_rule(-1, 4, CRUSH_RULE_TYPE_MSR_INDEP); - unsigned step_id = 0; - EXPECT_EQ(0, c->set_rule_step_take(ruleno, step_id++, rootno)); - EXPECT_EQ( - 0, - c->set_rule_step_choose_msr( - ruleno, step_id++, spec.num_hosts_mapped, HOST_TYPE)); - EXPECT_EQ( - 0, - c->set_rule_step_choose_msr( - ruleno, step_id++, spec.num_mapped_per_host, OSD_TYPE)); - EXPECT_EQ(0, c->set_rule_step_emit(ruleno, step_id++)); - - auto weights_all_in = create_weight_vector(spec); - auto before = get_mapping(spec, *c, weights_all_in, ruleno); - for (auto i : before) { spec.validate_osd(i); } - - /* MSR test case. With normal CRUSH, hitting an out osd won't cause - * a retry of the previous step, so marking all of the osds on a host - * out will not cause positions mapped to that pg to remap. - * However, because the above is an MSR rule type, hitting an out osd - * will cause a retry of the previous steps as well. - * See https://tracker.ceph.com/issues/62214 for the original motivation */ - auto weights_host_out = create_weight_vector_first_host_out(spec, before); - auto after_host_out = get_mapping(spec, *c, weights_host_out, ruleno); - - CrushCompiler cc{*c, std::cout}; - cc.decompile(std::cout); - - fmt::print("weights_all_in: {}\n", fmt::join(weights_all_in, ", ")); - fmt::print("weights_host_out: {}\n", fmt::join(weights_host_out, ", ")); - fmt::print("before : {}\n", fmt::join(before, ", ")); - fmt::print("after_host_out: {}\n", fmt::join(after_host_out, ", ")); - - compare_mappings( - spec, before, after_host_out, mapping_change_t::NEW_HOST, - {0, spec.num_mapped_per_host}); - compare_mappings( - spec, before, after_host_out, mapping_change_t::SAME, - {spec.num_mapped_per_host, spec.num_mapped_size}); -} - -TEST_F(CRUSHTest, msr_multi_root) { - constexpr unsigned NUM_HOSTS = 4; - constexpr unsigned NUM_OSDS_PER_HOST = 3; - - auto c = CrushWrapper(); - c.create(); - c.set_tunables_optimal(); - - c.set_type_name(ROOT_TYPE, "root"); - c.set_type_name(HOST_TYPE, "host"); - c.set_type_name(OSD_TYPE, "osd"); - - std::map> osd_id_to_host_root; - std::map root_name_to_id; - std::map> host_name_to_osds; - unsigned next_osd_id = 0; - - auto populate_root = [&](const auto &root_name) { - int rootno; - c.add_bucket(0, CRUSH_BUCKET_STRAW2, CRUSH_HASH_RJENKINS1, - ROOT_TYPE, 0, NULL, NULL, &rootno); - c.set_item_name(rootno, root_name); - root_name_to_id[root_name] = rootno; - - for (unsigned host_id = 0; host_id < NUM_HOSTS; ++host_id) { - const std::string host_name = - fmt::format("{}-host{}", root_name, host_id); - for (unsigned osd = 0; osd < NUM_OSDS_PER_HOST; ++osd) { - const int osd_id = next_osd_id++; - const std::string osd_name = fmt::format("{}-osd{}", root_name, osd_id); - auto ret = c.insert_item( - cct, osd_id, 1.0, osd_name, - {{ "root", root_name }, { "host", host_name }}); - osd_id_to_host_root[osd_id] = std::make_pair(host_name, root_name); - host_name_to_osds[host_name].push_back(osd_id); - EXPECT_EQ(ret, 0); - } - } - }; - - int ruleno = 0; - int ret = c.add_rule(ruleno, 8, CRUSH_RULE_TYPE_MSR_INDEP); - ceph_assert(ret == ruleno); - - unsigned step_id = 0; - auto populate_rule = [&](const auto &rule_name) { - ret = c.set_rule_step( - ruleno, step_id++, CRUSH_RULE_TAKE, root_name_to_id[rule_name], 0); - ceph_assert(ret == 0); - ret = c.set_rule_step( - ruleno, step_id++, CRUSH_RULE_CHOOSE_MSR, 2, HOST_TYPE); - ceph_assert(ret == 0); - ret = c.set_rule_step( - ruleno, step_id++, CRUSH_RULE_CHOOSE_MSR, 2, OSD_TYPE); - ceph_assert(ret == 0); - ret = c.set_rule_step(ruleno, step_id++, CRUSH_RULE_EMIT, 0, 0); - ceph_assert(ret == 0); - }; - - for (const auto &root_name : { "ssd", "hdd" }) { - populate_root(root_name); - populate_rule(root_name); - } - c.set_rule_name(ruleno, "rule_name"); - c.finalize(); - - constexpr unsigned ACTING_SIZE = 8; - constexpr unsigned OSDS_PER_ROOT = 4; - constexpr unsigned OSDS_PER_HOST = 2; - auto validate_output = [&](const auto &out) { - std::set hosts; - for (unsigned host = 0; host < (ACTING_SIZE / OSDS_PER_HOST); ++host) { - std::set hosts_this_failure_domain; - unsigned start = host * OSDS_PER_HOST; - unsigned end = (host + 1) * OSDS_PER_HOST; - for (unsigned i = start; i < end; ++i) { - EXPECT_NE(out[i], CRUSH_ITEM_NONE); - EXPECT_EQ(osd_id_to_host_root.count(out[i]), 1); - const auto &[host_name, root_name] = osd_id_to_host_root[out[start]]; - EXPECT_EQ(i < OSDS_PER_ROOT ? "ssd" : "hdd", root_name); - hosts_this_failure_domain.insert(host_name); - } - for (const auto &i: hosts_this_failure_domain) { - EXPECT_EQ(hosts.count(i), 0); - hosts.insert(i); - } - } - }; - - const std::vector all_in(next_osd_id, CEPH_OSD_IN); - for (int x = 0; x < 1000; ++x) { - std::vector out; - c.do_rule(ruleno, x, out, 8, all_in, 0); - EXPECT_EQ(count_mapped(out), 8); - validate_output(out); - - { - std::vector osds_out_weight = all_in; - std::set osd_idx_out{{1, 5}}; - for (const auto &i: osd_idx_out) { - osds_out_weight[out[i]] = CEPH_OSD_OUT; - } - std::vector osds_out; - c.do_rule(ruleno, x, osds_out, 8, osds_out_weight, 0); - EXPECT_EQ(count_mapped(osds_out), 8); - validate_output(osds_out); - for (unsigned i = 0; i < osds_out.size(); ++i) { - if (osd_idx_out.count(i)) { - EXPECT_NE(osds_out[i], out[i]); - } else { - EXPECT_EQ(osds_out[i], out[i]); - } - } - } - - { - std::vector hosts_out_weight = all_in; - std::set osd_ids_out; - - for (const auto &i : {2, 6}) { - const auto &[host_name, _] = osd_id_to_host_root[out[i]]; - for (const auto &osd_id: host_name_to_osds[host_name]) { - osd_ids_out.insert(osd_id); - hosts_out_weight[osd_id] = CEPH_OSD_OUT; - } - } - - std::vector hosts_out; - c.do_rule(ruleno, x, hosts_out, 8, hosts_out_weight, 0); - EXPECT_EQ(count_mapped(hosts_out), 8); - validate_output(hosts_out); - for (unsigned i = 0; i < hosts_out.size(); ++i) { - if (osd_ids_out.count(out[i])) { - EXPECT_NE(hosts_out[i], out[i]); - } else { - EXPECT_EQ(hosts_out[i], out[i]); - } - } - } - } -} diff --git a/src/vstart.sh b/src/vstart.sh index 13155003ab47e..0c9ef32377910 100755 --- a/src/vstart.sh +++ b/src/vstart.sh @@ -176,9 +176,6 @@ zoned_enabled=0 io_uring_enabled=0 with_jaeger=0 force_addr=0 -osds_per_host=0 -require_osd_and_client_version="" -use_crush_tunables="" with_mgr_dashboard=true if [[ "$(get_cmake_variable WITH_MGR_DASHBOARD_FRONTEND)" != "ON" ]] || @@ -602,21 +599,6 @@ case $1 in with_jaeger=1 echo "with_jaeger $with_jaeger" ;; - --osds-per-host) - osds_per_host="$2" - shift - echo "osds_per_host $osds_per_host" - ;; - --require-osd-and-client-version) - require_osd_and_client_version="$2" - shift - echo "require_osd_and_client_version $require_osd_and_client_version" - ;; - --use-crush-tunables) - use_crush_tunables="$2" - shift - echo "use_crush_tunables $use_crush_tunables" - ;; *) usage_exit esac @@ -1113,15 +1095,6 @@ EOF if [ "$crimson" -eq 1 ]; then $CEPH_BIN/ceph osd set-allow-crimson --yes-i-really-mean-it fi - - if [ -n "$require_osd_and_client_version" ]; then - $CEPH_BIN/ceph osd set-require-min-compat-client $require_osd_and_client_version - $CEPH_BIN/ceph osd require-osd-release $require_osd_and_client_version --yes-i-really-mean-it - fi - - if [ -n "$use_crush_tunables" ]; then - $CEPH_BIN/ceph osd crush tunables $use_crush_tunables - fi } start_osd() { @@ -1155,13 +1128,6 @@ start_osd() { [osd.$osd] host = $HOSTNAME EOF - - if [ "$osds_per_host" -gt 0 ]; then - wconf <