aboutsummaryrefslogtreecommitdiff
path: root/net/ceph/crush/mapper.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ceph/crush/mapper.c')
-rw-r--r--net/ceph/crush/mapper.c484
1 files changed, 349 insertions, 135 deletions
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b79747c4b64..a1ef53c0441 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -20,6 +20,7 @@
#include <linux/crush/crush.h>
#include <linux/crush/hash.h>
+#include <linux/crush/mapper.h>
/*
* Implement the core CRUSH mapping algorithm.
@@ -32,9 +33,9 @@
* @type: storage ruleset type (user defined)
* @size: output set size
*/
-int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
+int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size)
{
- int i;
+ __u32 i;
for (i = 0; i < map->max_rules; i++) {
if (map->rules[i] &&
@@ -68,11 +69,11 @@ int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
static int bucket_perm_choose(struct crush_bucket *bucket,
int x, int r)
{
- unsigned pr = r % bucket->size;
- unsigned i, s;
+ unsigned int pr = r % bucket->size;
+ unsigned int i, s;
/* start a new permutation if @x has changed */
- if (bucket->perm_x != x || bucket->perm_n == 0) {
+ if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
dprintk("bucket %d new x=%d\n", bucket->id, x);
bucket->perm_x = x;
@@ -100,13 +101,13 @@ static int bucket_perm_choose(struct crush_bucket *bucket,
for (i = 0; i < bucket->perm_n; i++)
dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
while (bucket->perm_n <= pr) {
- unsigned p = bucket->perm_n;
+ unsigned int p = bucket->perm_n;
/* no point in swapping the final entry */
if (p < bucket->size - 1) {
i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
(bucket->size - p);
if (i) {
- unsigned t = bucket->perm[p + i];
+ unsigned int t = bucket->perm[p + i];
bucket->perm[p + i] = bucket->perm[p];
bucket->perm[p] = t;
}
@@ -152,8 +153,8 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
return bucket->h.items[i];
}
- BUG_ON(1);
- return 0;
+ dprintk("bad list sums for bucket %d\n", bucket->h.id);
+ return bucket->h.items[0];
}
@@ -188,7 +189,7 @@ static int terminal(int x)
static int bucket_tree_choose(struct crush_bucket_tree *bucket,
int x, int r)
{
- int n, l;
+ int n;
__u32 w;
__u64 t;
@@ -196,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
n = bucket->num_nodes >> 1;
while (!terminal(n)) {
+ int l;
/* pick point in [0, w) */
w = bucket->node_weights[n];
t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
@@ -219,7 +221,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
static int bucket_straw_choose(struct crush_bucket_straw *bucket,
int x, int r)
{
- int i;
+ __u32 i;
int high = 0;
__u64 high_draw = 0;
__u64 draw;
@@ -239,6 +241,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
{
dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
+ BUG_ON(in->size == 0);
switch (in->alg) {
case CRUSH_BUCKET_UNIFORM:
return bucket_uniform_choose((struct crush_bucket_uniform *)in,
@@ -253,7 +256,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
return bucket_straw_choose((struct crush_bucket_straw *)in,
x, r);
default:
- BUG_ON(1);
+ dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
return in->items[0];
}
}
@@ -262,8 +265,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
* true if device is marked "out" (failed, fully offloaded)
* of the cluster
*/
-static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+static int is_out(const struct crush_map *map,
+ const __u32 *weight, int weight_max,
+ int item, int x)
{
+ if (item >= weight_max)
+ return 1;
if (weight[item] >= 0x10000)
return 0;
if (weight[item] == 0)
@@ -275,7 +282,7 @@ static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
}
/**
- * crush_choose - choose numrep distinct items of given type
+ * crush_choose_firstn - choose numrep distinct items of given type
* @map: the crush_map
* @bucket: the bucket we are choosing an item from
* @x: crush input value
@@ -283,20 +290,31 @@ static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
* @type: the type of item to choose
* @out: pointer to output vector
* @outpos: our position in that vector
- * @firstn: true if choosing "first n" items, false if choosing "indep"
- * @recurse_to_leaf: true if we want one device under each item of given type
+ * @tries: number of attempts to make
+ * @recurse_tries: number of attempts to have recursive chooseleaf make
+ * @local_retries: localized retries
+ * @local_fallback_retries: localized fallback retries
+ * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @vary_r: pass r to recursive calls
* @out2: second output vector for leaf items (if @recurse_to_leaf)
+ * @parent_r: r value passed from the parent
*/
-static int crush_choose(struct crush_map *map,
- struct crush_bucket *bucket,
- __u32 *weight,
- int x, int numrep, int type,
- int *out, int outpos,
- int firstn, int recurse_to_leaf,
- int *out2)
+static int crush_choose_firstn(const struct crush_map *map,
+ struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int numrep, int type,
+ int *out, int outpos,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ unsigned int local_retries,
+ unsigned int local_fallback_retries,
+ int recurse_to_leaf,
+ unsigned int vary_r,
+ int *out2,
+ int parent_r)
{
int rep;
- int ftotal, flocal;
+ unsigned int ftotal, flocal;
int retry_descent, retry_bucket, skip_rep;
struct crush_bucket *in = bucket;
int r;
@@ -304,10 +322,12 @@ static int crush_choose(struct crush_map *map,
int item = 0;
int itemtype;
int collide, reject;
- const int orig_tries = 5; /* attempts before we fall back to search */
- dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
- bucket->id, x, outpos, numrep);
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
+ recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep,
+ tries, recurse_tries, local_retries, local_fallback_retries,
+ parent_r);
for (rep = outpos; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */
@@ -322,39 +342,26 @@ static int crush_choose(struct crush_map *map,
do {
collide = 0;
retry_bucket = 0;
- r = rep;
- if (in->alg == CRUSH_BUCKET_UNIFORM) {
- /* be careful */
- if (firstn || numrep >= in->size)
- /* r' = r + f_total */
- r += ftotal;
- else if (in->size % numrep == 0)
- /* r'=r+(n+1)*f_local */
- r += (numrep+1) *
- (flocal+ftotal);
- else
- /* r' = r + n*f_local */
- r += numrep * (flocal+ftotal);
- } else {
- if (firstn)
- /* r' = r + f_total */
- r += ftotal;
- else
- /* r' = r + n*f_local */
- r += numrep * (flocal+ftotal);
- }
+ r = rep + parent_r;
+ /* r' = r + f_total */
+ r += ftotal;
/* bucket choose */
if (in->size == 0) {
reject = 1;
goto reject;
}
- if (flocal >= (in->size>>1) &&
- flocal > orig_tries)
+ if (local_fallback_retries > 0 &&
+ flocal >= (in->size>>1) &&
+ flocal > local_fallback_retries)
item = bucket_perm_choose(in, x, r);
else
item = crush_bucket_choose(in, x, r);
- BUG_ON(item >= map->max_devices);
+ if (item >= map->max_devices) {
+ dprintk(" bad item %d\n", item);
+ skip_rep = 1;
+ break;
+ }
/* desired type? */
if (item < 0)
@@ -365,8 +372,12 @@ static int crush_choose(struct crush_map *map,
/* keep going? */
if (itemtype != type) {
- BUG_ON(item >= 0 ||
- (-1-item) >= map->max_buckets);
+ if (item >= 0 ||
+ (-1-item) >= map->max_buckets) {
+ dprintk(" bad item type %d\n", type);
+ skip_rep = 1;
+ break;
+ }
in = map->buckets[-1-item];
retry_bucket = 1;
continue;
@@ -381,15 +392,25 @@ static int crush_choose(struct crush_map *map,
}
reject = 0;
- if (recurse_to_leaf) {
+ if (!collide && recurse_to_leaf) {
if (item < 0) {
- if (crush_choose(map,
+ int sub_r;
+ if (vary_r)
+ sub_r = r >> (vary_r-1);
+ else
+ sub_r = 0;
+ if (crush_choose_firstn(map,
map->buckets[-1-item],
- weight,
+ weight, weight_max,
x, outpos+1, 0,
out2, outpos,
- firstn, 0,
- NULL) <= outpos)
+ recurse_tries, 0,
+ local_retries,
+ local_fallback_retries,
+ 0,
+ vary_r,
+ NULL,
+ sub_r) <= outpos)
/* didn't get leaf */
reject = 1;
} else {
@@ -402,6 +423,7 @@ static int crush_choose(struct crush_map *map,
/* out? */
if (itemtype == 0)
reject = is_out(map, weight,
+ weight_max,
item, x);
else
reject = 0;
@@ -412,20 +434,21 @@ reject:
ftotal++;
flocal++;
- if (collide && flocal < 3)
+ if (collide && flocal <= local_retries)
/* retry locally a few times */
retry_bucket = 1;
- else if (flocal < in->size + orig_tries)
+ else if (local_fallback_retries > 0 &&
+ flocal <= in->size + local_fallback_retries)
/* exhaustive bucket search */
retry_bucket = 1;
- else if (ftotal < 20)
+ else if (ftotal < tries)
/* then retry descent */
retry_descent = 1;
else
/* else give up */
skip_rep = 1;
dprintk(" reject %d collide %d "
- "ftotal %d flocal %d\n",
+ "ftotal %u flocal %u\n",
reject, collide, ftotal,
flocal);
}
@@ -448,24 +471,179 @@ reject:
/**
+ * crush_choose_indep: alternative breadth-first positionally stable mapping
+ *
+ * Fills the slots out[outpos] .. out[outpos + left - 1]. Every slot starts
+ * as CRUSH_ITEM_UNDEF; the function then makes up to @tries passes over the
+ * slots, and on each pass attempts to fill every slot that is still
+ * undefined. A slot that is still undefined once the passes are exhausted
+ * is set to CRUSH_ITEM_NONE, so on return no slot in the range is left as
+ * CRUSH_ITEM_UNDEF.
+ *
+ * @map: the crush_map
+ * @bucket: the bucket each descent starts from
+ * @weight: weight vector, handed to is_out() for type-0 items
+ * @weight_max: size of the weight vector, handed to is_out()
+ * @x: crush input value
+ * @left: number of slots, starting at @outpos, that remain to be filled
+ * @numrep: replica count; used as the stride applied to r between passes
+ * @type: the type of item to choose; descent continues through child
+ *        buckets until an item of this type is reached
+ * @out: pointer to output vector
+ * @outpos: index of the first slot in @out (and @out2) to fill
+ * @tries: maximum number of passes over the slots
+ * @recurse_tries: value passed as @tries to the nested leaf lookup
+ * @recurse_to_leaf: true if we want one leaf under each chosen item; @out2
+ *                   is written without a NULL check in that case
+ * @out2: second output vector for leaf items (if @recurse_to_leaf); may be
+ *        NULL otherwise
+ * @parent_r: r value passed from the parent; added to the slot index
+ */
+static void crush_choose_indep(const struct crush_map *map,
+ struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int left, int numrep, int type,
+ int *out, int outpos,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ int recurse_to_leaf,
+ int *out2,
+ int parent_r)
+{
+ struct crush_bucket *in = bucket;
+ int endpos = outpos + left;
+ int rep;
+ unsigned int ftotal;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide;
+
+ dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep);
+
+ /* initially my result is undefined; CRUSH_ITEM_UNDEF marks a slot
+ * that the passes below still have to fill */
+ for (rep = outpos; rep < endpos; rep++) {
+ out[rep] = CRUSH_ITEM_UNDEF;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_UNDEF;
+ }
+
+ for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] != CRUSH_ITEM_UNDEF)
+ continue;
+
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets; a break that leaves
+ * out[rep] undefined means the slot is retried on the next
+ * pass with a larger ftotal */
+ for (;;) {
+ /* note: we base the choice on the position
+ * even in the nested call. that means that
+ * if the first layer chooses the same bucket
+ * in a different position, we will tend to
+ * choose a different item in that bucket.
+ * this will involve more devices in data
+ * movement and tend to distribute the load.
+ */
+ r = rep + parent_r;
+
+ /* be careful */
+ if (in->alg == CRUSH_BUCKET_UNIFORM &&
+ in->size % numrep == 0)
+ /* r'=r+(n+1)*f_total */
+ r += (numrep+1) * ftotal;
+ else
+ /* r' = r + n*f_total */
+ r += numrep * ftotal;
+
+ /* bucket choose */
+ if (in->size == 0) {
+ dprintk(" empty bucket\n");
+ break;
+ }
+
+ item = crush_bucket_choose(in, x, r);
+ if (item >= map->max_devices) {
+ dprintk(" bad item %d\n", item);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+
+ /* desired type? negative items index map->buckets as
+ * -1-item; non-negative items are treated as type 0 */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ if (item >= 0 ||
+ (-1-item) >= map->max_buckets) {
+ dprintk(" bad item type %d\n", type);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] =
+ CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+ in = map->buckets[-1-item];
+ continue;
+ }
+
+ /* collision? the item must not already occupy any slot
+ * in [outpos, endpos) */
+ collide = 0;
+ for (i = outpos; i < endpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+ if (collide)
+ break;
+
+ if (recurse_to_leaf) {
+ if (item < 0) {
+ crush_choose_indep(map,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, 1, numrep, 0,
+ out2, rep,
+ recurse_tries, 0,
+ 0, NULL, r);
+ if (out2[rep] == CRUSH_ITEM_NONE) {
+ /* placed nothing; no leaf */
+ break;
+ }
+ } else {
+ /* we already have a leaf! */
+ out2[rep] = item;
+ }
+ }
+
+ /* out?
+ * NOTE(review): with recurse_to_leaf set, out2[rep] was
+ * already assigned above and is not reset when the item
+ * is rejected here - confirm callers tolerate that */
+ if (itemtype == 0 &&
+ is_out(map, weight, weight_max, item, x))
+ break;
+
+ /* yay! */
+ out[rep] = item;
+ left--;
+ break;
+ }
+ }
+ }
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] == CRUSH_ITEM_UNDEF) {
+ out[rep] = CRUSH_ITEM_NONE;
+ }
+ if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
+ out2[rep] = CRUSH_ITEM_NONE;
+ }
+ }
+}
+
+/**
* crush_do_rule - calculate a mapping with the given input and rule
* @map: the crush_map
* @ruleno: the rule id
* @x: hash input
* @result: pointer to result vector
* @result_max: maximum result size
- * @force: force initial replica choice; -1 for none
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @scratch: scratch vector for private use; must hold at least 3 * result_max ints
*/
-int crush_do_rule(struct crush_map *map,
+int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max,
- int force, __u32 *weight)
+ const __u32 *weight, int weight_max,
+ int *scratch)
{
int result_len;
- int force_context[CRUSH_MAX_DEPTH];
- int force_pos = -1;
- int a[CRUSH_MAX_SET];
- int b[CRUSH_MAX_SET];
- int c[CRUSH_MAX_SET];
+ int *a = scratch;
+ int *b = scratch + result_max;
+ int *c = scratch + result_max*2;
int recurse_to_leaf;
int *w;
int wsize = 0;
@@ -473,67 +651,83 @@ int crush_do_rule(struct crush_map *map,
int osize;
int *tmp;
struct crush_rule *rule;
- int step;
+ __u32 step;
int i, j;
int numrep;
- int firstn;
+ /*
+ * the original choose_total_tries value was off by one (it
+ * counted "retries" and not "tries"). add one.
+ */
+ int choose_tries = map->choose_total_tries + 1;
+ int choose_leaf_tries = 0;
+ /*
+ * the local tries values were counted as "retries", though,
+ * and need no adjustment
+ */
+ int choose_local_retries = map->choose_local_tries;
+ int choose_local_fallback_retries = map->choose_local_fallback_tries;
- BUG_ON(ruleno >= map->max_rules);
+ int vary_r = map->chooseleaf_vary_r;
+
+ if ((__u32)ruleno >= map->max_rules) {
+ dprintk(" bad ruleno %d\n", ruleno);
+ return 0;
+ }
rule = map->rules[ruleno];
result_len = 0;
w = a;
o = b;
- /*
- * determine hierarchical context of force, if any. note
- * that this may or may not correspond to the specific types
- * referenced by the crush rule.
- */
- if (force >= 0 &&
- force < map->max_devices &&
- map->device_parents[force] != 0 &&
- !is_out(map, weight, force, x)) {
- while (1) {
- force_context[++force_pos] = force;
- if (force >= 0)
- force = map->device_parents[force];
- else
- force = map->bucket_parents[-1-force];
- if (force == 0)
- break;
- }
- }
-
for (step = 0; step < rule->len; step++) {
- firstn = 0;
- switch (rule->steps[step].op) {
+ int firstn = 0;
+ struct crush_rule_step *curstep = &rule->steps[step];
+
+ switch (curstep->op) {
case CRUSH_RULE_TAKE:
- w[0] = rule->steps[step].arg1;
+ w[0] = curstep->arg1;
+ wsize = 1;
+ break;
- /* find position in force_context/hierarchy */
- while (force_pos >= 0 &&
- force_context[force_pos] != w[0])
- force_pos--;
- /* and move past it */
- if (force_pos >= 0)
- force_pos--;
+ case CRUSH_RULE_SET_CHOOSE_TRIES:
+ if (curstep->arg1 > 0)
+ choose_tries = curstep->arg1;
+ break;
- wsize = 1;
+ case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+ if (curstep->arg1 > 0)
+ choose_leaf_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
+ if (curstep->arg1 >= 0)
+ choose_local_retries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
+ if (curstep->arg1 >= 0)
+ choose_local_fallback_retries = curstep->arg1;
break;
- case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+ case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
+ if (curstep->arg1 >= 0)
+ vary_r = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_CHOOSELEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_FIRSTN:
firstn = 1;
- case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+ /* fall through */
+ case CRUSH_RULE_CHOOSELEAF_INDEP:
case CRUSH_RULE_CHOOSE_INDEP:
- BUG_ON(wsize == 0);
+ if (wsize == 0)
+ break;
recurse_to_leaf =
- rule->steps[step].op ==
- CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
- rule->steps[step].op ==
- CRUSH_RULE_CHOOSE_LEAF_INDEP;
+ curstep->op ==
+ CRUSH_RULE_CHOOSELEAF_FIRSTN ||
+ curstep->op ==
+ CRUSH_RULE_CHOOSELEAF_INDEP;
/* reset output */
osize = 0;
@@ -544,42 +738,60 @@ int crush_do_rule(struct crush_map *map,
* basically, numrep <= 0 means relative to
* the provided result_max
*/
- numrep = rule->steps[step].arg1;
+ numrep = curstep->arg1;
if (numrep <= 0) {
numrep += result_max;
if (numrep <= 0)
continue;
}
j = 0;
- if (osize == 0 && force_pos >= 0) {
- /* skip any intermediate types */
- while (force_pos &&
- force_context[force_pos] < 0 &&
- rule->steps[step].arg2 !=
- map->buckets[-1 -
- force_context[force_pos]]->type)
- force_pos--;
- o[osize] = force_context[force_pos];
- if (recurse_to_leaf)
- c[osize] = force_context[0];
- j++;
- force_pos--;
+ if (firstn) {
+ int recurse_tries;
+ if (choose_leaf_tries)
+ recurse_tries =
+ choose_leaf_tries;
+ else if (map->chooseleaf_descend_once)
+ recurse_tries = 1;
+ else
+ recurse_tries = choose_tries;
+ osize += crush_choose_firstn(
+ map,
+ map->buckets[-1-w[i]],
+ weight, weight_max,
+ x, numrep,
+ curstep->arg2,
+ o+osize, j,
+ choose_tries,
+ recurse_tries,
+ choose_local_retries,
+ choose_local_fallback_retries,
+ recurse_to_leaf,
+ vary_r,
+ c+osize,
+ 0);
+ } else {
+ crush_choose_indep(
+ map,
+ map->buckets[-1-w[i]],
+ weight, weight_max,
+ x, numrep, numrep,
+ curstep->arg2,
+ o+osize, j,
+ choose_tries,
+ choose_leaf_tries ?
+ choose_leaf_tries : 1,
+ recurse_to_leaf,
+ c+osize,
+ 0);
+ osize += numrep;
}
- osize += crush_choose(map,
- map->buckets[-1-w[i]],
- weight,
- x, numrep,
- rule->steps[step].arg2,
- o+osize, j,
- firstn,
- recurse_to_leaf, c+osize);
}
if (recurse_to_leaf)
/* copy final _leaf_ values to output set */
memcpy(o, c, osize*sizeof(*o));
- /* swap t and w arrays */
+ /* swap o and w arrays */
tmp = o;
o = w;
w = tmp;
@@ -596,7 +808,9 @@ int crush_do_rule(struct crush_map *map,
break;
default:
- BUG_ON(1);
+ dprintk(" unknown op %d at step %d\n",
+ curstep->op, step);
+ break;
}
}
return result_len;