aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/Kconfig2
-rw-r--r--net/ceph/crush/mapper.c85
-rw-r--r--net/ceph/debugfs.c55
-rw-r--r--net/ceph/messenger.c6
-rw-r--r--net/ceph/osd_client.c41
-rw-r--r--net/ceph/osdmap.c993
-rw-r--r--net/core/flow.c8
-rw-r--r--net/core/netclassid_cgroup.c15
-rw-r--r--net/core/netprio_cgroup.c41
-rw-r--r--net/ipv4/route.c2
-rw-r--r--net/ipv4/tcp_memcontrol.c4
-rw-r--r--net/iucv/iucv.c127
-rw-r--r--net/sunrpc/backchannel_rqst.c93
-rw-r--r--net/sunrpc/clnt.c23
-rw-r--r--net/sunrpc/sched.c3
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c4
-rw-r--r--net/sunrpc/xprtrdma/transport.c10
-rw-r--r--net/sunrpc/xprtsock.c28
18 files changed, 1036 insertions, 504 deletions
diff --git a/net/Kconfig b/net/Kconfig
index d1f6f968fc0..d92afe4204d 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -243,7 +243,7 @@ config XPS
default y
config CGROUP_NET_PRIO
- tristate "Network priority cgroup"
+ bool "Network priority cgroup"
depends on CGROUPS
---help---
Cgroup subsystem for use in assigning processes to network priorities on
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b703790b4e4..a1ef53c0441 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -292,10 +292,12 @@ static int is_out(const struct crush_map *map,
* @outpos: our position in that vector
* @tries: number of attempts to make
* @recurse_tries: number of attempts to have recursive chooseleaf make
- * @local_tries: localized retries
- * @local_fallback_tries: localized fallback retries
+ * @local_retries: localized retries
+ * @local_fallback_retries: localized fallback retries
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @vary_r: pass r to recursive calls
* @out2: second output vector for leaf items (if @recurse_to_leaf)
+ * @parent_r: r value passed from the parent
*/
static int crush_choose_firstn(const struct crush_map *map,
struct crush_bucket *bucket,
@@ -304,10 +306,12 @@ static int crush_choose_firstn(const struct crush_map *map,
int *out, int outpos,
unsigned int tries,
unsigned int recurse_tries,
- unsigned int local_tries,
- unsigned int local_fallback_tries,
+ unsigned int local_retries,
+ unsigned int local_fallback_retries,
int recurse_to_leaf,
- int *out2)
+ unsigned int vary_r,
+ int *out2,
+ int parent_r)
{
int rep;
unsigned int ftotal, flocal;
@@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map,
int itemtype;
int collide, reject;
- dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
- bucket->id, x, outpos, numrep);
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
+ recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep,
+ tries, recurse_tries, local_retries, local_fallback_retries,
+ parent_r);
for (rep = outpos; rep < numrep; rep++) {
/* keep trying until we get a non-out, non-colliding item */
@@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map,
do {
collide = 0;
retry_bucket = 0;
- r = rep;
+ r = rep + parent_r;
/* r' = r + f_total */
r += ftotal;
@@ -344,9 +351,9 @@ static int crush_choose_firstn(const struct crush_map *map,
reject = 1;
goto reject;
}
- if (local_fallback_tries > 0 &&
+ if (local_fallback_retries > 0 &&
flocal >= (in->size>>1) &&
- flocal > local_fallback_tries)
+ flocal > local_fallback_retries)
item = bucket_perm_choose(in, x, r);
else
item = crush_bucket_choose(in, x, r);
@@ -387,16 +394,23 @@ static int crush_choose_firstn(const struct crush_map *map,
reject = 0;
if (!collide && recurse_to_leaf) {
if (item < 0) {
+ int sub_r;
+ if (vary_r)
+ sub_r = r >> (vary_r-1);
+ else
+ sub_r = 0;
if (crush_choose_firstn(map,
map->buckets[-1-item],
weight, weight_max,
x, outpos+1, 0,
out2, outpos,
recurse_tries, 0,
- local_tries,
- local_fallback_tries,
+ local_retries,
+ local_fallback_retries,
0,
- NULL) <= outpos)
+ vary_r,
+ NULL,
+ sub_r) <= outpos)
/* didn't get leaf */
reject = 1;
} else {
@@ -420,14 +434,14 @@ reject:
ftotal++;
flocal++;
- if (collide && flocal <= local_tries)
+ if (collide && flocal <= local_retries)
/* retry locally a few times */
retry_bucket = 1;
- else if (local_fallback_tries > 0 &&
- flocal <= in->size + local_fallback_tries)
+ else if (local_fallback_retries > 0 &&
+ flocal <= in->size + local_fallback_retries)
/* exhaustive bucket search */
retry_bucket = 1;
- else if (ftotal <= tries)
+ else if (ftotal < tries)
/* then retry descent */
retry_descent = 1;
else
@@ -640,10 +654,20 @@ int crush_do_rule(const struct crush_map *map,
__u32 step;
int i, j;
int numrep;
- int choose_tries = map->choose_total_tries;
- int choose_local_tries = map->choose_local_tries;
- int choose_local_fallback_tries = map->choose_local_fallback_tries;
+ /*
+ * the original choose_total_tries value was off by one (it
+ * counted "retries" and not "tries"). add one.
+ */
+ int choose_tries = map->choose_total_tries + 1;
int choose_leaf_tries = 0;
+ /*
+ * the local tries values were counted as "retries", though,
+ * and need no adjustment
+ */
+ int choose_local_retries = map->choose_local_tries;
+ int choose_local_fallback_retries = map->choose_local_fallback_tries;
+
+ int vary_r = map->chooseleaf_vary_r;
if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno);
@@ -676,13 +700,18 @@ int crush_do_rule(const struct crush_map *map,
break;
case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
- if (curstep->arg1 > 0)
- choose_local_tries = curstep->arg1;
+ if (curstep->arg1 >= 0)
+ choose_local_retries = curstep->arg1;
break;
case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
- if (curstep->arg1 > 0)
- choose_local_fallback_tries = curstep->arg1;
+ if (curstep->arg1 >= 0)
+ choose_local_fallback_retries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
+ if (curstep->arg1 >= 0)
+ vary_r = curstep->arg1;
break;
case CRUSH_RULE_CHOOSELEAF_FIRSTN:
@@ -734,10 +763,12 @@ int crush_do_rule(const struct crush_map *map,
o+osize, j,
choose_tries,
recurse_tries,
- choose_local_tries,
- choose_local_fallback_tries,
+ choose_local_retries,
+ choose_local_fallback_retries,
recurse_to_leaf,
- c+osize);
+ vary_r,
+ c+osize,
+ 0);
} else {
crush_choose_indep(
map,
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 258a382e75e..10421a4b76f 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -53,34 +53,55 @@ static int osdmap_show(struct seq_file *s, void *p)
{
int i;
struct ceph_client *client = s->private;
+ struct ceph_osdmap *map = client->osdc.osdmap;
struct rb_node *n;
- if (client->osdc.osdmap == NULL)
+ if (map == NULL)
return 0;
- seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+
+ seq_printf(s, "epoch %d\n", map->epoch);
seq_printf(s, "flags%s%s\n",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
- " NEARFULL" : "",
- (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
- " FULL" : "");
- for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+ (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
+ (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
+
+ for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
struct ceph_pg_pool_info *pool =
rb_entry(n, struct ceph_pg_pool_info, node);
- seq_printf(s, "pg_pool %llu pg_num %d / %d\n",
- (unsigned long long)pool->id, pool->pg_num,
- pool->pg_num_mask);
+
+ seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
+ pool->id, pool->pg_num, pool->pg_num_mask,
+ pool->read_tier, pool->write_tier);
}
- for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
- struct ceph_entity_addr *addr =
- &client->osdc.osdmap->osd_addr[i];
- int state = client->osdc.osdmap->osd_state[i];
+ for (i = 0; i < map->max_osd; i++) {
+ struct ceph_entity_addr *addr = &map->osd_addr[i];
+ int state = map->osd_state[i];
char sb[64];
- seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+ seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
i, ceph_pr_addr(&addr->in_addr),
- ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
- ceph_osdmap_state_str(sb, sizeof(sb), state));
+ ((map->osd_weight[i]*100) >> 16),
+ ceph_osdmap_state_str(sb, sizeof(sb), state),
+ ((ceph_get_primary_affinity(map, i)*100) >> 16));
+ }
+ for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool,
+ pg->pgid.seed);
+ for (i = 0; i < pg->pg_temp.len; i++)
+ seq_printf(s, "%s%d", (i == 0 ? "" : ","),
+ pg->pg_temp.osds[i]);
+ seq_printf(s, "]\n");
}
+ for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(n, struct ceph_pg_mapping, node);
+
+ seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
+ pg->pgid.seed, pg->primary_temp.osd);
+ }
+
return 0;
}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 30efc5c1862..4f55f9ce63f 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -919,6 +919,9 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
if (!bytes || cursor->page_offset)
return false; /* more bytes to process in the current page */
+ if (!cursor->resid)
+ return false; /* no more data */
+
/* Move on to the next page; offset is already at 0 */
BUG_ON(cursor->page_index >= cursor->page_count);
@@ -1004,6 +1007,9 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
if (!bytes || cursor->offset & ~PAGE_MASK)
return false; /* more bytes to process in the current page */
+ if (!cursor->resid)
+ return false; /* no more data */
+
/* Move on to the next page */
BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 82750f91586..b0dfce77656 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
case CEPH_OSD_OP_OMAPCLEAR:
case CEPH_OSD_OP_OMAPRMKEYS:
case CEPH_OSD_OP_OMAP_CMP:
+ case CEPH_OSD_OP_SETALLOCHINT:
case CEPH_OSD_OP_CLONERANGE:
case CEPH_OSD_OP_ASSERT_SRC_VERSION:
case CEPH_OSD_OP_SRC_CMPXATTR:
@@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
}
EXPORT_SYMBOL(osd_req_op_watch_init);
+void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
+ unsigned int which,
+ u64 expected_object_size,
+ u64 expected_write_size)
+{
+ struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+ CEPH_OSD_OP_SETALLOCHINT);
+
+ op->alloc_hint.expected_object_size = expected_object_size;
+ op->alloc_hint.expected_write_size = expected_write_size;
+
+ /*
+ * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
+ * not worth a feature bit. Set FAILOK per-op flag to make
+ * sure older osds don't trip over an unsupported opcode.
+ */
+ op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
+}
+EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
+
static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
struct ceph_osd_data *osd_data)
{
@@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->watch.ver = cpu_to_le64(src->watch.ver);
dst->watch.flag = src->watch.flag;
break;
+ case CEPH_OSD_OP_SETALLOCHINT:
+ dst->alloc_hint.expected_object_size =
+ cpu_to_le64(src->alloc_hint.expected_object_size);
+ dst->alloc_hint.expected_write_size =
+ cpu_to_le64(src->alloc_hint.expected_write_size);
+ break;
default:
pr_err("unsupported osd opcode %s\n",
ceph_osd_op_name(src->op));
@@ -688,7 +715,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
return 0;
}
+
dst->op = cpu_to_le16(src->op);
+ dst->flags = cpu_to_le32(src->flags);
dst->payload_len = cpu_to_le32(src->payload_len);
return request_data_len;
@@ -1304,7 +1333,7 @@ static int __map_request(struct ceph_osd_client *osdc,
{
struct ceph_pg pgid;
int acting[CEPH_PG_MAX_SIZE];
- int o = -1, num = 0;
+ int num, o;
int err;
bool was_paused;
@@ -1317,11 +1346,9 @@ static int __map_request(struct ceph_osd_client *osdc,
}
req->r_pgid = pgid;
- err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
- if (err > 0) {
- o = acting[0];
- num = err;
- }
+ num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
+ if (num < 0)
+ num = 0;
was_paused = req->r_paused;
req->r_paused = __req_should_be_paused(osdc, req);
@@ -2033,7 +2060,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
int skipped_map = 0;
dout("taking full map %u len %d\n", epoch, maplen);
- newmap = osdmap_decode(&p, p+maplen);
+ newmap = ceph_osdmap_decode(&p, p+maplen);
if (IS_ERR(newmap)) {
err = PTR_ERR(newmap);
goto bad;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index aade4a5c1c0..e632b5a52f5 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -343,7 +343,7 @@ bad:
/*
* rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
- * to a set of osds)
+ * to a set of osds) and primary_temp (explicit primary setting)
*/
static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
{
@@ -506,7 +506,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
kfree(pi);
}
-static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
+static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
{
u8 ev, cv;
unsigned len, num;
@@ -587,7 +587,7 @@ bad:
return -EINVAL;
}
-static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
{
struct ceph_pg_pool_info *pi;
u32 num, len;
@@ -633,6 +633,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
rb_erase(&pg->node, &map->pg_temp);
kfree(pg);
}
+ while (!RB_EMPTY_ROOT(&map->primary_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->primary_temp),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->primary_temp);
+ kfree(pg);
+ }
while (!RB_EMPTY_ROOT(&map->pg_pools)) {
struct ceph_pg_pool_info *pi =
rb_entry(rb_first(&map->pg_pools),
@@ -642,186 +649,516 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
kfree(map->osd_state);
kfree(map->osd_weight);
kfree(map->osd_addr);
+ kfree(map->osd_primary_affinity);
kfree(map);
}
/*
- * adjust max osd value. reallocate arrays.
+ * Adjust max_osd value, (re)allocate arrays.
+ *
+ * The new elements are properly initialized.
*/
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
{
u8 *state;
- struct ceph_entity_addr *addr;
u32 *weight;
+ struct ceph_entity_addr *addr;
+ int i;
- state = kcalloc(max, sizeof(*state), GFP_NOFS);
- addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
- weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
- if (state == NULL || addr == NULL || weight == NULL) {
+ state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
+ weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
+ addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
+ if (!state || !weight || !addr) {
kfree(state);
- kfree(addr);
kfree(weight);
+ kfree(addr);
+
return -ENOMEM;
}
- /* copy old? */
- if (map->osd_state) {
- memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
- memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
- memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
- kfree(map->osd_state);
- kfree(map->osd_addr);
- kfree(map->osd_weight);
+ for (i = map->max_osd; i < max; i++) {
+ state[i] = 0;
+ weight[i] = CEPH_OSD_OUT;
+ memset(addr + i, 0, sizeof(*addr));
}
map->osd_state = state;
map->osd_weight = weight;
map->osd_addr = addr;
+
+ if (map->osd_primary_affinity) {
+ u32 *affinity;
+
+ affinity = krealloc(map->osd_primary_affinity,
+ max*sizeof(*affinity), GFP_NOFS);
+ if (!affinity)
+ return -ENOMEM;
+
+ for (i = map->max_osd; i < max; i++)
+ affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+ map->osd_primary_affinity = affinity;
+ }
+
map->max_osd = max;
+
return 0;
}
+#define OSDMAP_WRAPPER_COMPAT_VER 7
+#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
+
/*
- * decode a full map.
+ * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps,
+ * to struct_v of the client_data section for new (v7 and above)
+ * osdmaps.
*/
-struct ceph_osdmap *osdmap_decode(void **p, void *end)
+static int get_osdmap_client_data_v(void **p, void *end,
+ const char *prefix, u8 *v)
{
- struct ceph_osdmap *map;
- u16 version;
- u32 len, max, i;
- int err = -EINVAL;
- void *start = *p;
- struct ceph_pg_pool_info *pi;
+ u8 struct_v;
+
+ ceph_decode_8_safe(p, end, struct_v, e_inval);
+ if (struct_v >= 7) {
+ u8 struct_compat;
+
+ ceph_decode_8_safe(p, end, struct_compat, e_inval);
+ if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
+ pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n",
+ struct_v, struct_compat,
+ OSDMAP_WRAPPER_COMPAT_VER, prefix);
+ return -EINVAL;
+ }
+ *p += 4; /* ignore wrapper struct_len */
+
+ ceph_decode_8_safe(p, end, struct_v, e_inval);
+ ceph_decode_8_safe(p, end, struct_compat, e_inval);
+ if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
+ pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n",
+ struct_v, struct_compat,
+ OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
+ return -EINVAL;
+ }
+ *p += 4; /* ignore client data struct_len */
+ } else {
+ u16 version;
+
+ *p -= 1;
+ ceph_decode_16_safe(p, end, version, e_inval);
+ if (version < 6) {
+ pr_warning("got v %d < 6 of %s ceph_osdmap\n", version,
+ prefix);
+ return -EINVAL;
+ }
- dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+ /* old osdmap enconding */
+ struct_v = 0;
+ }
- map = kzalloc(sizeof(*map), GFP_NOFS);
- if (map == NULL)
- return ERR_PTR(-ENOMEM);
- map->pg_temp = RB_ROOT;
+ *v = struct_v;
+ return 0;
- ceph_decode_16_safe(p, end, version, bad);
- if (version > 6) {
- pr_warning("got unknown v %d > 6 of osdmap\n", version);
- goto bad;
+e_inval:
+ return -EINVAL;
+}
+
+static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
+ bool incremental)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg_pool_info *pi;
+ u64 pool;
+ int ret;
+
+ ceph_decode_64_safe(p, end, pool, e_inval);
+
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (!incremental || !pi) {
+ pi = kzalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi)
+ return -ENOMEM;
+
+ pi->id = pool;
+
+ ret = __insert_pg_pool(&map->pg_pools, pi);
+ if (ret) {
+ kfree(pi);
+ return ret;
+ }
+ }
+
+ ret = decode_pool(p, end, pi);
+ if (ret)
+ return ret;
}
- if (version < 6) {
- pr_warning("got old v %d < 6 of osdmap\n", version);
- goto bad;
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pools(p, end, map, false);
+}
+
+static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pools(p, end, map, true);
+}
+
+static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
+ bool incremental)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg pgid;
+ u32 len, i;
+ int ret;
+
+ ret = ceph_decode_pgid(p, end, &pgid);
+ if (ret)
+ return ret;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+
+ ret = __remove_pg_mapping(&map->pg_temp, pgid);
+ BUG_ON(!incremental && ret != -ENOENT);
+
+ if (!incremental || len > 0) {
+ struct ceph_pg_mapping *pg;
+
+ ceph_decode_need(p, end, len*sizeof(u32), e_inval);
+
+ if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+ return -EINVAL;
+
+ pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS);
+ if (!pg)
+ return -ENOMEM;
+
+ pg->pgid = pgid;
+ pg->pg_temp.len = len;
+ for (i = 0; i < len; i++)
+ pg->pg_temp.osds[i] = ceph_decode_32(p);
+
+ ret = __insert_pg_mapping(pg, &map->pg_temp);
+ if (ret) {
+ kfree(pg);
+ return ret;
+ }
+ }
}
- ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pg_temp(p, end, map, false);
+}
+
+static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_pg_temp(p, end, map, true);
+}
+
+static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
+ bool incremental)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ struct ceph_pg pgid;
+ u32 osd;
+ int ret;
+
+ ret = ceph_decode_pgid(p, end, &pgid);
+ if (ret)
+ return ret;
+
+ ceph_decode_32_safe(p, end, osd, e_inval);
+
+ ret = __remove_pg_mapping(&map->primary_temp, pgid);
+ BUG_ON(!incremental && ret != -ENOENT);
+
+ if (!incremental || osd != (u32)-1) {
+ struct ceph_pg_mapping *pg;
+
+ pg = kzalloc(sizeof(*pg), GFP_NOFS);
+ if (!pg)
+ return -ENOMEM;
+
+ pg->pgid = pgid;
+ pg->primary_temp.osd = osd;
+
+ ret = __insert_pg_mapping(pg, &map->primary_temp);
+ if (ret) {
+ kfree(pg);
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
+{
+ return __decode_primary_temp(p, end, map, false);
+}
+
+static int decode_new_primary_temp(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ return __decode_primary_temp(p, end, map, true);
+}
+
+u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
+{
+ BUG_ON(osd >= map->max_osd);
+
+ if (!map->osd_primary_affinity)
+ return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+
+ return map->osd_primary_affinity[osd];
+}
+
+static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
+{
+ BUG_ON(osd >= map->max_osd);
+
+ if (!map->osd_primary_affinity) {
+ int i;
+
+ map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32),
+ GFP_NOFS);
+ if (!map->osd_primary_affinity)
+ return -ENOMEM;
+
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_primary_affinity[i] =
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
+ }
+
+ map->osd_primary_affinity[osd] = aff;
+
+ return 0;
+}
+
+static int decode_primary_affinity(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ u32 len, i;
+
+ ceph_decode_32_safe(p, end, len, e_inval);
+ if (len == 0) {
+ kfree(map->osd_primary_affinity);
+ map->osd_primary_affinity = NULL;
+ return 0;
+ }
+ if (len != map->max_osd)
+ goto e_inval;
+
+ ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
+
+ for (i = 0; i < map->max_osd; i++) {
+ int ret;
+
+ ret = set_primary_affinity(map, i, ceph_decode_32(p));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int decode_new_primary_affinity(void **p, void *end,
+ struct ceph_osdmap *map)
+{
+ u32 n;
+
+ ceph_decode_32_safe(p, end, n, e_inval);
+ while (n--) {
+ u32 osd, aff;
+ int ret;
+
+ ceph_decode_32_safe(p, end, osd, e_inval);
+ ceph_decode_32_safe(p, end, aff, e_inval);
+
+ ret = set_primary_affinity(map, osd, aff);
+ if (ret)
+ return ret;
+
+ pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+/*
+ * decode a full map.
+ */
+static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
+{
+ u8 struct_v;
+ u32 epoch = 0;
+ void *start = *p;
+ u32 max;
+ u32 len, i;
+ int err;
+
+ dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
+
+ err = get_osdmap_client_data_v(p, end, "full", &struct_v);
+ if (err)
+ goto bad;
+
+ /* fsid, epoch, created, modified */
+ ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
+ sizeof(map->created) + sizeof(map->modified), e_inval);
ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
- map->epoch = ceph_decode_32(p);
+ epoch = map->epoch = ceph_decode_32(p);
ceph_decode_copy(p, &map->created, sizeof(map->created));
ceph_decode_copy(p, &map->modified, sizeof(map->modified));
- ceph_decode_32_safe(p, end, max, bad);
- while (max--) {
- ceph_decode_need(p, end, 8 + 2, bad);
- err = -ENOMEM;
- pi = kzalloc(sizeof(*pi), GFP_NOFS);
- if (!pi)
- goto bad;
- pi->id = ceph_decode_64(p);
- err = __decode_pool(p, end, pi);
- if (err < 0) {
- kfree(pi);
- goto bad;
- }
- __insert_pg_pool(&map->pg_pools, pi);
- }
+ /* pools */
+ err = decode_pools(p, end, map);
+ if (err)
+ goto bad;
- err = __decode_pool_names(p, end, map);
- if (err < 0) {
- dout("fail to decode pool names");
+ /* pool_name */
+ err = decode_pool_names(p, end, map);
+ if (err)
goto bad;
- }
- ceph_decode_32_safe(p, end, map->pool_max, bad);
+ ceph_decode_32_safe(p, end, map->pool_max, e_inval);
- ceph_decode_32_safe(p, end, map->flags, bad);
+ ceph_decode_32_safe(p, end, map->flags, e_inval);
- max = ceph_decode_32(p);
+ /* max_osd */
+ ceph_decode_32_safe(p, end, max, e_inval);
/* (re)alloc osd arrays */
err = osdmap_set_max_osd(map, max);
- if (err < 0)
+ if (err)
goto bad;
- dout("osdmap_decode max_osd = %d\n", map->max_osd);
- /* osds */
- err = -EINVAL;
+ /* osd_state, osd_weight, osd_addrs->client_addr */
ceph_decode_need(p, end, 3*sizeof(u32) +
map->max_osd*(1 + sizeof(*map->osd_weight) +
- sizeof(*map->osd_addr)), bad);
- *p += 4; /* skip length field (should match max) */
+ sizeof(*map->osd_addr)), e_inval);
+
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
ceph_decode_copy(p, map->osd_state, map->max_osd);
- *p += 4; /* skip length field (should match max) */
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
for (i = 0; i < map->max_osd; i++)
map->osd_weight[i] = ceph_decode_32(p);
- *p += 4; /* skip length field (should match max) */
+ if (ceph_decode_32(p) != map->max_osd)
+ goto e_inval;
+
ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
for (i = 0; i < map->max_osd; i++)
ceph_decode_addr(&map->osd_addr[i]);
/* pg_temp */
- ceph_decode_32_safe(p, end, len, bad);
- for (i = 0; i < len; i++) {
- int n, j;
- struct ceph_pg pgid;
- struct ceph_pg_mapping *pg;
+ err = decode_pg_temp(p, end, map);
+ if (err)
+ goto bad;
- err = ceph_decode_pgid(p, end, &pgid);
+ /* primary_temp */
+ if (struct_v >= 1) {
+ err = decode_primary_temp(p, end, map);
if (err)
goto bad;
- ceph_decode_need(p, end, sizeof(u32), bad);
- n = ceph_decode_32(p);
- err = -EINVAL;
- if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
- goto bad;
- ceph_decode_need(p, end, n * sizeof(u32), bad);
- err = -ENOMEM;
- pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
- if (!pg)
- goto bad;
- pg->pgid = pgid;
- pg->len = n;
- for (j = 0; j < n; j++)
- pg->osds[j] = ceph_decode_32(p);
+ }
- err = __insert_pg_mapping(pg, &map->pg_temp);
+ /* primary_affinity */
+ if (struct_v >= 2) {
+ err = decode_primary_affinity(p, end, map);
if (err)
goto bad;
- dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed,
- len);
+ } else {
+ /* XXX can this happen? */
+ kfree(map->osd_primary_affinity);
+ map->osd_primary_affinity = NULL;
}
/* crush */
- ceph_decode_32_safe(p, end, len, bad);
- dout("osdmap_decode crush len %d from off 0x%x\n", len,
- (int)(*p - start));
- ceph_decode_need(p, end, len, bad);
- map->crush = crush_decode(*p, end);
- *p += len;
+ ceph_decode_32_safe(p, end, len, e_inval);
+ map->crush = crush_decode(*p, min(*p + len, end));
if (IS_ERR(map->crush)) {
err = PTR_ERR(map->crush);
map->crush = NULL;
goto bad;
}
+ *p += len;
- /* ignore the rest of the map */
+ /* ignore the rest */
*p = end;
- dout("osdmap_decode done %p %p\n", *p, end);
- return map;
+ dout("full osdmap epoch %d max_osd %d\n",