diff options
author | Sage Weil <sage@newdream.net> | 2009-10-06 11:31:12 -0700 |
---|---|---|
committer | Sage Weil <sage@newdream.net> | 2009-10-06 11:31:12 -0700 |
commit | a8599bd821d084d04a3290fffae1071624ec00ea (patch) | |
tree | e2323ebfad9a49fdb579ff87d1ec3a8694e8c0f5 /fs | |
parent | ba75bb98cfb93b62c54af25bf67ff90857264bbe (diff) |
ceph: capability management
The Ceph metadata servers control client access to inode metadata and
file data by issuing capabilities, granting clients permission to read
and/or write both inode field and file data to OSDs (storage nodes).
Each capability consists of a set of bits indicating which operations
are allowed.
If the client holds a *_SHARED cap, the client has a coherent value
that can be safely read from the cached inode.
In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the client
is allowed to change inode attributes (e.g., file size, mtime), note
its dirty state in the ceph_cap, and asynchronously flush that
metadata change to the MDS.
In the event of a conflicting operation (perhaps by another client),
the MDS will revoke the conflicting client capabilities.
In order for a client to cache an inode, it must hold a capability
with at least one MDS server. When inodes are released, release
notifications are batched and periodically sent en masse to the MDS
cluster to release server state.
Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ceph/caps.c | 2830 |
1 files changed, 2830 insertions, 0 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c new file mode 100644 index 00000000000..5c7d0e9bbb7 --- /dev/null +++ b/fs/ceph/caps.c @@ -0,0 +1,2830 @@ +#include "ceph_debug.h" + +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <linux/wait.h> + +#include "super.h" +#include "decode.h" +#include "messenger.h" + +/* + * Capability management + * + * The Ceph metadata servers control client access to inode metadata + * and file data by issuing capabilities, granting clients permission + * to read and/or write both inode field and file data to OSDs + * (storage nodes). Each capability consists of a set of bits + * indicating which operations are allowed. + * + * If the client holds a *_SHARED cap, the client has a coherent value + * that can be safely read from the cached inode. + * + * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the + * client is allowed to change inode attributes (e.g., file size, + * mtime), note its dirty state in the ceph_cap, and asynchronously + * flush that metadata change to the MDS. + * + * In the event of a conflicting operation (perhaps by another + * client), the MDS will revoke the conflicting client capabilities. + * + * In order for a client to cache an inode, it must hold a capability + * with at least one MDS server. When inodes are released, release + * notifications are batched and periodically sent en masse to the MDS + * cluster to release server state. + */ + + +/* + * Generate readable cap strings for debugging output. + */ +#define MAX_CAP_STR 20 +static char cap_str[MAX_CAP_STR][40]; +static DEFINE_SPINLOCK(cap_str_lock); +static int last_cap_str; + +static char *gcap_string(char *s, int c) +{ + if (c & CEPH_CAP_GSHARED) + *s++ = 's'; + if (c & CEPH_CAP_GEXCL) + *s++ = 'x'; + if (c & CEPH_CAP_GCACHE) + *s++ = 'c'; + if (c & CEPH_CAP_GRD) + *s++ = 'r'; + if (c & CEPH_CAP_GWR) + *s++ = 'w'; + if (c & CEPH_CAP_GBUFFER) + *s++ = 'b'; + if (c & CEPH_CAP_GLAZYIO) + *s++ = 'l'; + return s; +} + +const char *ceph_cap_string(int caps) +{ + int i; + char *s; + int c; + + spin_lock(&cap_str_lock); + i = last_cap_str++; + if (last_cap_str == MAX_CAP_STR) + last_cap_str = 0; + spin_unlock(&cap_str_lock); + + s = cap_str[i]; + + if (caps & CEPH_CAP_PIN) + *s++ = 'p'; + + c = (caps >> CEPH_CAP_SAUTH) & 3; + if (c) { + *s++ = 'A'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SLINK) & 3; + if (c) { + *s++ = 'L'; + s = gcap_string(s, c); + } + + c = (caps >> CEPH_CAP_SXATTR) & 3; + if (c) { + *s++ = 'X'; + s = gcap_string(s, c); + } + + c = caps >> CEPH_CAP_SFILE; + if (c) { + *s++ = 'F'; + s = gcap_string(s, c); + } + + if (s == cap_str[i]) + *s++ = '-'; + *s = 0; + return cap_str[i]; +} + +/* + * Cap reservations + * + * Maintain a global pool of preallocated struct ceph_caps, referenced + * by struct ceph_caps_reservations. This ensures that we preallocate + * memory needed to successfully process an MDS response. (If an MDS + * sends us cap information and we fail to process it, we will have + * problems due to the client and MDS being out of sync.) + * + * Reservations are 'owned' by a ceph_cap_reservation context. + */ +static spinlock_t caps_list_lock; +static struct list_head caps_list; /* unused (reserved or unreserved) */ +static int caps_total_count; /* total caps allocated */ +static int caps_use_count; /* in use */ +static int caps_reserve_count; /* unused, reserved */ +static int caps_avail_count; /* unused, unreserved */ + +void __init ceph_caps_init(void) +{ + INIT_LIST_HEAD(&caps_list); + spin_lock_init(&caps_list_lock); +} + +void ceph_caps_finalize(void) +{ + struct ceph_cap *cap; + + spin_lock(&caps_list_lock); + while (!list_empty(&caps_list)) { + cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + caps_total_count = 0; + caps_avail_count = 0; + caps_use_count = 0; + caps_reserve_count = 0; + spin_unlock(&caps_list_lock); +} + +int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) +{ + int i; + struct ceph_cap *cap; + int have; + int alloc = 0; + LIST_HEAD(newcaps); + int ret = 0; + + dout("reserve caps ctx=%p need=%d\n", ctx, need); + + /* first reserve any caps that are already allocated */ + spin_lock(&caps_list_lock); + if (caps_avail_count >= need) + have = need; + else + have = caps_avail_count; + caps_avail_count -= have; + caps_reserve_count += have; + BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + + caps_avail_count); + spin_unlock(&caps_list_lock); + + for (i = have; i < need; i++) { + cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + if (!cap) { + ret = -ENOMEM; + goto out_alloc_count; + } + list_add(&cap->caps_item, &newcaps); + alloc++; + } + BUG_ON(have + alloc != need); + + spin_lock(&caps_list_lock); + caps_total_count += alloc; + caps_reserve_count += alloc; + list_splice(&newcaps, &caps_list); + + BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + + caps_avail_count); + spin_unlock(&caps_list_lock); + + ctx->count = need; + dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", + ctx, caps_total_count, caps_use_count, caps_reserve_count, + caps_avail_count); + return 0; + +out_alloc_count: + /* we didn't manage to reserve as much as we needed */ + pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have); + return ret; +} + +int ceph_unreserve_caps(struct ceph_cap_reservation *ctx) +{ + dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); + if (ctx->count) { + spin_lock(&caps_list_lock); + BUG_ON(caps_reserve_count < ctx->count); + caps_reserve_count -= ctx->count; + caps_avail_count += ctx->count; + ctx->count = 0; + dout("unreserve caps %d = %d used + %d resv + %d avail\n", + caps_total_count, caps_use_count, caps_reserve_count, + caps_avail_count); + BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + + caps_avail_count); + spin_unlock(&caps_list_lock); + } + return 0; +} + +static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) +{ + struct ceph_cap *cap = NULL; + + /* temporary, until we do something about cap import/export */ + if (!ctx) + return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); + + spin_lock(&caps_list_lock); + dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", + ctx, ctx->count, caps_total_count, caps_use_count, + caps_reserve_count, caps_avail_count); + BUG_ON(!ctx->count); + BUG_ON(ctx->count > caps_reserve_count); + BUG_ON(list_empty(&caps_list)); + + ctx->count--; + caps_reserve_count--; + caps_use_count++; + + cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); + list_del(&cap->caps_item); + + BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + + caps_avail_count); + spin_unlock(&caps_list_lock); + return cap; +} + +static void put_cap(struct ceph_cap *cap, + struct ceph_cap_reservation *ctx) +{ + spin_lock(&caps_list_lock); + dout("put_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", + ctx, ctx ? ctx->count : 0, caps_total_count, caps_use_count, + caps_reserve_count, caps_avail_count); + caps_use_count--; + /* + * Keep some preallocated caps around, at least enough to do a + * readdir (which needs to preallocate lots of them), to avoid + * lots of free/alloc churn. + */ + if (caps_avail_count >= caps_reserve_count + + ceph_client(cap->ci->vfs_inode.i_sb)->mount_args.max_readdir) { + caps_total_count--; + kmem_cache_free(ceph_cap_cachep, cap); + } else { + if (ctx) { + ctx->count++; + caps_reserve_count++; + } else { + caps_avail_count++; + } + list_add(&cap->caps_item, &caps_list); + } + + BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + + caps_avail_count); + spin_unlock(&caps_list_lock); +} + +void ceph_reservation_status(struct ceph_client *client, + int *total, int *avail, int *used, int *reserved) +{ + if (total) + *total = caps_total_count; + if (avail) + *avail = caps_avail_count; + if (used) + *used = caps_use_count; + if (reserved) + *reserved = caps_reserve_count; +} + +/* + * Find ceph_cap for given mds, if any. + * + * Called with i_lock held. + */ +static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + struct rb_node *n = ci->i_caps.rb_node; + + while (n) { + cap = rb_entry(n, struct ceph_cap, ci_node); + if (mds < cap->mds) + n = n->rb_left; + else if (mds > cap->mds) + n = n->rb_right; + else + return cap; + } + return NULL; +} + +/* + * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else + * -1. + */ +static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) +{ + struct ceph_cap *cap; + int mds = -1; + struct rb_node *p; + + /* prefer mds with WR|WRBUFFER|EXCL caps */ + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + mds = cap->mds; + if (mseq) + *mseq = cap->mseq; + if (cap->issued & (CEPH_CAP_FILE_WR | + CEPH_CAP_FILE_BUFFER | + CEPH_CAP_FILE_EXCL)) + break; + } + return mds; +} + +int ceph_get_cap_mds(struct inode *inode) +{ + int mds; + spin_lock(&inode->i_lock); + mds = __ceph_get_cap_mds(ceph_inode(inode), NULL); + spin_unlock(&inode->i_lock); + return mds; +} + +/* + * Called under i_lock. + */ +static void __insert_cap_node(struct ceph_inode_info *ci, + struct ceph_cap *new) +{ + struct rb_node **p = &ci->i_caps.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap *cap = NULL; + + while (*p) { + parent = *p; + cap = rb_entry(parent, struct ceph_cap, ci_node); + if (new->mds < cap->mds) + p = &(*p)->rb_left; + else if (new->mds > cap->mds) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->ci_node, parent, p); + rb_insert_color(&new->ci_node, &ci->i_caps); +} + +/* + * (re)set cap hold timeouts, which control the delayed release + * of unused caps back to the MDS. Should be called on cap use. + */ +static void __cap_set_timeouts(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + struct ceph_mount_args *ma = &mdsc->client->mount_args; + + ci->i_hold_caps_min = round_jiffies(jiffies + + ma->caps_wanted_delay_min * HZ); + ci->i_hold_caps_max = round_jiffies(jiffies + + ma->caps_wanted_delay_max * HZ); + dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, + ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); +} + +/* + * (Re)queue cap at the end of the delayed cap release list. + * + * If I_FLUSH is set, leave the inode at the front of the list. + * + * Caller holds i_lock + * -> we take mdsc->cap_delay_lock + */ +static void __cap_delay_requeue(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + __cap_set_timeouts(mdsc, ci); + dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, + ci->i_ceph_flags, ci->i_hold_caps_max); + if (!mdsc->stopping) { + spin_lock(&mdsc->cap_delay_lock); + if (!list_empty(&ci->i_cap_delay_list)) { + if (ci->i_ceph_flags & CEPH_I_FLUSH) + goto no_change; + list_del_init(&ci->i_cap_delay_list); + } + list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); +no_change: + spin_unlock(&mdsc->cap_delay_lock); + } +} + +/* + * Queue an inode for immediate writeback. Mark inode with I_FLUSH, + * indicating we should send a cap message to flush dirty metadata + * asap, and move to the front of the delayed cap list. + */ +static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode); + spin_lock(&mdsc->cap_delay_lock); + ci->i_ceph_flags |= CEPH_I_FLUSH; + if (!list_empty(&ci->i_cap_delay_list)) + list_del_init(&ci->i_cap_delay_list); + list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Cancel delayed work on cap. + * + * Caller must hold i_lock. + */ +static void __cap_delay_cancel(struct ceph_mds_client *mdsc, + struct ceph_inode_info *ci) +{ + dout("__cap_delay_cancel %p\n", &ci->vfs_inode); + if (list_empty(&ci->i_cap_delay_list)) + return; + spin_lock(&mdsc->cap_delay_lock); + list_del_init(&ci->i_cap_delay_list); + spin_unlock(&mdsc->cap_delay_lock); +} + +/* + * Common issue checks for add_cap, handle_cap_grant. + */ +static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, + unsigned issued) +{ + unsigned had = __ceph_caps_issued(ci, NULL); + + /* + * Each time we receive FILE_CACHE anew, we increment + * i_rdcache_gen. + */ + if ((issued & CEPH_CAP_FILE_CACHE) && + (had & CEPH_CAP_FILE_CACHE) == 0) + ci->i_rdcache_gen++; + + /* + * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * don't know what happened to this directory while we didn't + * have the cap. + */ + if ((issued & CEPH_CAP_FILE_SHARED) && + (had & CEPH_CAP_FILE_SHARED) == 0) { + ci->i_shared_gen++; + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + } + } +} + +/* + * Add a capability under the given MDS session. + * + * Caller should hold session snap_rwsem (read) and s_mutex. + * + * @fmode is the open file mode, if we are opening a file, otherwise + * it is < 0. (This is so we can atomically add the cap and add an + * open file reference to it.) + */ +int ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap_reservation *caps_reservation) +{ + struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *new_cap = NULL; + struct ceph_cap *cap; + int mds = session->s_mds; + int actual_wanted; + + dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, + session->s_mds, cap_id, ceph_cap_string(issued), seq); + + /* + * If we are opening the file, include file mode wanted bits + * in wanted. + */ + if (fmode >= 0) + wanted |= ceph_caps_for_mode(fmode); + +retry: + spin_lock(&inode->i_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (new_cap) { + cap = new_cap; + new_cap = NULL; + } else { + spin_unlock(&inode->i_lock); + new_cap = get_cap(caps_reservation); + if (new_cap == NULL) + return -ENOMEM; + goto retry; + } + + cap->issued = 0; + cap->implemented = 0; + cap->mds = mds; + cap->mds_wanted = 0; + + cap->ci = ci; + __insert_cap_node(ci, cap); + + /* clear out old exporting info? (i.e. on cap import) */ + if (ci->i_cap_exporting_mds == mds) { + ci->i_cap_exporting_issued = 0; + ci->i_cap_exporting_mseq = 0; + ci->i_cap_exporting_mds = -1; + } + + /* add to session cap list */ + cap->session = session; + spin_lock(&session->s_cap_lock); + list_add_tail(&cap->session_caps, &session->s_caps); + session->s_nr_caps++; + spin_unlock(&session->s_cap_lock); + } + + if (!ci->i_snap_realm) { + /* + * add this inode to the appropriate snap realm + */ + struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, + realmino); + if (realm) { + ceph_get_snap_realm(mdsc, realm); + spin_lock(&realm->inodes_with_caps_lock); + ci->i_snap_realm = realm; + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + spin_unlock(&realm->inodes_with_caps_lock); + } else { + pr_err("ceph_add_cap: couldn't find snap realm %llx\n", + realmino); + } + } + + __check_cap_issue(ci, cap, issued); + + /* + * If we are issued caps we don't want, or the mds' wanted + * value appears to be off, queue a check so we'll release + * later and/or update the mds wanted value. + */ + actual_wanted = __ceph_caps_wanted(ci); + if ((wanted & ~actual_wanted) || + (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { + dout(" issued %s, mds wanted %s, actual %s, queueing\n", + ceph_cap_string(issued), ceph_cap_string(wanted), + ceph_cap_string(actual_wanted)); + __cap_delay_requeue(mdsc, ci); + } + + if (flags & CEPH_CAP_FLAG_AUTH) + ci->i_auth_cap = cap; + else if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", + inode, ceph_vinop(inode), cap, ceph_cap_string(issued), + ceph_cap_string(issued|cap->issued), seq, mds); + cap->cap_id = cap_id; + cap->issued = issued; + cap->implemented |= issued; + cap->mds_wanted |= wanted; + cap->seq = seq; + cap->issue_seq = seq; + cap->mseq = mseq; + cap->gen = session->s_cap_gen; + + if (fmode >= 0) + __ceph_get_fmode(ci, fmode); + spin_unlock(&inode->i_lock); + wake_up(&ci->i_cap_wq); + return 0; +} + +/* + * Return true if cap has not timed out and belongs to the current + * generation of the MDS session (i.e. has not gone 'stale' due to + * us losing touch with the mds). + */ +static int __cap_is_valid(struct ceph_cap *cap) +{ + unsigned long ttl; + u32 gen; + + spin_lock(&cap->session->s_cap_lock); + gen = cap->session->s_cap_gen; + ttl = cap->session->s_cap_ttl; + spin_unlock(&cap->session->s_cap_lock); + + if (cap->gen < gen || time_after_eq(jiffies, ttl)) { + dout("__cap_is_valid %p cap %p issued %s " + "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode, + cap, ceph_cap_string(cap->issued), cap->gen, gen); + return 0; + } + + return 1; +} + +/* + * Return set of valid cap bits issued to us. Note that caps time + * out, and may be invalidated in bulk if the client session times out + * and session->s_cap_gen is bumped. + */ +int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) +{ + int have = ci->i_snap_caps; + struct ceph_cap *cap; + struct rb_node *p; + + if (implemented) + *implemented = 0; + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + dout("__ceph_caps_issued %p cap %p issued %s\n", + &ci->vfs_inode, cap, ceph_cap_string(cap->issued)); + have |= cap->issued; + if (implemented) + *implemented |= cap->implemented; + } + return have; +} + +/* + * Get cap bits issued by caps other than @ocap + */ +int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) +{ + int have = ci->i_snap_caps; + struct ceph_cap *cap; + struct rb_node *p; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (cap == ocap) + continue; + if (!__cap_is_valid(cap)) + continue; + have |= cap->issued; + } + return have; +} + +/* + * Move a cap to the end of the LRU (oldest caps at list head, newest + * at list tail). + */ +static void __touch_cap(struct ceph_cap *cap) +{ + struct ceph_mds_session *s = cap->session; + + dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, + s->s_mds); + spin_lock(&s->s_cap_lock); + list_move_tail(&cap->session_caps, &s->s_caps); + spin_unlock(&s->s_cap_lock); +} + +/* + * Check if we hold the given mask. If so, move the cap(s) to the + * front of their respective LRUs. (This is the preferred way for + * callers to check for caps they want.) + */ +int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) +{ + struct ceph_cap *cap; + struct rb_node *p; + int have = ci->i_snap_caps; + + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask %p snap issued %s" + " (mask %s)\n", &ci->vfs_inode, + ceph_cap_string(have), + ceph_cap_string(mask)); + return 1; + } + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + if ((cap->issued & mask) == mask) { + dout("__ceph_caps_issued_mask %p cap %p issued %s" + " (mask %s)\n", &ci->vfs_inode, cap, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) + __touch_cap(cap); + return 1; + } + + /* does a combination of caps satisfy mask? */ + have |= cap->issued; + if ((have & mask) == mask) { + dout("__ceph_caps_issued_mask %p combo issued %s" + " (mask %s)\n", &ci->vfs_inode, + ceph_cap_string(cap->issued), + ceph_cap_string(mask)); + if (touch) { + struct rb_node *q; + + /* touch this + preceeding caps */ + __touch_cap(cap); + for (q = rb_first(&ci->i_caps); q != p; + q = rb_next(q)) { + cap = rb_entry(q, struct ceph_cap, + ci_node); + if (!__cap_is_valid(cap)) + continue; + __touch_cap(cap); + } + } + return 1; + } + } + + return 0; +} + +/* + * Return true if mask caps are currently being revoked by an MDS. + */ +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + struct rb_node *p; + int ret = 0; + + spin_lock(&inode->i_lock); + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (__cap_is_valid(cap) && + (cap->implemented & ~cap->issued & mask)) { + ret = 1; + break; + } + } + spin_unlock(&inode->i_lock); + dout("ceph_caps_revoking %p %s = %d\n", inode, + ceph_cap_string(mask), ret); + return ret; +} + +int __ceph_caps_used(struct ceph_inode_info *ci) +{ + int used = 0; + if (ci->i_pin_ref) + used |= CEPH_CAP_PIN; + if (ci->i_rd_ref) + used |= CEPH_CAP_FILE_RD; + if (ci->i_rdcache_ref || ci->i_rdcache_gen) + used |= CEPH_CAP_FILE_CACHE; + if (ci->i_wr_ref) + used |= CEPH_CAP_FILE_WR; + if (ci->i_wrbuffer_ref) + used |= CEPH_CAP_FILE_BUFFER; + return used; +} + +/* + * wanted, by virtue of open file modes + */ +int __ceph_caps_file_wanted(struct ceph_inode_info *ci) +{ + int want = 0; + int mode; + for (mode = 0; mode < 4; mode++) + if (ci->i_nr_by_mode[mode]) + want |= ceph_caps_for_mode(mode); + return want; +} + +/* + * Return caps we have registered with the MDS(s) as 'wanted'. + */ +int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) +{ + struct ceph_cap *cap; + struct rb_node *p; + int mds_wanted = 0; + + for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { + cap = rb_entry(p, struct ceph_cap, ci_node); + if (!__cap_is_valid(cap)) + continue; + mds_wanted |= cap->mds_wanted; + } + return mds_wanted; +} + +/* + * called under i_lock + */ +static int __ceph_is_any_caps(struct ceph_inode_info *ci) +{ + return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; +} + +/* + * caller should hold i_lock, and session s_mutex. + * returns true if this is the last cap. if so, caller should iput. + */ +void __ceph_remove_cap(struct ceph_cap *cap, + struct ceph_cap_reservation *ctx) +{ + struct ceph_mds_session *session = cap->session; + struct ceph_inode_info *ci = cap->ci; + struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + + /* remove from session list */ + spin_lock(&session->s_cap_lock); + list_del_init(&cap->session_caps); + session->s_nr_caps--; + spin_unlock(&session->s_cap_lock); + + /* remove from inode list */ + rb_erase(&cap->ci_node, &ci->i_caps); + cap->session = NULL; + if (ci->i_auth_cap == cap) + ci->i_auth_cap = NULL; + + put_cap(cap, ctx); + + if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { + struct ceph_snap_realm *realm = ci->i_snap_realm; + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm_counter++; + ci->i_snap_realm = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } + if (!__ceph_is_any_real_caps(ci)) + __cap_delay_cancel(mdsc, ci); +} + +/* + * Build and send a cap message to the given MDS. + * + * Caller should be holding s_mutex. + */ +static int send_cap_msg(struct ceph_mds_session *session, + u64 ino, u64 cid, int op, + int caps, int wanted, int dirty, + u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, + u64 size, u64 max_size, + struct timespec *mtime, struct timespec *atime, + u64 time_warp_seq, + uid_t uid, gid_t gid, mode_t mode, + u64 xattr_version, + struct ceph_buffer *xattrs_buf, + u64 follows) +{ + struct ceph_mds_caps *fc; + struct ceph_msg *msg; + + dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" + " seq %u/%u mseq %u follows %lld size %llu/%llu" + " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), + cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), + ceph_cap_string(dirty), + seq, issue_seq, mseq, follows, size, max_size, + xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); + if (IS_ERR(msg)) + return PTR_ERR(msg); + + fc = msg->front.iov_base; + + memset(fc, 0, sizeof(*fc)); + + fc->cap_id = cpu_to_le64(cid); + fc->op = cpu_to_le32(op); + fc->seq = cpu_to_le32(seq); + fc->client_tid = cpu_to_le64(flush_tid); + fc->issue_seq = cpu_to_le32(issue_seq); + fc->migrate_seq = cpu_to_le32(mseq); + fc->caps = cpu_to_le32(caps); + fc->wanted = cpu_to_le32(wanted); + fc->dirty = cpu_to_le32(dirty); + fc->ino = cpu_to_le64(ino); + fc->snap_follows = cpu_to_le64(follows); + + fc->size = cpu_to_le64(size); + fc->max_size = cpu_to_le64(max_size); + if (mtime) + ceph_encode_timespec(&fc->mtime, mtime); + if (atime) + ceph_encode_timespec(&fc->atime, atime); + fc->time_warp_seq = cpu_to_le32(time_warp_seq); + + fc->uid = cpu_to_le32(uid); + fc->gid = cpu_to_le32(gid); + fc->mode = cpu_to_le32(mode); + + fc->xattr_version = cpu_to_le64(xattr_version); + if (xattrs_buf) { + msg->middle = ceph_buffer_get(xattrs_buf); + fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len); + msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len); + } + + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * Queue cap releases when an inode is dropped from our + * cache. + */ +void ceph_queue_caps_release(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct rb_node *p; + + spin_lock(&inode->i_lock); + p = rb_first(&ci->i_caps); + while (p) { + struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); + struct ceph_mds_session *session = cap->session; + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + + spin_lock(&session->s_cap_lock); + BUG_ON(!session->s_num_cap_releases); + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + + dout(" adding %p release to mds%d msg %p (%d left)\n", + inode, session->s_mds, msg, session->s_num_cap_releases); + + BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); + head = msg->front.iov_base; + head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(ceph_ino(inode)); + item->cap_id = cpu_to_le64(cap->cap_id); + item->migrate_seq = cpu_to_le32(cap->mseq); + item->seq = cpu_to_le32(cap->issue_seq); + + session->s_num_cap_releases--; + + msg->front.iov_len += sizeof(*item); + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + dout(" release msg %p full\n", msg); + list_move_tail(&msg->list_head, + &session->s_cap_releases_done); + } else { + dout(" release msg %p at %d/%d (%d)\n", msg, + (int)le32_to_cpu(head->num), + (int)CEPH_CAPS_PER_RELEASE, + (int)msg->front.iov_len); + } + spin_unlock(&session->s_cap_lock); + p = rb_next(p); + __ceph_remove_cap(cap, NULL); + + } + spin_unlock(&inode->i_lock); +} + +/* + * Send a cap msg on the given inode. Update our caps state, then + * drop i_lock and send the message. + * + * Make note of max_size reported/requested from mds, revoked caps + * that have now been implemented. + * + * Make half-hearted attempt ot to invalidate page cache if we are + * dropping RDCACHE. Note that this will leave behind locked pages + * that we'll then need to deal with elsewhere. + * + * Return non-zero if delayed release, or we experienced an error + * such that the caller should requeue + retry later. + * + * called with i_lock, then drops it. + * caller should hold snap_rwsem (read), s_mutex. + */ +static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, + int op, int used, int want, int retain, int flushing, + unsigned *pflush_tid) + __releases(cap->ci->vfs_inode->i_lock) +{ + struct ceph_inode_info *ci = cap->ci; + struct inode *inode = &ci->vfs_inode; + u64 cap_id = cap->cap_id; + int held = cap->issued | cap->implemented; + int revoking = cap->implemented & ~cap->issued; + int dropping = cap->issued & ~retain; + int keep; + u64 seq, issue_seq, mseq, time_warp_seq, follows; + u64 size, max_size; + struct timespec mtime, atime; + int wake = 0; + mode_t mode; + uid_t uid; + gid_t gid; + struct ceph_mds_session *session; + u64 xattr_version = 0; + int delayed = 0; + u64 flush_tid = 0; + int i; + int ret; + + dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", + inode, cap, cap->session, + ceph_cap_string(held), ceph_cap_string(held & retain), + ceph_cap_string(revoking)); + BUG_ON((retain & CEPH_CAP_PIN) == 0); + + session = cap->session; + + /* don't release wanted unless we've waited a bit. */ + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && + time_before(jiffies, ci->i_hold_caps_min)) { + dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", + ceph_cap_string(cap->issued), + ceph_cap_string(cap->issued & retain), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(want)); + want |= cap->mds_wanted; + retain |= cap->issued; + delayed = 1; + } + ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); + + cap->issued &= retain; /* drop bits we don't want */ + if (cap->implemented & ~cap->issued) { + /* + * Wake up any waiters on wanted -> needed transition. + * This is due to the weird transition from buffered + * to sync IO... we need to flush dirty pages _before_ + * allowing sync writes to avoid reordering. + */ + wake = 1; + } + cap->implemented &= cap->issued | used; + cap->mds_wanted = want; + + if (flushing) { + /* + * assign a tid for flush operations so we can avoid + * flush1 -> dirty1 -> flush2 -> flushack1 -> mark + * clean type races. track latest tid for every bit + * so we can handle flush AxFw, flush Fw, and have the + * first ack clean Ax. + */ + flush_tid = ++ci->i_cap_flush_last_tid; + if (pflush_tid) + *pflush_tid = flush_tid; + dout(" cap_flush_tid %d\n", (int)flush_tid); + for (i = 0; i < CEPH_CAP_BITS; i++) + if (flushing & (1 << i)) + ci->i_cap_flush_tid[i] = flush_tid; + } + + keep = cap->implemented; + seq = cap->seq; + issue_seq = cap->issue_seq; + mseq = cap->mseq; + size = inode->i_size; + ci->i_reported_size = size; + max_size = ci->i_wanted_max_size; + ci->i_requested_max_size = max_size; + mtime = inode->i_mtime; + atime = inode->i_atime; + time_warp_seq = ci->i_time_warp_seq; + follows = ci->i_snap_realm->cached_context->seq; + uid = inode->i_uid; + gid = inode->i_gid; + mode = inode->i_mode; + + if (dropping & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + xattr_version = ci->i_xattrs.version + 1; + } + + spin_unlock(&inode->i_lock); + + if (dropping & CEPH_CAP_FILE_CACHE) { + /* invalidate what we can */ + dout("invalidating pages on %p\n", inode); + invali |