diff options
Diffstat (limited to 'fs/ceph/caps.c')
| -rw-r--r-- | fs/ceph/caps.c | 1065 |
1 files changed, 671 insertions, 394 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 98ab13e2b71..1fde164b74b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -10,6 +10,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> @@ -147,7 +148,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) spin_unlock(&mdsc->caps_list_lock); } -int ceph_reserve_caps(struct ceph_mds_client *mdsc, +void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { int i; @@ -155,7 +156,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, int have; int alloc = 0; LIST_HEAD(newcaps); - int ret = 0; dout("reserve caps ctx=%p need=%d\n", ctx, need); @@ -174,14 +174,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, for (i = have; i < need; i++) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) { - ret = -ENOMEM; - goto out_alloc_count; - } + if (!cap) + break; list_add(&cap->caps_item, &newcaps); alloc++; } - BUG_ON(have + alloc != need); + /* we didn't manage to reserve as much as we needed */ + if (have + alloc != need) + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; @@ -197,13 +198,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); - return 0; - -out_alloc_count: - /* we didn't manage to reserve as much as we needed */ - pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have); - return ret; } int ceph_unreserve_caps(struct ceph_mds_client *mdsc, @@ -227,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc, return 0; } -static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, - struct ceph_cap_reservation *ctx) +struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { struct ceph_cap *cap = NULL; @@ -236,8 +230,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { + spin_lock(&mdsc->caps_list_lock); mdsc->caps_use_count++; mdsc->caps_total_count++; + spin_unlock(&mdsc->caps_list_lock); } return cap; } @@ -309,7 +305,7 @@ void ceph_reservation_status(struct ceph_fs_client *fsc, /* * Find ceph_cap for given mds, if any. * - * Called with i_lock held. + * Called with i_ceph_lock held. */ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { @@ -332,9 +328,9 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); return cap; } @@ -361,15 +357,16 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci) int ceph_get_cap_mds(struct inode *inode) { + struct ceph_inode_info *ci = ceph_inode(inode); int mds; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); mds = __ceph_get_cap_mds(ceph_inode(inode)); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return mds; } /* - * Called under i_lock. + * Called under i_ceph_lock. */ static void __insert_cap_node(struct ceph_inode_info *ci, struct ceph_cap *new) @@ -415,7 +412,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, * * If I_FLUSH is set, leave the inode at the front of the list. * - * Caller holds i_lock + * Caller holds i_ceph_lock * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, @@ -457,7 +454,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, /* * Cancel delayed work on cap. * - * Caller must hold i_lock. + * Caller must hold i_ceph_lock. */ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) @@ -483,11 +480,12 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * i_rdcache_gen. */ if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; + } /* - * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * if we are newly issued FILE_SHARED, mark dir not complete; we * don't know what happened to this directory while we didn't * have the cap. */ @@ -496,7 +494,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_shared_gen++; if (S_ISDIR(ci->vfs_inode.i_mode)) { dout(" marking %p NOT complete\n", &ci->vfs_inode); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + __ceph_dir_clear_complete(ci); } } } @@ -510,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * it is < 0. (This is so we can atomically add the cap and add an * open file reference to it.) */ -int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned seq, unsigned mseq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation) +void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap **new_cap) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap *new_cap = NULL; struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; @@ -533,42 +530,44 @@ int ceph_add_cap(struct inode *inode, if (fmode >= 0) wanted |= ceph_caps_for_mode(fmode); -retry: - spin_lock(&inode->i_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { - if (new_cap) { - cap = new_cap; - new_cap = NULL; - } else { - spin_unlock(&inode->i_lock); - new_cap = get_cap(mdsc, caps_reservation); - if (new_cap == NULL) - return -ENOMEM; - goto retry; - } + cap = *new_cap; + *new_cap = NULL; cap->issued = 0; cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; + cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); - /* clear out old exporting info? (i.e. on cap import) */ - if (ci->i_cap_exporting_mds == mds) { - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; - } - /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; spin_unlock(&session->s_cap_lock); + } else { + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } } if (!ci->i_snap_realm) { @@ -607,10 +606,15 @@ retry: __cap_delay_requeue(mdsc, ci); } - if (flags & CEPH_CAP_FLAG_AUTH) - ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) - ci->i_auth_cap = NULL; + if (flags & CEPH_CAP_FLAG_AUTH) { + if (ci->i_auth_cap == NULL || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { + ci->i_auth_cap = cap; + cap->mds_wanted = wanted; + } + } else { + WARN_ON(ci->i_auth_cap == cap); + } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), @@ -618,7 +622,10 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - cap->mds_wanted |= wanted; + if (ceph_seq_cmp(mseq, cap->mseq) > 0) + cap->mds_wanted = wanted; + else + cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; @@ -626,9 +633,6 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); - wake_up_all(&ci->i_cap_wq); - return 0; } /* @@ -641,10 +645,10 @@ static int __cap_is_valid(struct ceph_cap *cap) unsigned long ttl; u32 gen; - spin_lock(&cap->session->s_cap_lock); + spin_lock(&cap->session->s_gen_ttl_lock); gen = cap->session->s_cap_gen; ttl = cap->session->s_cap_ttl; - spin_unlock(&cap->session->s_cap_lock); + spin_unlock(&cap->session->s_gen_ttl_lock); if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " @@ -663,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap) */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { - int have = ci->i_snap_caps | ci->i_cap_exporting_issued; + int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; @@ -679,6 +683,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (implemented) *implemented |= cap->implemented; } + /* + * exclude caps issued by non-auth MDS, but are been revoking + * by the auth MDS. The non-auth MDS should be revoking/exporting + * these caps, but the message is delayed. + */ + if (ci->i_auth_cap) { + cap = ci->i_auth_cap; + have &= ~cap->implemented | cap->issued; + } return have; } @@ -765,7 +778,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if (touch) { struct rb_node *q; - /* touch this + preceeding caps */ + /* touch this + preceding caps */ __touch_cap(cap); for (q = rb_first(&ci->i_caps); q != p; q = rb_next(q)) { @@ -786,23 +799,29 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* * Return true if mask caps are currently being revoked by an MDS. */ -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask) { - struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; struct rb_node *p; - int ret = 0; - spin_lock(&inode->i_lock); for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (__cap_is_valid(cap) && - (cap->implemented & ~cap->issued & mask)) { - ret = 1; - break; - } + if (cap != ocap && + (cap->implemented & ~cap->issued & mask)) + return 1; } - spin_unlock(&inode->i_lock); + return 0; +} + +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_caps_revoking_other(ci, NULL, mask); + spin_unlock(&ci->i_ceph_lock); dout("ceph_caps_revoking %p %s = %d\n", inode, ceph_cap_string(mask), ret); return ret; @@ -819,7 +838,7 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; - if (ci->i_wrbuffer_ref) + if (ci->i_wb_ref || ci->i_wrbuffer_ref) used |= CEPH_CAP_FILE_BUFFER; return used; } @@ -850,26 +869,41 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; - mds_wanted |= cap->mds_wanted; + if (cap == ci->i_auth_cap) + mds_wanted |= cap->mds_wanted; + else + mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } /* - * called under i_lock + * called under i_ceph_lock */ static int __ceph_is_any_caps(struct ceph_inode_info *ci) { - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; + return !RB_EMPTY_ROOT(&ci->i_caps); +} + +int ceph_is_any_caps(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_is_any_caps(ci); + spin_unlock(&ci->i_ceph_lock); + + return ret; } /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * - * caller should hold i_lock. + * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ -void __ceph_remove_cap(struct ceph_cap *cap) +void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; @@ -881,6 +915,16 @@ void __ceph_remove_cap(struct ceph_cap *cap) /* remove from session list */ spin_lock(&session->s_cap_lock); + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || + cap->cap_gen == session->s_cap_gen)) + __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, + cap->mseq, cap->issue_seq); + if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ dout("__ceph_remove_cap delaying %p removal from session %p\n", @@ -928,7 +972,7 @@ static int send_cap_msg(struct ceph_mds_session *session, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, - uid_t uid, gid_t gid, mode_t mode, + kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, u64 follows) @@ -944,7 +988,7 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); if (!msg) return -ENOMEM; @@ -972,8 +1016,8 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_timespec(&fc->atime, atime); fc->time_warp_seq = cpu_to_le32(time_warp_seq); - fc->uid = cpu_to_le32(uid); - fc->gid = cpu_to_le32(gid); + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); fc->mode = cpu_to_le32(mode); fc->xattr_version = cpu_to_le64(xattr_version); @@ -987,15 +1031,14 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -static void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) +void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) { struct ceph_msg *msg; struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; - spin_lock(&session->s_cap_lock); BUG_ON(!session->s_num_cap_releases); msg = list_first_entry(&session->s_cap_releases, struct ceph_msg, list_head); @@ -1005,7 +1048,7 @@ static void __queue_cap_release(struct ceph_mds_session *session, BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); head = msg->front.iov_base; - head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + le32_add_cpu(&head->num, 1); item = msg->front.iov_base + msg->front.iov_len; item->ino = cpu_to_le64(ino); item->cap_id = cpu_to_le64(cap_id); @@ -1024,12 +1067,11 @@ static void __queue_cap_release(struct ceph_mds_session *session, (int)CEPH_CAPS_PER_RELEASE, (int)msg->front.iov_len); } - spin_unlock(&session->s_cap_lock); } /* * Queue cap releases when an inode is dropped from our cache. Since - * inode is about to be destroyed, there is no need for i_lock. + * inode is about to be destroyed, there is no need for i_ceph_lock. */ void ceph_queue_caps_release(struct inode *inode) { @@ -1039,18 +1081,14 @@ void ceph_queue_caps_release(struct inode *inode) p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); - struct ceph_mds_session *session = cap->session; - - __queue_cap_release(session, ceph_ino(inode), cap->cap_id, - cap->mseq, cap->issue_seq); p = rb_next(p); - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, true); } } /* * Send a cap msg on the given inode. Update our caps state, then - * drop i_lock and send the message. + * drop i_ceph_lock and send the message. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. @@ -1062,13 +1100,13 @@ void ceph_queue_caps_release(struct inode *inode) * Return non-zero if delayed release, or we experienced an error * such that the caller should requeue + retry later. * - * called with i_lock, then drops it. + * called with i_ceph_lock, then drops it. * caller should hold snap_rwsem (read), s_mutex. */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, int op, int used, int want, int retain, int flushing, unsigned *pflush_tid) - __releases(cap->ci->vfs_inode->i_lock) + __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; @@ -1078,9 +1116,9 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 size, max_size; struct timespec mtime, atime; int wake = 0; - mode_t mode; - uid_t uid; - gid_t gid; + umode_t mode; + kuid_t uid; + kgid_t gid; struct ceph_mds_session *session; u64 xattr_version = 0; struct ceph_buffer *xattr_blob = NULL; @@ -1171,7 +1209,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, xattr_version = ci->i_xattrs.version; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, @@ -1199,13 +1237,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * Unless @again is true, skip cap_snaps that were already sent to * the MDS (i.e., during this session). * - * Called under i_lock. Takes s_mutex as needed. + * Called under i_ceph_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession, int again) - __releases(ci->vfs_inode->i_lock) - __acquires(ci->vfs_inode->i_lock) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->vfs_inode; int mds; @@ -1262,7 +1300,7 @@ retry: session = NULL; } if (!session) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); mutex_lock(&mdsc->mutex); session = __ceph_lookup_mds_session(mdsc, mds); mutex_unlock(&mdsc->mutex); @@ -1276,7 +1314,7 @@ retry: * deletion or migration. retry, and we'll * get a better @mds value next time. */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); goto retry; } @@ -1286,7 +1324,7 @@ retry: list_del_init(&capsnap->flushing_item); list_add_tail(&capsnap->flushing_item, &session->s_cap_snaps_flushing); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", inode, capsnap, capsnap->follows, capsnap->flush_tid); @@ -1303,7 +1341,7 @@ retry: next_follows = capsnap->follows + 1; ceph_put_cap_snap(capsnap); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); goto retry; } @@ -1323,18 +1361,17 @@ out: static void ceph_flush_snaps(struct ceph_inode_info *ci) { - struct inode *inode = &ci->vfs_inode; - - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __ceph_flush_snaps(ci, NULL, 0); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } /* - * Mark caps dirty. If inode is newly dirty, add to the global dirty - * list. + * Mark caps dirty. If inode is newly dirty, return the dirty flags. + * Caller is then responsible for calling __mark_inode_dirty with the + * returned flags value. */ -void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) { struct ceph_mds_client *mdsc = ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; @@ -1350,14 +1387,15 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) if (!ci->i_head_snapc) ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); - dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, - ci->i_head_snapc); + dout(" inode %p now dirty snapc %p auth cap %p\n", + &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &mdsc->cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { - igrab(inode); + ihold(inode); dirty |= I_DIRTY_SYNC; } } @@ -1365,16 +1403,15 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; - if (dirty) - __mark_inode_dirty(inode, dirty); __cap_delay_requeue(mdsc, ci); + return dirty; } /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * - * Called under i_lock. + * Called under i_ceph_lock. */ static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) @@ -1422,16 +1459,16 @@ static int try_nonblocking_invalidate(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); invalidate_mapping_pages(&inode->i_data, 0, -1); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ dout("try_nonblocking_invalidate %p success\n", inode); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } dout("try_nonblocking_invalidate %p failed\n", inode); @@ -1456,7 +1493,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - int file_wanted, used; + int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list @@ -1471,7 +1508,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, if (mdsc->stopping) is_delayed = 1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; @@ -1481,7 +1518,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, __ceph_flush_snaps(ci, &session, 0); goto retry_locked; retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); retry_locked: file_wanted = __ceph_caps_file_wanted(ci); used = __ceph_caps_used(ci); @@ -1559,10 +1596,16 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ + cap_used = used; + if (ci->i_auth_cap && cap != ci->i_auth_cap) + cap_used &= ~ci->i_auth_cap->issued; + revoking = cap->implemented & ~cap->issued; - if (revoking) - dout(" mds%d revoking %s\n", cap->mds, - ceph_cap_string(revoking)); + dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", + cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap_used), + ceph_cap_string(cap->implemented), + ceph_cap_string(revoking)); if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { @@ -1588,7 +1631,7 @@ retry_locked: } /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & used) == 0) { + if (revoking && (revoking & cap_used) == 0) { dout("completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; @@ -1634,7 +1677,7 @@ ack: if (mutex_trylock(&session->s_mutex) == 0) { dout("inverting session/ino locks on %p\n", session); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (took_snap_rwsem) { up_read(&mdsc->snap_rwsem); took_snap_rwsem = 0; @@ -1648,7 +1691,7 @@ ack: if (down_read_trylock(&mdsc->snap_rwsem) == 0) { dout("inverting snap/in locks on %p\n", inode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); down_read(&mdsc->snap_rwsem); took_snap_rwsem = 1; goto retry; @@ -1658,14 +1701,16 @@ ack: if (cap == ci->i_auth_cap && ci->i_dirty_caps) flushing = __mark_caps_flushing(inode, session); + else + flushing = 0; mds = cap->mds; /* remember mds, so we don't repeat */ sent++; - /* __send_cap drops i_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, - retain, flushing, NULL); - goto retry; /* retake i_lock and restart our cap scan. */ + /* __send_cap drops i_ceph_lock */ + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, + want, retain, flushing, NULL); + goto retry; /* retake i_ceph_lock and restart our cap scan. */ } /* @@ -1679,7 +1724,7 @@ ack: else if (!is_delayed || force_requeue) __cap_delay_requeue(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (queue_invalidate) ceph_queue_invalidate(inode); @@ -1693,16 +1738,15 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, - unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, unsigned *flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int unlock_session = session ? 0 : 1; int flushing = 0; + struct ceph_mds_session *session = NULL; retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); goto out; @@ -1713,32 +1757,33 @@ retry: int want = __ceph_caps_wanted(ci); int delayed; - if (!session) { - spin_unlock(&inode->i_lock); + if (!session || session != cap->session) { + spin_unlock(&ci->i_ceph_lock); + if (session) + mutex_unlock(&session->s_mutex); session = cap->session; mutex_lock(&session->s_mutex); goto retry; } - BUG_ON(session != cap->session); if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; flushing = __mark_caps_flushing(inode, session); - /* __send_cap drops i_lock */ + /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, cap->issued | cap->implemented, flushing, flush_tid); if (!delayed) goto out_unlocked; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); out_unlocked: - if (session && unlock_session) + if (session) mutex_unlock(&session->s_mutex); return flushing; } @@ -1751,7 +1796,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) struct ceph_inode_info *ci = ceph_inode(inode); int i, ret = 1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_CAP_BITS; i++) if ((ci->i_flushing_caps & (1 << i)) && ci->i_cap_flush_tid[i] <= tid) { @@ -1759,7 +1804,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) ret = 0; break; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1807,7 +1852,7 @@ out: spin_unlock(&ci->i_unsafe_lock); } -int ceph_fsync(struct file *file, int datasync) +int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); @@ -1818,11 +1863,12 @@ int ceph_fsync(struct file *file, int datasync) dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); sync_write_wait(inode); - ret = filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) return ret; + mutex_lock(&inode->i_mutex); - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); /* @@ -1837,6 +1883,7 @@ int ceph_fsync(struct file *file, int datasync) } dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); + mutex_unlock(&inode->i_mutex); return ret; } @@ -1856,7 +1903,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) dout("write_inode %p wait=%d\n", inode, wait); if (wait) { - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); @@ -1864,10 +1911,10 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (__ceph_caps_dirty(ci)) __cap_delay_requeue_front(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } return err; } @@ -1890,7 +1937,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (cap && cap->session == session) { dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, @@ -1900,7 +1947,7 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } @@ -1917,7 +1964,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_cap *cap; int delayed = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (cap && cap->session == session) { dout("kick_flushing_caps %p cap %p %s\n", inode, @@ -1928,15 +1975,51 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, cap->issued | cap->implemented, ci->i_flushing_caps, NULL); if (delayed) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } else { pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); + } + } +} + +static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, + ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + + __ceph_flush_snaps(ci, &session, 1); + + if (ci->i_flushing_caps) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &cap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); } + } else { + spin_unlock(&ci->i_ceph_lock); } } @@ -1945,7 +2028,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, * Take references to capabilities we hold, so that we don't release * them to the MDS prematurely. * - * Protected by i_lock. + * Protected by i_ceph_lock. */ static void __take_cap_refs(struct ceph_inode_info *ci, int got) { @@ -1958,11 +2041,11 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) if (got & CEPH_CAP_FILE_WR) ci->i_wr_ref++; if (got & CEPH_CAP_FILE_BUFFER) { - if (ci->i_wrbuffer_ref == 0) - igrab(&ci->vfs_inode); - ci->i_wrbuffer_ref++; - dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n", - &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref); + if (ci->i_wb_ref == 0) + ihold(&ci->vfs_inode); + ci->i_wb_ref++; + dout("__take_cap_refs %p wb %d -> %d (?)\n", + &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); } } @@ -1983,7 +2066,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); @@ -1995,11 +2078,20 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } - if (need & CEPH_CAP_FILE_WR) { + /* finish pending truncate */ + while (ci->i_truncate_pending) { + spin_unlock(&ci->i_ceph_lock); + __ceph_do_pending_vmtruncate(inode); + spin_lock(&ci->i_ceph_lock); + } + + have = __ceph_caps_issued(ci, &implemented); + + if (have & need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); - if (endoff > ci->i_wanted_max_size) { + if (endoff > ci->i_requested_max_size) { *check_max = 1; ret = 1; } @@ -2014,13 +2106,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } } - have = __ceph_caps_issued(ci, &implemented); - - /* - * disallow writes while a truncate is pending - */ - if (ci->i_truncate_pending) - have &= ~CEPH_CAP_FILE_WR; if ((have & need) == need) { /* @@ -2044,7 +2129,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, ceph_cap_string(have), ceph_cap_string(need)); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); return ret; @@ -2061,16 +2146,19 @@ static void check_max_size(struct inode *inode, loff_t endoff) int check = 0; /* do we need to explicitly request a larger max_size? */ - spin_lock(&inode->i_lock); - if ((endoff >= ci->i_max_size || - endoff > (inode->i_size << 1)) && - endoff > ci->i_wanted_max_size) { + spin_lock(&ci->i_ceph_lock); + if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { dout("write %p at large endoff %llu, req max_size\n", inode, endoff); ci->i_wanted_max_size = endoff; - check = 1; } - spin_unlock(&inode->i_lock); + /* duplicate ceph_check_caps()'s logic */ + if (ci->i_auth_cap && + (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && + ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) + check = 1; + spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } @@ -2107,9 +2195,9 @@ retry: */ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); __take_cap_refs(ci, caps); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -2127,7 +2215,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) int last = 0, put = 0, flushsnaps = 0, wake = 0; struct ceph_cap_snap *capsnap; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) --ci->i_pin_ref; if (had & CEPH_CAP_FILE_RD) @@ -2137,12 +2225,12 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (--ci->i_rdcache_ref == 0) last++; if (had & CEPH_CAP_FILE_BUFFER) { - if (--ci->i_wrbuffer_ref == 0) { + if (--ci->i_wb_ref == 0) { last++; put++; } - dout("put_cap_refs %p wrbuffer %d -> %d (?)\n", - inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref); + dout("put_cap_refs %p wb %d -> %d (?)\n", + inode, ci->i_wb_ref+1, ci->i_wb_ref); } if (had & CEPH_CAP_FILE_WR) if (--ci->i_wr_ref == 0) { @@ -2160,7 +2248,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) } } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); @@ -2192,7 +2280,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, int found = 0; struct ceph_cap_snap *capsnap = NULL; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; last = !ci->i_wrbuffer_ref; @@ -2241,7 +2329,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (last) { ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); @@ -2255,42 +2343,88 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } /* + * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. + */ +static void invalidate_aliases(struct inode *inode) +{ + struct dentry *dn, *prev = NULL; + + dout("invalidate_aliases inode %p\n", inode); + d_prune_aliases(inode); + /* + * For non-directory inode, d_find_alias() only returns + * hashed dentry. After calling d_invalidate(), the + * dentry becomes unhashed. + * + * For directory inode, d_find_alias() can return + * unhashed dentry. But directory inode should have + * one alias at most. + */ + while ((dn = d_find_alias(inode))) { + if (dn == prev) { + dput(dn); + break; + } + d_invalidate(dn); + if (prev) + dput(prev); + prev = dn; + } + if (prev) + dput(prev); +} + +/* * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * - * caller holds s_mutex and i_lock, we drop both. - * - * return value: - * 0 - ok - * 1 - check_caps on auth cap only (writeback) - * 2 - check_caps (ack revoke) + * caller holds s_mutex and i_ceph_lock, we drop both. */ -static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, +static void handle_cap_grant(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *grant, + void *snaptrace, int snaptrace_len, + struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, - struct ceph_buffer *xattr_buf) - __releases(inode->i_lock) + struct ceph_cap *cap, int issued) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; - unsigned seq = le32_to_cpu(grant->seq); - unsigned issue_seq = le32_to_cpu(grant->issue_seq); + int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); - int issued, implemented, used, wanted, dirty; + int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); struct timespec mtime, atime, ctime; int check_caps = 0; - int wake = 0; - int writeback = 0; - int revoked_rdcache = 0; - int queue_invalidate = 0; - - dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", - inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); + bool wake = 0; + bool writeback = 0; + bool queue_trunc = 0; + bool queue_invalidate = 0; + bool queue_revalidate = 0; + bool deleted_inode = 0; + + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); + + /* + * auth mds of the inode changed. we received the cap export message, + * but still haven't received the cap import message. handle_cap_export + * updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message + * that was sent before the cap import message. So don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); + seq = cap->seq; + newcaps |= cap->issued; + } + /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we @@ -2299,9 +2433,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { - if (try_nonblocking_invalidate(inode) == 0) { - revoked_rdcache = 1; - } else { + if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { @@ -2309,27 +2441,33 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ci->i_rdcache_revoking = ci->i_rdcache_gen; } } + + ceph_fscache_invalidate(inode); } /* side effects now are allowed */ - - issued = __ceph_caps_issued(ci, &implemented); - issued |= implemented | __ceph_caps_dirty(ci); - cap->cap_gen = session->s_cap_gen; + cap->seq = seq; __check_cap_issue(ci, cap, newcaps); - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); - inode->i_uid = le32_to_cpu(grant->uid); - inode->i_gid = le32_to_cpu(grant->gid); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - inode->i_uid, inode->i_gid); + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(grant->nlink); + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_LINK_EXCL) == 0) { + set_nlink(inode, le32_to_cpu(grant->nlink)); + if (inode->i_nlink == 0 && + (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + deleted_inode = 1; + } if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); @@ -2342,29 +2480,44 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; + ceph_forget_all_cached_acls(inode); } } - /* size/ctime/mtime/atime? */ - ceph_fill_file_size(inode, issued, - le32_to_cpu(grant->truncate_seq), - le64_to_cpu(grant->truncate_size), size); - ceph_decode_timespec(&mtime, &grant->mtime); - ceph_decode_timespec(&atime, &grant->atime); - ceph_decode_timespec(&ctime, &grant->ctime); - ceph_fill_file_time(inode, issued, - le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, - &atime); - - /* max size increase? */ - if (max_size != ci->i_max_size) { - dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); - ci->i_max_size = max_size; - if (max_size >= ci->i_wanted_max_size) { - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; + /* Do we need to revalidate our fscache cookie. Don't bother on the + * first cache cap as we already validate at cookie creation time. */ + if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) + queue_revalidate = 1; + + if (newcaps & CEPH_CAP_ANY_RD) { + /* ctime/mtime/atime? */ + ceph_decode_timespec(&mtime, &grant->mtime); + ceph_decode_timespec(&atime, &grant->atime); + ceph_decode_timespec(&ctime, &grant->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(grant->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { + /* file layout may have changed */ + ci->i_layout = grant->layout; + /* size/truncate_seq? */ + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), + size); + /* max size increase? */ + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", + ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = 1; } - wake = 1; } /* check cap bits */ @@ -2379,15 +2532,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, dout("mds wanted %s -> %s\n", ceph_cap_string(le32_to_cpu(grant->wanted)), ceph_cap_string(wanted)); - grant->wanted = cpu_to_le32(wanted); + /* imported cap may not have correct mds_wanted */ + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) + check_caps = 1; } - cap->seq = seq; - cap->issue_seq = issue_seq; - - /* file layout may have changed */ - ci->i_layout = grant->layout; - /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; @@ -2414,6 +2563,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } else { dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + /* non-auth MDS is revoking the newly grant caps ? */ + if (cap == ci->i_auth_cap && + __ceph_caps_revoking_other(ci, cap, newcaps)) + check_caps = 2; + cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a @@ -2422,7 +2576,25 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } BUG_ON(cap->issued & ~cap->implemented); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); + + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, false); + downgrade_write(&mdsc->snap_rwsem); + kick_flushing_inode_caps(mdsc, session, inode); + up_read(&mdsc->snap_rwsem); + if (newcaps & ~issued) + wake = 1; + } + + if (queue_trunc) { + ceph_queue_vmtruncate(inode); + ceph_queue_revalidate(inode); + } else if (queue_revalidate) + ceph_queue_revalidate(inode); + if (writeback) /* * queue inode for writeback: we can't actually call @@ -2432,6 +2604,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); + if (deleted_inode) + invalidate_aliases(inode); if (wake) wake_up_all(&ci->i_cap_wq); @@ -2452,7 +2626,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session, struct ceph_cap *cap) - __releases(inode->i_lock) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -2508,7 +2682,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, wake_up_all(&ci->i_cap_wq); out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (drop) iput(inode); } @@ -2531,7 +2705,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", inode, ci, session->s_mds, follows); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->follows == follows) { if (capsnap->flush_tid != flush_tid) { @@ -2554,7 +2728,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, capsnap, capsnap->follows); } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (drop) iput(inode); } @@ -2567,7 +2741,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, static void handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, struct ceph_mds_session *session) - __releases(inode->i_lock) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2586,10 +2760,12 @@ static void handle_cap_trunc(struct inode *inode, inode, mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); - if (queue_trunc) + if (queue_trunc) { ceph_queue_vmtruncate(inode); + ceph_fscache_invalidate(inode); + } } /* @@ -2601,96 +2777,200 @@ static void handle_cap_trunc(struct inode *inode, * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, - struct ceph_mds_session *session, - int *open_target_sessions) + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session) { + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *tsession = NULL; + struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); - int mds = session->s_mds; + u64 t_cap_id; unsigned mseq = le32_to_cpu(ex->migrate_seq); - struct ceph_cap *cap = NULL, *t; - struct rb_node *p; - int remember = 1; + unsigned t_seq, t_mseq; + int target, issued; + int mds = session->s_mds; - dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (ph) { + t_cap_id = le64_to_cpu(ph->cap_id); + t_seq = le32_to_cpu(ph->seq); + t_mseq = le32_to_cpu(ph->mseq); + target = le32_to_cpu(ph->mds); + } else { + t_cap_id = t_seq = t_mseq = 0; + target = -1; + } - spin_lock(&inode->i_lock); + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", + inode, ci, mds, mseq, target); +retry: + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) + goto out_unlock; - /* make sure we haven't seen a higher mseq */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - t = rb_entry(p, struct ceph_cap, ci_node); - if (ceph_seq_cmp(t->mseq, mseq) > 0) { - dout(" higher mseq on cap from mds%d\n", - t->session->s_mds); - remember = 0; + if (target < 0) { + __ceph_remove_cap(cap, false); + goto out_unlock; + } + + /* + * now we know we haven't received the cap import message yet + * because the exported cap still exist. + */ + + issued = cap->issued; + WARN_ON(issued != cap->implemented); + + tcap = __get_cap_for_mds(ci, target); + if (tcap) { + /* already have caps from the target */ + if (tcap->cap_id != t_cap_id || + ceph_seq_cmp(tcap->seq, t_seq) < 0) { + dout(" updating import cap %p mds%d\n", tcap, target); + tcap->cap_id = t_cap_id; + tcap->seq = t_seq - 1; + tcap->issue_seq = t_seq - 1; + tcap->mseq = t_mseq; + tcap->issued |= issued; + tcap->implemented |= issued; + if (cap == ci->i_auth_cap) + ci->i_auth_cap = tcap; + if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + } } - if (t->session->s_mds == mds) - cap = t; + __ceph_remove_cap(cap, false); + goto out_unlock; + } else if (tsession) { + /* add placeholder for the export tagert */ + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, + t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + + __ceph_remove_cap(cap, false); + goto out_unlock; } - if (cap) { - if (remember) { - /* make note */ - ci->i_cap_exporting_mds = mds; - ci->i_cap_exporting_mseq = mseq; - ci->i_cap_exporting_issued = cap->issued; + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); - /* - * make sure we have open sessions with all possible - * export targets, so that we get the matching IMPORT - */ - *open_target_sessions = 1; + /* open target session */ + tsession = ceph_mdsc_open_export_target_session(mdsc, target); + if (!IS_ERR(tsession)) { + if (mds > target) { + mutex_lock(&session->s_mutex); + mutex_lock_nested(&tsession->s_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&tsession->s_mutex); + mutex_lock_nested(&session->s_mutex, + SINGLE_DEPTH_NESTING); } - __ceph_remove_cap(cap); + ceph_add_cap_releases(mdsc, tsession); + new_cap = ceph_get_cap(mdsc, NULL); + } else { + WARN_ON(1); + tsession = NULL; + target = -1; } - /* else, we already released it */ + goto retry; - spin_unlock(&inode->i_lock); +out_unlock: + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + if (tsession) { + mutex_unlock(&tsession->s_mutex); + ceph_put_mds_session(tsession); + } + if (new_cap) + ceph_put_cap(mdsc, new_cap); } /* - * Handle cap IMPORT. If there are temp bits from an older EXPORT, - * clean them up. + * Handle cap IMPORT. * - * caller holds s_mutex. + * caller holds s_mutex. acquires i_ceph_lock */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, - void *snaptrace, int snaptrace_len) + struct ceph_cap **target_cap, int *old_issued) + __acquires(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; - unsigned issued = le32_to_cpu(im->caps); + int issued; + unsigned caps = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); unsigned seq = le32_to_cpu(im->seq); unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + int peer; + + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); + } else { + p_cap_id = 0; + peer = -1; + } + + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", + inode, ci, mds, mseq, peer); - if (ci->i_cap_exporting_mds >= 0 && - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { - dout("handle_cap_import inode %p ci %p mds%d mseq %d" - " - cleared exporting from mds%d\n", - inode, ci, mds, mseq, - ci->i_cap_exporting_mds); - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; +retry: + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (!new_cap) { + spin_unlock(&ci->i_ceph_lock); + new_cap = ceph_get_cap(mdsc, NULL); + goto retry; + } + cap = new_cap; } else { - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (new_cap) { + ceph_put_cap(mdsc, new_cap); + new_cap = NULL; + } + } + + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + + ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, + realmino, CEPH_CAP_FLAG_AUTH, &new_cap); + + ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; + if (ocap && ocap->cap_id == p_cap_id) { + dout(" remove export cap %p mds%d flags %d\n", + ocap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (ocap->seq != le32_to_cpu(ph->seq) || + ocap->mseq != le32_to_cpu(ph->mseq))) { + pr_err("handle_cap_import: mismatched seq/mseq: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "importer mds%d has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, ocap->seq, + ocap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); + } + __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } - down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, - false); - downgrade_write(&mdsc->snap_rwsem); - ceph_add_cap(inode, session, cap_id, -1, - issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, - NULL /* no caps context */); - try_flush_caps(inode, session, NULL); - up_read(&mdsc->snap_rwsem); + /* make sure we re-request max_size, if necessary */ + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + + *old_issued = issued; + *target_cap = cap; } /* @@ -2705,10 +2985,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_mds_client *mdsc = session->s_mdsc; struct super_block *sb = mdsc->fsc->sb; struct inode *inode; + struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; + struct ceph_mds_cap_peer *peer = NULL; int mds = session->s_mds; - int op; + int op, issued; u32 seq, mseq; struct ceph_vino vino; u64 cap_id; @@ -2717,12 +2999,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *snaptrace; size_t snaptrace_len; void *flock; + void *end; u32 flock_len; - int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); /* decode */ + end = msg->front.iov_base + msg->front.iov_len; tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -2740,32 +3023,50 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace_len = le32_to_cpu(h->snap_trace_len); if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p, *end; - - p = snaptrace + snaptrace_len; - end = msg->front.iov_base + msg->front.iov_len; + void *p = snaptrace + snaptrace_len; ceph_decode_32_safe(&p, end, flock_len, bad); + if (p + flock_len > end) + goto bad; flock = p; } else { flock = NULL; flock_len = 0; } + if (le16_to_cpu(msg->hdr.version) >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + void *p = flock + flock_len; + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + } else if (op == CEPH_CAP_OP_EXPORT) { + /* recorded in unused fields */ + peer = (void *)&h->size; + } + } + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); + if (op == CEPH_CAP_OP_IMPORT) + ceph_add_cap_releases(mdsc, session); + /* lookup ino */ inode = ceph_find_inode(sb, vino); + ci = ceph_inode(inode); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); if (!inode) { dout(" i don't have ino %llx\n", vino.ino); - if (op == CEPH_CAP_OP_IMPORT) + if (op == CEPH_CAP_OP_IMPORT) { + spin_lock(&session->s_cap_lock); __queue_cap_release(session, vino.ino, cap_id, mseq, seq); + spin_unlock(&session->s_cap_lock); + } goto flush_cap_releases; } @@ -2776,32 +3077,35 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_EXPORT: - handle_cap_export(inode, h, session, &open_target_sessions); - goto done; + handle_cap_export(inode, h, peer, session); + goto done_unlocked; case CEPH_CAP_OP_IMPORT: - handle_cap_import(mdsc, inode, h, session, - snaptrace, snaptrace_len); - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, - session); + handle_cap_import(mdsc, inode, h, peer, session, + &cap, &issued); + handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, + msg->middle, session, cap, issued); goto done_unlocked; } /* the rest require a cap */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), mds); if (!cap) { dout(" no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), mds); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); goto flush_cap_releases; } - /* note that each of these drops i_lock for us */ + /* note that each of these drops i_ceph_lock for us */ switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: - handle_cap_grant(inode, h, session, cap, msg->middle); + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, + session, cap, issued); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: @@ -2813,7 +3117,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, break; default: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, ceph_cap_op_name(op)); } @@ -2834,8 +3138,6 @@ done: done_unlocked: if (inode) iput(inode); - if (open_target_sessions) - ceph_mdsc_open_export_target_sessions(mdsc, session); return; bad: @@ -2876,47 +3178,24 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) */ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { - struct ceph_inode_info *ci, *nci = NULL; - struct inode *inode, *ninode = NULL; - struct list_head *p, *n; + struct ceph_inode_info *ci; + struct inode *inode; dout("flush_dirty_caps\n"); spin_lock(&mdsc->cap_dirty_lock); - list_for_each_safe(p, n, &mdsc->cap_dirty) { - if (nci) { - ci = nci; - inode = ninode; - ci->i_ceph_flags &= ~CEPH_I_NOFLUSH; - dout("flush_dirty_caps inode %p (was next inode)\n", - inode); - } else { - ci = list_entry(p, struct ceph_inode_info, - i_dirty_item); - inode = igrab(&ci->vfs_inode); - BUG_ON(!inode); - dout("flush_dirty_caps inode %p\n", inode); - } - if (n != &mdsc->cap_dirty) { - nci = list_entry(n, struct ceph_inode_info, - i_dirty_item); - ninode = igrab(&nci->vfs_inode); - BUG_ON(!ninode); - nci->i_ceph_flags |= CEPH_I_NOFLUSH; - dout("flush_dirty_caps next inode %p, noflush\n", - ninode); - } else { - nci = NULL; - ninode = NULL; - } + while (!list_empty(&mdsc->cap_dirty)) { + ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + i_dirty_item); + inode = &ci->vfs_inode; + ihold(inode); + dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); - if (inode) { - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, - NULL); - iput(inode); - } + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); + dout("flush_dirty_caps done\n"); } /* @@ -2929,13 +3208,13 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) struct inode *inode = &ci->vfs_inode; int last = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); BUG_ON(ci->i_nr_by_mode[fmode] == 0); if (--ci->i_nr_by_mode[fmode] == 0) last++; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (last && ci->i_vino.snap == CEPH_NOSNAP) ceph_check_caps(ci, 0, NULL); @@ -2958,7 +3237,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, int used, dirty; int ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); @@ -2976,21 +3255,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, (cap->issued & unless) == 0)) { if ((cap->issued & drop) && (cap->issued & unless) == 0) { - dout("encode_inode_release %p cap %p %s -> " - "%s\n", inode, cap, + int wanted = __ceph_caps_wanted(ci); + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) + wanted |= cap->mds_wanted; + dout("encode_inode_release %p cap %p " + "%s -> %s, wanted %s -> %s\n", inode, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & ~drop)); + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); + cap->issued &= ~drop; cap->implemented &= ~drop; - if (ci->i_ceph_flags & CEPH_I_NODELAY) { - int wanted = __ceph_caps_wanted(ci); - dout(" wanted %s -> %s (act %s)\n", - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(cap->mds_wanted & - ~wanted), - ceph_cap_string(wanted)); - cap->mds_wanted &= wanted; - } + cap->mds_wanted = wanted; } else { dout("encode_inode_release %p cap %p %s" " (force)\n", inode, cap, @@ -3002,7 +3279,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq), rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; @@ -3013,7 +3290,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, inode, cap, ceph_cap_string(cap->issued)); } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -3028,7 +3305,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, /* * force an record for the directory caps if we have a dentry lease. - * this is racy (can't take i_lock and d_lock together), but it + * this is racy (can't take i_ceph_lock and d_lock together), but it * doesn't have to be perfect; the mds will revoke anything we don't * release. */ |
