diff options
Diffstat (limited to 'fs/ceph/mds_client.c')
-rw-r--r-- | fs/ceph/mds_client.c | 3021 |
1 files changed, 3021 insertions, 0 deletions
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c new file mode 100644 index 00000000000..a2600101ec2 --- /dev/null +++ b/fs/ceph/mds_client.c @@ -0,0 +1,3021 @@ +#include "ceph_debug.h" + +#include <linux/wait.h> +#include <linux/sched.h> + +#include "mds_client.h" +#include "mon_client.h" +#include "super.h" +#include "messenger.h" +#include "decode.h" +#include "auth.h" +#include "pagelist.h" + +/* + * A cluster of MDS (metadata server) daemons is responsible for + * managing the file system namespace (the directory hierarchy and + * inodes) and for coordinating shared access to storage. Metadata is + * partitioning hierarchically across a number of servers, and that + * partition varies over time as the cluster adjusts the distribution + * in order to balance load. + * + * The MDS client is primarily responsible to managing synchronous + * metadata requests for operations like open, unlink, and so forth. + * If there is a MDS failure, we find out about it when we (possibly + * request and) receive a new MDS map, and can resubmit affected + * requests. + * + * For the most part, though, we take advantage of a lossless + * communications channel to the MDS, and do not need to worry about + * timing out or resubmitting requests. + * + * We maintain a stateful "session" with each MDS we interact with. + * Within each session, we sent periodic heartbeat messages to ensure + * any capabilities or leases we have been issues remain valid. If + * the session times out and goes stale, our leases and capabilities + * are no longer valid. + */ + +static void __wake_requests(struct ceph_mds_client *mdsc, + struct list_head *head); + +const static struct ceph_connection_operations mds_con_ops; + + +/* + * mds reply parsing + */ + +/* + * parse individual inode info + */ +static int parse_reply_info_in(void **p, void *end, + struct ceph_mds_reply_info_in *info) +{ + int err = -EIO; + + info->in = *p; + *p += sizeof(struct ceph_mds_reply_inode) + + sizeof(*info->in->fragtree.splits) * + le32_to_cpu(info->in->fragtree.nsplits); + + ceph_decode_32_safe(p, end, info->symlink_len, bad); + ceph_decode_need(p, end, info->symlink_len, bad); + info->symlink = *p; + *p += info->symlink_len; + + ceph_decode_32_safe(p, end, info->xattr_len, bad); + ceph_decode_need(p, end, info->xattr_len, bad); + info->xattr_data = *p; + *p += info->xattr_len; + return 0; +bad: + return err; +} + +/* + * parse a normal reply, which may contain a (dir+)dentry and/or a + * target inode. + */ +static int parse_reply_info_trace(void **p, void *end, + struct ceph_mds_reply_info_parsed *info) +{ + int err; + + if (info->head->is_dentry) { + err = parse_reply_info_in(p, end, &info->diri); + if (err < 0) + goto out_bad; + + if (unlikely(*p + sizeof(*info->dirfrag) > end)) + goto bad; + info->dirfrag = *p; + *p += sizeof(*info->dirfrag) + + sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); + if (unlikely(*p > end)) + goto bad; + + ceph_decode_32_safe(p, end, info->dname_len, bad); + ceph_decode_need(p, end, info->dname_len, bad); + info->dname = *p; + *p += info->dname_len; + info->dlease = *p; + *p += sizeof(*info->dlease); + } + + if (info->head->is_target) { + err = parse_reply_info_in(p, end, &info->targeti); + if (err < 0) + goto out_bad; + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing mds trace %d\n", err); + return err; +} + +/* + * parse readdir results + */ +static int parse_reply_info_dir(void **p, void *end, + struct ceph_mds_reply_info_parsed *info) +{ + u32 num, i = 0; + int err; + + info->dir_dir = *p; + if (*p + sizeof(*info->dir_dir) > end) + goto bad; + *p += sizeof(*info->dir_dir) + + sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); + if (*p > end) + goto bad; + + ceph_decode_need(p, end, sizeof(num) + 2, bad); + num = ceph_decode_32(p); + info->dir_end = ceph_decode_8(p); + info->dir_complete = ceph_decode_8(p); + if (num == 0) + goto done; + + /* alloc large array */ + info->dir_nr = num; + info->dir_in = kcalloc(num, sizeof(*info->dir_in) + + sizeof(*info->dir_dname) + + sizeof(*info->dir_dname_len) + + sizeof(*info->dir_dlease), + GFP_NOFS); + if (info->dir_in == NULL) { + err = -ENOMEM; + goto out_bad; + } + info->dir_dname = (void *)(info->dir_in + num); + info->dir_dname_len = (void *)(info->dir_dname + num); + info->dir_dlease = (void *)(info->dir_dname_len + num); + + while (num) { + /* dentry */ + ceph_decode_need(p, end, sizeof(u32)*2, bad); + info->dir_dname_len[i] = ceph_decode_32(p); + ceph_decode_need(p, end, info->dir_dname_len[i], bad); + info->dir_dname[i] = *p; + *p += info->dir_dname_len[i]; + dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], + info->dir_dname[i]); + info->dir_dlease[i] = *p; + *p += sizeof(struct ceph_mds_reply_lease); + + /* inode */ + err = parse_reply_info_in(p, end, &info->dir_in[i]); + if (err < 0) + goto out_bad; + i++; + num--; + } + +done: + if (*p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("problem parsing dir contents %d\n", err); + return err; +} + +/* + * parse entire mds reply + */ +static int parse_reply_info(struct ceph_msg *msg, + struct ceph_mds_reply_info_parsed *info) +{ + void *p, *end; + u32 len; + int err; + + info->head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head); + end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head); + + /* trace */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + err = parse_reply_info_trace(&p, p+len, info); + if (err < 0) + goto out_bad; + } + + /* dir content */ + ceph_decode_32_safe(&p, end, len, bad); + if (len > 0) { + err = parse_reply_info_dir(&p, p+len, info); + if (err < 0) + goto out_bad; + } + + /* snap blob */ + ceph_decode_32_safe(&p, end, len, bad); + info->snapblob_len = len; + info->snapblob = p; + p += len; + + if (p != end) + goto bad; + return 0; + +bad: + err = -EIO; +out_bad: + pr_err("mds parse_reply err %d\n", err); + return err; +} + +static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) +{ + kfree(info->dir_in); +} + + +/* + * sessions + */ +static const char *session_state_name(int s) +{ + switch (s) { + case CEPH_MDS_SESSION_NEW: return "new"; + case CEPH_MDS_SESSION_OPENING: return "opening"; + case CEPH_MDS_SESSION_OPEN: return "open"; + case CEPH_MDS_SESSION_HUNG: return "hung"; + case CEPH_MDS_SESSION_CLOSING: return "closing"; + case CEPH_MDS_SESSION_RESTARTING: return "restarting"; + case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; + default: return "???"; + } +} + +static struct ceph_mds_session *get_session(struct ceph_mds_session *s) +{ + if (atomic_inc_not_zero(&s->s_ref)) { + dout("mdsc get_session %p %d -> %d\n", s, + atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); + return s; + } else { + dout("mdsc get_session %p 0 -- FAIL", s); + return NULL; + } +} + +void ceph_put_mds_session(struct ceph_mds_session *s) +{ + dout("mdsc put_session %p %d -> %d\n", s, + atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); + if (atomic_dec_and_test(&s->s_ref)) { + if (s->s_authorizer) + s->s_mdsc->client->monc.auth->ops->destroy_authorizer( + s->s_mdsc->client->monc.auth, s->s_authorizer); + kfree(s); + } +} + +/* + * called under mdsc->mutex + */ +struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *session; + + if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) + return NULL; + session = mdsc->sessions[mds]; + dout("lookup_mds_session %p %d\n", session, + atomic_read(&session->s_ref)); + get_session(session); + return session; +} + +static bool __have_session(struct ceph_mds_client *mdsc, int mds) +{ + if (mds >= mdsc->max_sessions) + return false; + return mdsc->sessions[mds]; +} + +static int __verify_registered_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + if (s->s_mds >= mdsc->max_sessions || + mdsc->sessions[s->s_mds] != s) + return -ENOENT; + return 0; +} + +/* + * create+register a new session for given mds. + * called under mdsc->mutex. + */ +static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, + int mds) +{ + struct ceph_mds_session *s; + + s = kzalloc(sizeof(*s), GFP_NOFS); + s->s_mdsc = mdsc; + s->s_mds = mds; + s->s_state = CEPH_MDS_SESSION_NEW; + s->s_ttl = 0; + s->s_seq = 0; + mutex_init(&s->s_mutex); + + ceph_con_init(mdsc->client->msgr, &s->s_con); + s->s_con.private = s; + s->s_con.ops = &mds_con_ops; + s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; + s->s_con.peer_name.num = cpu_to_le64(mds); + + spin_lock_init(&s->s_cap_lock); + s->s_cap_gen = 0; + s->s_cap_ttl = 0; + s->s_renew_requested = 0; + s->s_renew_seq = 0; + INIT_LIST_HEAD(&s->s_caps); + s->s_nr_caps = 0; + s->s_trim_caps = 0; + atomic_set(&s->s_ref, 1); + INIT_LIST_HEAD(&s->s_waiting); + INIT_LIST_HEAD(&s->s_unsafe); + s->s_num_cap_releases = 0; + s->s_cap_iterator = NULL; + INIT_LIST_HEAD(&s->s_cap_releases); + INIT_LIST_HEAD(&s->s_cap_releases_done); + INIT_LIST_HEAD(&s->s_cap_flushing); + INIT_LIST_HEAD(&s->s_cap_snaps_flushing); + + dout("register_session mds%d\n", mds); + if (mds >= mdsc->max_sessions) { + int newmax = 1 << get_count_order(mds+1); + struct ceph_mds_session **sa; + + dout("register_session realloc to %d\n", newmax); + sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); + if (sa == NULL) + goto fail_realloc; + if (mdsc->sessions) { + memcpy(sa, mdsc->sessions, + mdsc->max_sessions * sizeof(void *)); + kfree(mdsc->sessions); + } + mdsc->sessions = sa; + mdsc->max_sessions = newmax; + } + mdsc->sessions[mds] = s; + atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ + + ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + return s; + +fail_realloc: + kfree(s); + return ERR_PTR(-ENOMEM); +} + +/* + * called under mdsc->mutex + */ +static void __unregister_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *s) +{ + dout("__unregister_session mds%d %p\n", s->s_mds, s); + BUG_ON(mdsc->sessions[s->s_mds] != s); + mdsc->sessions[s->s_mds] = NULL; + ceph_con_close(&s->s_con); + ceph_put_mds_session(s); +} + +/* + * drop session refs in request. + * + * should be last request ref, or hold mdsc->mutex + */ +static void put_request_session(struct ceph_mds_request *req) +{ + if (req->r_session) { + ceph_put_mds_session(req->r_session); + req->r_session = NULL; + } +} + +void ceph_mdsc_release_request(struct kref *kref) +{ + struct ceph_mds_request *req = container_of(kref, + struct ceph_mds_request, + r_kref); + if (req->r_request) + ceph_msg_put(req->r_request); + if (req->r_reply) { + ceph_msg_put(req->r_reply); + destroy_reply_info(&req->r_reply_info); + } + if (req->r_inode) { + ceph_put_cap_refs(ceph_inode(req->r_inode), + CEPH_CAP_PIN); + iput(req->r_inode); + } + if (req->r_locked_dir) + ceph_put_cap_refs(ceph_inode(req->r_locked_dir), + CEPH_CAP_PIN); + if (req->r_target_inode) + iput(req->r_target_inode); + if (req->r_dentry) + dput(req->r_dentry); + if (req->r_old_dentry) { + ceph_put_cap_refs( + ceph_inode(req->r_old_dentry->d_parent->d_inode), + CEPH_CAP_PIN); + dput(req->r_old_dentry); + } + kfree(req->r_path1); + kfree(req->r_path2); + put_request_session(req); + ceph_unreserve_caps(&req->r_caps_reservation); + kfree(req); +} + +/* + * lookup session, bump ref if found. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, + u64 tid) +{ + struct ceph_mds_request *req; + struct rb_node *n = mdsc->request_tree.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_mds_request, r_node); + if (tid < req->r_tid) + n = n->rb_left; + else if (tid > req->r_tid) + n = n->rb_right; + else { + ceph_mdsc_get_request(req); + return req; + } + } + return NULL; +} + +static void __insert_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *new) +{ + struct rb_node **p = &mdsc->request_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_mds_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_mds_request, r_node); + if (new->r_tid < req->r_tid) + p = &(*p)->rb_left; + else if (new->r_tid > req->r_tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, &mdsc->request_tree); +} + +/* + * Register an in-flight request, and assign a tid. Link to directory + * are modifying (if any). + * + * Called under mdsc->mutex. + */ +static void __register_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + struct inode *dir) +{ + req->r_tid = ++mdsc->last_tid; + if (req->r_num_caps) + ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); + dout("__register_request %p tid %lld\n", req, req->r_tid); + ceph_mdsc_get_request(req); + __insert_request(mdsc, req); + + if (dir) { + struct ceph_inode_info *ci = ceph_inode(dir); + + spin_lock(&ci->i_unsafe_lock); + req->r_unsafe_dir = dir; + list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); + spin_unlock(&ci->i_unsafe_lock); + } +} + +static void __unregister_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + dout("__unregister_request %p tid %lld\n", req, req->r_tid); + rb_erase(&req->r_node, &mdsc->request_tree); + ceph_mdsc_put_request(req); + + if (req->r_unsafe_dir) { + struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); + + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_dir_item); + spin_unlock(&ci->i_unsafe_lock); + } +} + +/* + * Choose mds to send request to next. If there is a hint set in the + * request (e.g., due to a prior forward hint from the mds), use that. + * Otherwise, consult frag tree and/or caps to identify the + * appropriate mds. If all else fails, choose randomly. + * + * Called under mdsc->mutex. + */ +static int __choose_mds(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_cap *cap; + int mode = req->r_direct_mode; + int mds = -1; + u32 hash = req->r_direct_hash; + bool is_hash = req->r_direct_is_hash; + + /* + * is there a specific mds we should try? ignore hint if we have + * no session and the mds is not up (active or recovering). + */ + if (req->r_resend_mds >= 0 && + (__have_session(mdsc, req->r_resend_mds) || + ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { + dout("choose_mds using resend_mds mds%d\n", + req->r_resend_mds); + return req->r_resend_mds; + } + + if (mode == USE_RANDOM_MDS) + goto random; + + inode = NULL; + if (req->r_inode) { + inode = req->r_inode; + } else if (req->r_dentry) { + if (req->r_dentry->d_inode) { + inode = req->r_dentry->d_inode; + } else { + inode = req->r_dentry->d_parent->d_inode; + hash = req->r_dentry->d_name.hash; + is_hash = true; + } + } + dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, + (int)hash, mode); + if (!inode) + goto random; + ci = ceph_inode(inode); + + if (is_hash && S_ISDIR(inode->i_mode)) { + struct ceph_inode_frag frag; + int found; + + ceph_choose_frag(ci, hash, &frag, &found); + if (found) { + if (mode == USE_ANY_MDS && frag.ndist > 0) { + u8 r; + + /* choose a random replica */ + get_random_bytes(&r, 1); + r %= frag.ndist; + mds = frag.dist[r]; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (%d/%d)\n", + inode, ceph_vinop(inode), + frag.frag, frag.mds, + (int)r, frag.ndist); + return mds; + } + + /* since this file/dir wasn't known to be + * replicated, then we want to look for the + * authoritative mds. */ + mode = USE_AUTH_MDS; + if (frag.mds >= 0) { + /* choose auth mds */ + mds = frag.mds; + dout("choose_mds %p %llx.%llx " + "frag %u mds%d (auth)\n", + inode, ceph_vinop(inode), frag.frag, mds); + return mds; + } + } + } + + spin_lock(&inode->i_lock); + cap = NULL; + if (mode == USE_AUTH_MDS) + cap = ci->i_auth_cap; + if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) + cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); + if (!cap) { + spin_unlock(&inode->i_lock); + goto random; + } + mds = cap->session->s_mds; + dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", + inode, ceph_vinop(inode), mds, + cap == ci->i_auth_cap ? "auth " : "", cap); + spin_unlock(&inode->i_lock); + return mds; + +random: + mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); + dout("choose_mds chose random mds%d\n", mds); + return mds; +} + + +/* + * session messages + */ +static struct ceph_msg *create_session_msg(u32 op, u64 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_session_head *h; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); + if (IS_ERR(msg)) { + pr_err("create_session_msg ENOMEM creating msg\n"); + return ERR_PTR(PTR_ERR(msg)); + } + h = msg->front.iov_base; + h->op = cpu_to_le32(op); + h->seq = cpu_to_le64(seq); + return msg; +} + +/* + * send session open request. + * + * called under mdsc->mutex + */ +static int __open_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int mstate; + int mds = session->s_mds; + int err = 0; + + /* wait for mds to go active? */ + mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); + dout("open_session to mds%d (%s)\n", mds, + ceph_mds_state_name(mstate)); + session->s_state = CEPH_MDS_SESSION_OPENING; + session->s_renew_requested = jiffies; + + /* send connect message */ + msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); + if (IS_ERR(msg)) { + err = PTR_ERR(msg); + goto out; + } + ceph_con_send(&session->s_con, msg); + +out: + return 0; +} + +/* + * session caps + */ + +/* + * Free preallocated cap messages assigned to this session + */ +static void cleanup_cap_releases(struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + } + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + ceph_msg_put(msg); + } + spin_unlock(&session->s_cap_lock); +} + +/* + * Helper to safely iterate over all caps associated with a session. + * + * caller must hold session s_mutex + */ +static int iterate_session_caps(struct ceph_mds_session *session, + int (*cb)(struct inode *, struct ceph_cap *, + void *), void *arg) +{ + struct list_head *p; + struct ceph_cap *cap; + struct inode *inode, *last_inode = NULL; + struct ceph_cap *old_cap = NULL; + int ret; + + dout("iterate_session_caps %p mds%d\n", session, session->s_mds); + spin_lock(&session->s_cap_lock); + p = session->s_caps.next; + while (p != &session->s_caps) { + cap = list_entry(p, struct ceph_cap, session_caps); + inode = igrab(&cap->ci->vfs_inode); + if (!inode) { + p = p->next; + continue; + } + session->s_cap_iterator = cap; + spin_unlock(&session->s_cap_lock); + + if (last_inode) { + iput(last_inode); + last_inode = NULL; + } + if (old_cap) { + ceph_put_cap(old_cap); + old_cap = NULL; + } + + ret = cb(inode, cap, arg); + last_inode = inode; + + spin_lock(&session->s_cap_lock); + p = p->next; + if (cap->ci == NULL) { + dout("iterate_session_caps finishing cap %p removal\n", + cap); + BUG_ON(cap->session != session); + list_del_init(&cap->session_caps); + session->s_nr_caps--; + cap->session = NULL; + old_cap = cap; /* put_cap it w/o locks held */ + } + if (ret < 0) + goto out; + } + ret = 0; +out: + session->s_cap_iterator = NULL; + spin_unlock(&session->s_cap_lock); + + if (last_inode) + iput(last_inode); + if (old_cap) + ceph_put_cap(old_cap); + + return ret; +} + +static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + dout("removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->vfs_inode); + ceph_remove_cap(cap); + return 0; +} + +/* + * caller must hold session s_mutex + */ +static void remove_session_caps(struct ceph_mds_session *session) +{ + dout("remove_session_caps on %p\n", session); + iterate_session_caps(session, remove_session_caps_cb, NULL); + BUG_ON(session->s_nr_caps > 0); + cleanup_cap_releases(session); +} + +/* + * wake up any threads waiting on this session's caps. if the cap is + * old (didn't get renewed on the client reconnect), remove it now. + * + * caller must hold s_mutex. + */ +static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, + void *arg) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + wake_up(&ci->i_cap_wq); + if (arg) { + spin_lock(&inode->i_lock); + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); + } + return 0; +} + +static void wake_up_session_caps(struct ceph_mds_session *session, + int reconnect) +{ + dout("wake_up_session_caps %p mds%d\n", session, session->s_mds); + iterate_session_caps(session, wake_up_session_cb, + (void *)(unsigned long)reconnect); +} + +/* + * Send periodic message to MDS renewing all currently held caps. The + * ack will reset the expiration for all caps from this session. + * + * caller holds s_mutex + */ +static int send_renew_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int state; + + if (time_after_eq(jiffies, session->s_cap_ttl) && + time_after_eq(session->s_cap_ttl, session->s_renew_requested)) + pr_info("mds%d caps stale\n", session->s_mds); + + /* do not try to renew caps until a recovering mds has reconnected + * with its clients. */ + state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds); + if (state < CEPH_MDS_STATE_RECONNECT) { + dout("send_renew_caps ignoring mds%d (%s)\n", + session->s_mds, ceph_mds_state_name(state)); + return 0; + } + + dout("send_renew_caps to mds%d (%s)\n", session->s_mds, + ceph_mds_state_name(state)); + session->s_renew_requested = jiffies; + msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + ++session->s_renew_seq); + if (IS_ERR(msg)) + return PTR_ERR(msg); + ceph_con_send(&session->s_con, msg); + return 0; +} + +/* + * Note new cap ttl, and any transition from stale -> not stale (fresh?). + * + * Called under session->s_mutex + */ +static void renewed_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, int is_renew) +{ + int was_stale; + int wake = 0; + + spin_lock(&session->s_cap_lock); + was_stale = is_renew && (session->s_cap_ttl == 0 || + time_after_eq(jiffies, session->s_cap_ttl)); + + session->s_cap_ttl = session->s_renew_requested + + mdsc->mdsmap->m_session_timeout*HZ; + + if (was_stale) { + if (time_before(jiffies, session->s_cap_ttl)) { + pr_info("mds%d caps renewed\n", session->s_mds); + wake = 1; + } else { + pr_info("mds%d caps still stale\n", session->s_mds); + } + } + dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n", + session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh", + time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh"); + spin_unlock(&session->s_cap_lock); + + if (wake) + wake_up_session_caps(session, 0); +} + +/* + * send a session close request + */ +static int request_close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + int err = 0; + + dout("request_close_session mds%d state %s seq %lld\n", + session->s_mds, session_state_name(session->s_state), + session->s_seq); + msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); + if (IS_ERR(msg)) + err = PTR_ERR(msg); + else + ceph_con_send(&session->s_con, msg); + return err; +} + +/* + * Called with s_mutex held. + */ +static int __close_session(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + if (session->s_state >= CEPH_MDS_SESSION_CLOSING) + return 0; + session->s_state = CEPH_MDS_SESSION_CLOSING; + return request_close_session(mdsc, session); +} + +/* + * Trim old(er) caps. + * + * Because we can't cache an inode without one or more caps, we do + * this indirectly: if a cap is unused, we prune its aliases, at which + * point the inode will hopefully get dropped to. + * + * Yes, this is a bit sloppy. Our only real goal here is to respond to + * memory pressure from the MDS, though, so it needn't be perfect. + */ +static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) +{ + struct ceph_mds_session *session = arg; + struct ceph_inode_info *ci = ceph_inode(inode); + int used, oissued, mine; + + if (session->s_trim_caps <= 0) + return -1; + + spin_lock(&inode->i_lock); + mine = cap->issued | cap->implemented; + used = __ceph_caps_used(ci); + oissued = __ceph_caps_issued_other(ci, cap); + + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", + inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), + ceph_cap_string(used)); + if (ci->i_dirty_caps) + goto out; /* dirty caps */ + if ((used & ~oissued) & mine) + goto out; /* we need these caps */ + + session->s_trim_caps--; + if (oissued) { + /* we aren't the only cap.. just remove us */ + __ceph_remove_cap(cap); + } else { + /* try to drop referring dentries */ + spin_unlock(&inode->i_lock); + d_prune_aliases(inode); + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, atomic_read(&inode->i_count)); + return 0; + } + +out: + spin_unlock(&inode->i_lock); + return 0; +} + +/* + * Trim session cap count down to some max number. + */ +static int trim_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int max_caps) +{ + int trim_caps = session->s_nr_caps - max_caps; + + dout("trim_caps mds%d start: %d / %d, trim %d\n", + session->s_mds, session->s_nr_caps, max_caps, trim_caps); + if (trim_caps > 0) { + session->s_trim_caps = trim_caps; + iterate_session_caps(session, trim_caps_cb, session); + dout("trim_caps mds%d done: %d / %d, trimmed %d\n", + session->s_mds, session->s_nr_caps, max_caps, + trim_caps - session->s_trim_caps); + session->s_trim_caps = 0; + } + return 0; +} + +/* + * Allocate cap_release messages. If there is a partially full message + * in the queue, try to allocate enough to cover it's remainder, so that + * we can send it immediately. + * + * Called under s_mutex. + */ +static int add_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int extra) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + int err = -ENOMEM; + + if (extra < 0) + extra = mdsc->client->mount_args->cap_release_safety; + + spin_lock(&session->s_cap_lock); + + if (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, + list_head); + head = msg->front.iov_base; + extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); + } + + while (session->s_num_cap_releases < session->s_nr_caps + extra) { + spin_unlock(&session->s_cap_lock); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, + 0, 0, NULL); + if (!msg) + goto out_unlocked; + dout("add_cap_releases %p msg %p now %d\n", session, msg, + (int)msg->front.iov_len); + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + spin_lock(&session->s_cap_lock); + list_add(&msg->list_head, &session->s_cap_releases); + session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; + } + + if (!list_empty(&session->s_cap_releases)) { + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, + list_head); + head = msg->front.iov_base; + if (head->num) { + dout(" queueing non-full %p (%d)\n", msg, + le32_to_cpu(head->num)); + list_move_tail(&msg->list_head, + &session->s_cap_releases_done); + session->s_num_cap_releases -= + CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); + } + } + err = 0; + spin_unlock(&session->s_cap_lock); +out_unlocked: + return err; +} + +/* + * flush all dirty inode data to disk. + * + * returns true if we've flushed through want_flush_seq + */ +static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +{ + int mds, ret = 1; + + dout("check_cap_flush want %lld\n", want_flush_seq); + mutex_lock(&mdsc->mutex); + for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { + struct ceph_mds_session *session = mdsc->sessions[mds]; + + if (!session) + continue; + get_session(session); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&session->s_mutex); + if (!list_empty(&session->s_cap_flushing)) { + struct ceph_inode_info *ci = + list_entry(session->s_cap_flushing.next, + struct ceph_inode_info, + i_flushing_item); + struct inode *inode = &ci->vfs_inode; + + spin_lock(&inode->i_lock); + if (ci->i_cap_flush_seq <= want_flush_seq) { + dout("check_cap_flush still flushing %p " + "seq %lld <= %lld to mds%d\n", inode, + ci->i_cap_flush_seq, want_flush_seq, + session->s_mds); + ret = 0; + } + spin_unlock(&inode->i_lock); + } + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); + + if (!ret) + return ret; + mutex_lock(&mdsc->mutex); + } + + mutex_unlock(&mdsc->mutex); + dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); + return ret; +} + +/* + * called under s_mutex + */ +static void send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + + dout("send_cap_releases mds%d\n", session->s_mds); + while (1) { + spin_lock(&session->s_cap_lock); + if (list_empty(&session->s_cap_releases_done)) + break; + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + spin_unlock(&session->s_cap_lock); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); + } + spin_unlock(&session->s_cap_lock); +} + +/* + * requests + */ + +/* + * Create an mds request. + */ +struct ceph_mds_request * +ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) +{ + struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + + if (!req) + return ERR_PTR(-ENOMEM); + + req->r_started = jiffies; + req->r_resend_mds = -1; + INIT_LIST_HEAD(&req->r_unsafe_dir_item); + req->r_fmode = -1; + kref_init(&req->r_kref); + INIT_LIST_HEAD(&req->r_wait); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + INIT_LIST_HEAD(&req->r_unsafe_item); + + req->r_op = op; + req->r_direct_mode = mode; + return req; +} + +/* + * return oldest (lowest) request, tid in request tree, 0 if none. + * + * called under mdsc->mutex. + */ +static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) +{ + if (RB_EMPTY_ROOT(&mdsc->request_tree)) + return NULL; + return rb_entry(rb_first(&mdsc->request_tree), + struct ceph_mds_request, r_node); +} + +static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +{ + struct ceph_mds_request *req = __get_oldest_r |