aboutsummaryrefslogtreecommitdiff
path: root/fs/ceph/mds_client.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph/mds_client.c')
-rw-r--r--fs/ceph/mds_client.c3021
1 files changed, 3021 insertions, 0 deletions
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 00000000000..a2600101ec2
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3021 @@
+#include "ceph_debug.h"
+
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include "mds_client.h"
+#include "mon_client.h"
+#include "super.h"
+#include "messenger.h"
+#include "decode.h"
+#include "auth.h"
+#include "pagelist.h"
+
+/*
+ * A cluster of MDS (metadata server) daemons is responsible for
+ * managing the file system namespace (the directory hierarchy and
+ * inodes) and for coordinating shared access to storage. Metadata is
+ * partitioning hierarchically across a number of servers, and that
+ * partition varies over time as the cluster adjusts the distribution
+ * in order to balance load.
+ *
+ * The MDS client is primarily responsible to managing synchronous
+ * metadata requests for operations like open, unlink, and so forth.
+ * If there is a MDS failure, we find out about it when we (possibly
+ * request and) receive a new MDS map, and can resubmit affected
+ * requests.
+ *
+ * For the most part, though, we take advantage of a lossless
+ * communications channel to the MDS, and do not need to worry about
+ * timing out or resubmitting requests.
+ *
+ * We maintain a stateful "session" with each MDS we interact with.
+ * Within each session, we sent periodic heartbeat messages to ensure
+ * any capabilities or leases we have been issues remain valid. If
+ * the session times out and goes stale, our leases and capabilities
+ * are no longer valid.
+ */
+
+static void __wake_requests(struct ceph_mds_client *mdsc,
+ struct list_head *head);
+
+const static struct ceph_connection_operations mds_con_ops;
+
+
+/*
+ * mds reply parsing
+ */
+
+/*
+ * parse individual inode info
+ */
+static int parse_reply_info_in(void **p, void *end,
+ struct ceph_mds_reply_info_in *info)
+{
+ int err = -EIO;
+
+ info->in = *p;
+ *p += sizeof(struct ceph_mds_reply_inode) +
+ sizeof(*info->in->fragtree.splits) *
+ le32_to_cpu(info->in->fragtree.nsplits);
+
+ ceph_decode_32_safe(p, end, info->symlink_len, bad);
+ ceph_decode_need(p, end, info->symlink_len, bad);
+ info->symlink = *p;
+ *p += info->symlink_len;
+
+ ceph_decode_32_safe(p, end, info->xattr_len, bad);
+ ceph_decode_need(p, end, info->xattr_len, bad);
+ info->xattr_data = *p;
+ *p += info->xattr_len;
+ return 0;
+bad:
+ return err;
+}
+
+/*
+ * parse a normal reply, which may contain a (dir+)dentry and/or a
+ * target inode.
+ */
+static int parse_reply_info_trace(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ int err;
+
+ if (info->head->is_dentry) {
+ err = parse_reply_info_in(p, end, &info->diri);
+ if (err < 0)
+ goto out_bad;
+
+ if (unlikely(*p + sizeof(*info->dirfrag) > end))
+ goto bad;
+ info->dirfrag = *p;
+ *p += sizeof(*info->dirfrag) +
+ sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
+ if (unlikely(*p > end))
+ goto bad;
+
+ ceph_decode_32_safe(p, end, info->dname_len, bad);
+ ceph_decode_need(p, end, info->dname_len, bad);
+ info->dname = *p;
+ *p += info->dname_len;
+ info->dlease = *p;
+ *p += sizeof(*info->dlease);
+ }
+
+ if (info->head->is_target) {
+ err = parse_reply_info_in(p, end, &info->targeti);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("problem parsing mds trace %d\n", err);
+ return err;
+}
+
+/*
+ * parse readdir results
+ */
+static int parse_reply_info_dir(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ u32 num, i = 0;
+ int err;
+
+ info->dir_dir = *p;
+ if (*p + sizeof(*info->dir_dir) > end)
+ goto bad;
+ *p += sizeof(*info->dir_dir) +
+ sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
+ if (*p > end)
+ goto bad;
+
+ ceph_decode_need(p, end, sizeof(num) + 2, bad);
+ num = ceph_decode_32(p);
+ info->dir_end = ceph_decode_8(p);
+ info->dir_complete = ceph_decode_8(p);
+ if (num == 0)
+ goto done;
+
+ /* alloc large array */
+ info->dir_nr = num;
+ info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
+ sizeof(*info->dir_dname) +
+ sizeof(*info->dir_dname_len) +
+ sizeof(*info->dir_dlease),
+ GFP_NOFS);
+ if (info->dir_in == NULL) {
+ err = -ENOMEM;
+ goto out_bad;
+ }
+ info->dir_dname = (void *)(info->dir_in + num);
+ info->dir_dname_len = (void *)(info->dir_dname + num);
+ info->dir_dlease = (void *)(info->dir_dname_len + num);
+
+ while (num) {
+ /* dentry */
+ ceph_decode_need(p, end, sizeof(u32)*2, bad);
+ info->dir_dname_len[i] = ceph_decode_32(p);
+ ceph_decode_need(p, end, info->dir_dname_len[i], bad);
+ info->dir_dname[i] = *p;
+ *p += info->dir_dname_len[i];
+ dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
+ info->dir_dname[i]);
+ info->dir_dlease[i] = *p;
+ *p += sizeof(struct ceph_mds_reply_lease);
+
+ /* inode */
+ err = parse_reply_info_in(p, end, &info->dir_in[i]);
+ if (err < 0)
+ goto out_bad;
+ i++;
+ num--;
+ }
+
+done:
+ if (*p != end)
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("problem parsing dir contents %d\n", err);
+ return err;
+}
+
+/*
+ * parse entire mds reply
+ */
+static int parse_reply_info(struct ceph_msg *msg,
+ struct ceph_mds_reply_info_parsed *info)
+{
+ void *p, *end;
+ u32 len;
+ int err;
+
+ info->head = msg->front.iov_base;
+ p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
+ end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
+
+ /* trace */
+ ceph_decode_32_safe(&p, end, len, bad);
+ if (len > 0) {
+ err = parse_reply_info_trace(&p, p+len, info);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ /* dir content */
+ ceph_decode_32_safe(&p, end, len, bad);
+ if (len > 0) {
+ err = parse_reply_info_dir(&p, p+len, info);
+ if (err < 0)
+ goto out_bad;
+ }
+
+ /* snap blob */
+ ceph_decode_32_safe(&p, end, len, bad);
+ info->snapblob_len = len;
+ info->snapblob = p;
+ p += len;
+
+ if (p != end)
+ goto bad;
+ return 0;
+
+bad:
+ err = -EIO;
+out_bad:
+ pr_err("mds parse_reply err %d\n", err);
+ return err;
+}
+
+static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
+{
+ kfree(info->dir_in);
+}
+
+
+/*
+ * sessions
+ */
+static const char *session_state_name(int s)
+{
+ switch (s) {
+ case CEPH_MDS_SESSION_NEW: return "new";
+ case CEPH_MDS_SESSION_OPENING: return "opening";
+ case CEPH_MDS_SESSION_OPEN: return "open";
+ case CEPH_MDS_SESSION_HUNG: return "hung";
+ case CEPH_MDS_SESSION_CLOSING: return "closing";
+ case CEPH_MDS_SESSION_RESTARTING: return "restarting";
+ case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
+ default: return "???";
+ }
+}
+
+static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+{
+ if (atomic_inc_not_zero(&s->s_ref)) {
+ dout("mdsc get_session %p %d -> %d\n", s,
+ atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
+ return s;
+ } else {
+ dout("mdsc get_session %p 0 -- FAIL", s);
+ return NULL;
+ }
+}
+
+void ceph_put_mds_session(struct ceph_mds_session *s)
+{
+ dout("mdsc put_session %p %d -> %d\n", s,
+ atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
+ if (atomic_dec_and_test(&s->s_ref)) {
+ if (s->s_authorizer)
+ s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
+ s->s_mdsc->client->monc.auth, s->s_authorizer);
+ kfree(s);
+ }
+}
+
+/*
+ * called under mdsc->mutex
+ */
+struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
+ int mds)
+{
+ struct ceph_mds_session *session;
+
+ if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
+ return NULL;
+ session = mdsc->sessions[mds];
+ dout("lookup_mds_session %p %d\n", session,
+ atomic_read(&session->s_ref));
+ get_session(session);
+ return session;
+}
+
+static bool __have_session(struct ceph_mds_client *mdsc, int mds)
+{
+ if (mds >= mdsc->max_sessions)
+ return false;
+ return mdsc->sessions[mds];
+}
+
+static int __verify_registered_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ if (s->s_mds >= mdsc->max_sessions ||
+ mdsc->sessions[s->s_mds] != s)
+ return -ENOENT;
+ return 0;
+}
+
+/*
+ * create+register a new session for given mds.
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
+ int mds)
+{
+ struct ceph_mds_session *s;
+
+ s = kzalloc(sizeof(*s), GFP_NOFS);
+ s->s_mdsc = mdsc;
+ s->s_mds = mds;
+ s->s_state = CEPH_MDS_SESSION_NEW;
+ s->s_ttl = 0;
+ s->s_seq = 0;
+ mutex_init(&s->s_mutex);
+
+ ceph_con_init(mdsc->client->msgr, &s->s_con);
+ s->s_con.private = s;
+ s->s_con.ops = &mds_con_ops;
+ s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
+ s->s_con.peer_name.num = cpu_to_le64(mds);
+
+ spin_lock_init(&s->s_cap_lock);
+ s->s_cap_gen = 0;
+ s->s_cap_ttl = 0;
+ s->s_renew_requested = 0;
+ s->s_renew_seq = 0;
+ INIT_LIST_HEAD(&s->s_caps);
+ s->s_nr_caps = 0;
+ s->s_trim_caps = 0;
+ atomic_set(&s->s_ref, 1);
+ INIT_LIST_HEAD(&s->s_waiting);
+ INIT_LIST_HEAD(&s->s_unsafe);
+ s->s_num_cap_releases = 0;
+ s->s_cap_iterator = NULL;
+ INIT_LIST_HEAD(&s->s_cap_releases);
+ INIT_LIST_HEAD(&s->s_cap_releases_done);
+ INIT_LIST_HEAD(&s->s_cap_flushing);
+ INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
+
+ dout("register_session mds%d\n", mds);
+ if (mds >= mdsc->max_sessions) {
+ int newmax = 1 << get_count_order(mds+1);
+ struct ceph_mds_session **sa;
+
+ dout("register_session realloc to %d\n", newmax);
+ sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
+ if (sa == NULL)
+ goto fail_realloc;
+ if (mdsc->sessions) {
+ memcpy(sa, mdsc->sessions,
+ mdsc->max_sessions * sizeof(void *));
+ kfree(mdsc->sessions);
+ }
+ mdsc->sessions = sa;
+ mdsc->max_sessions = newmax;
+ }
+ mdsc->sessions[mds] = s;
+ atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
+
+ ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+
+ return s;
+
+fail_realloc:
+ kfree(s);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * called under mdsc->mutex
+ */
+static void __unregister_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *s)
+{
+ dout("__unregister_session mds%d %p\n", s->s_mds, s);
+ BUG_ON(mdsc->sessions[s->s_mds] != s);
+ mdsc->sessions[s->s_mds] = NULL;
+ ceph_con_close(&s->s_con);
+ ceph_put_mds_session(s);
+}
+
+/*
+ * drop session refs in request.
+ *
+ * should be last request ref, or hold mdsc->mutex
+ */
+static void put_request_session(struct ceph_mds_request *req)
+{
+ if (req->r_session) {
+ ceph_put_mds_session(req->r_session);
+ req->r_session = NULL;
+ }
+}
+
+void ceph_mdsc_release_request(struct kref *kref)
+{
+ struct ceph_mds_request *req = container_of(kref,
+ struct ceph_mds_request,
+ r_kref);
+ if (req->r_request)
+ ceph_msg_put(req->r_request);
+ if (req->r_reply) {
+ ceph_msg_put(req->r_reply);
+ destroy_reply_info(&req->r_reply_info);
+ }
+ if (req->r_inode) {
+ ceph_put_cap_refs(ceph_inode(req->r_inode),
+ CEPH_CAP_PIN);
+ iput(req->r_inode);
+ }
+ if (req->r_locked_dir)
+ ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
+ CEPH_CAP_PIN);
+ if (req->r_target_inode)
+ iput(req->r_target_inode);
+ if (req->r_dentry)
+ dput(req->r_dentry);
+ if (req->r_old_dentry) {
+ ceph_put_cap_refs(
+ ceph_inode(req->r_old_dentry->d_parent->d_inode),
+ CEPH_CAP_PIN);
+ dput(req->r_old_dentry);
+ }
+ kfree(req->r_path1);
+ kfree(req->r_path2);
+ put_request_session(req);
+ ceph_unreserve_caps(&req->r_caps_reservation);
+ kfree(req);
+}
+
+/*
+ * lookup session, bump ref if found.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
+ u64 tid)
+{
+ struct ceph_mds_request *req;
+ struct rb_node *n = mdsc->request_tree.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_mds_request, r_node);
+ if (tid < req->r_tid)
+ n = n->rb_left;
+ else if (tid > req->r_tid)
+ n = n->rb_right;
+ else {
+ ceph_mdsc_get_request(req);
+ return req;
+ }
+ }
+ return NULL;
+}
+
+static void __insert_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *new)
+{
+ struct rb_node **p = &mdsc->request_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_mds_request *req = NULL;
+
+ while (*p) {
+ parent = *p;
+ req = rb_entry(parent, struct ceph_mds_request, r_node);
+ if (new->r_tid < req->r_tid)
+ p = &(*p)->rb_left;
+ else if (new->r_tid > req->r_tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, &mdsc->request_tree);
+}
+
+/*
+ * Register an in-flight request, and assign a tid. Link to directory
+ * are modifying (if any).
+ *
+ * Called under mdsc->mutex.
+ */
+static void __register_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ struct inode *dir)
+{
+ req->r_tid = ++mdsc->last_tid;
+ if (req->r_num_caps)
+ ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
+ dout("__register_request %p tid %lld\n", req, req->r_tid);
+ ceph_mdsc_get_request(req);
+ __insert_request(mdsc, req);
+
+ if (dir) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+
+ spin_lock(&ci->i_unsafe_lock);
+ req->r_unsafe_dir = dir;
+ list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+}
+
+static void __unregister_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ rb_erase(&req->r_node, &mdsc->request_tree);
+ ceph_mdsc_put_request(req);
+
+ if (req->r_unsafe_dir) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
+
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_dir_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+}
+
+/*
+ * Choose mds to send request to next. If there is a hint set in the
+ * request (e.g., due to a prior forward hint from the mds), use that.
+ * Otherwise, consult frag tree and/or caps to identify the
+ * appropriate mds. If all else fails, choose randomly.
+ *
+ * Called under mdsc->mutex.
+ */
+static int __choose_mds(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_cap *cap;
+ int mode = req->r_direct_mode;
+ int mds = -1;
+ u32 hash = req->r_direct_hash;
+ bool is_hash = req->r_direct_is_hash;
+
+ /*
+ * is there a specific mds we should try? ignore hint if we have
+ * no session and the mds is not up (active or recovering).
+ */
+ if (req->r_resend_mds >= 0 &&
+ (__have_session(mdsc, req->r_resend_mds) ||
+ ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
+ dout("choose_mds using resend_mds mds%d\n",
+ req->r_resend_mds);
+ return req->r_resend_mds;
+ }
+
+ if (mode == USE_RANDOM_MDS)
+ goto random;
+
+ inode = NULL;
+ if (req->r_inode) {
+ inode = req->r_inode;
+ } else if (req->r_dentry) {
+ if (req->r_dentry->d_inode) {
+ inode = req->r_dentry->d_inode;
+ } else {
+ inode = req->r_dentry->d_parent->d_inode;
+ hash = req->r_dentry->d_name.hash;
+ is_hash = true;
+ }
+ }
+ dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
+ (int)hash, mode);
+ if (!inode)
+ goto random;
+ ci = ceph_inode(inode);
+
+ if (is_hash && S_ISDIR(inode->i_mode)) {
+ struct ceph_inode_frag frag;
+ int found;
+
+ ceph_choose_frag(ci, hash, &frag, &found);
+ if (found) {
+ if (mode == USE_ANY_MDS && frag.ndist > 0) {
+ u8 r;
+
+ /* choose a random replica */
+ get_random_bytes(&r, 1);
+ r %= frag.ndist;
+ mds = frag.dist[r];
+ dout("choose_mds %p %llx.%llx "
+ "frag %u mds%d (%d/%d)\n",
+ inode, ceph_vinop(inode),
+ frag.frag, frag.mds,
+ (int)r, frag.ndist);
+ return mds;
+ }
+
+ /* since this file/dir wasn't known to be
+ * replicated, then we want to look for the
+ * authoritative mds. */
+ mode = USE_AUTH_MDS;
+ if (frag.mds >= 0) {
+ /* choose auth mds */
+ mds = frag.mds;
+ dout("choose_mds %p %llx.%llx "
+ "frag %u mds%d (auth)\n",
+ inode, ceph_vinop(inode), frag.frag, mds);
+ return mds;
+ }
+ }
+ }
+
+ spin_lock(&inode->i_lock);
+ cap = NULL;
+ if (mode == USE_AUTH_MDS)
+ cap = ci->i_auth_cap;
+ if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
+ cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
+ if (!cap) {
+ spin_unlock(&inode->i_lock);
+ goto random;
+ }
+ mds = cap->session->s_mds;
+ dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+ inode, ceph_vinop(inode), mds,
+ cap == ci->i_auth_cap ? "auth " : "", cap);
+ spin_unlock(&inode->i_lock);
+ return mds;
+
+random:
+ mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
+ dout("choose_mds chose random mds%d\n", mds);
+ return mds;
+}
+
+
+/*
+ * session messages
+ */
+static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_session_head *h;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
+ if (IS_ERR(msg)) {
+ pr_err("create_session_msg ENOMEM creating msg\n");
+ return ERR_PTR(PTR_ERR(msg));
+ }
+ h = msg->front.iov_base;
+ h->op = cpu_to_le32(op);
+ h->seq = cpu_to_le64(seq);
+ return msg;
+}
+
+/*
+ * send session open request.
+ *
+ * called under mdsc->mutex
+ */
+static int __open_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int mstate;
+ int mds = session->s_mds;
+ int err = 0;
+
+ /* wait for mds to go active? */
+ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
+ dout("open_session to mds%d (%s)\n", mds,
+ ceph_mds_state_name(mstate));
+ session->s_state = CEPH_MDS_SESSION_OPENING;
+ session->s_renew_requested = jiffies;
+
+ /* send connect message */
+ msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
+ if (IS_ERR(msg)) {
+ err = PTR_ERR(msg);
+ goto out;
+ }
+ ceph_con_send(&session->s_con, msg);
+
+out:
+ return 0;
+}
+
+/*
+ * session caps
+ */
+
+/*
+ * Free preallocated cap messages assigned to this session
+ */
+static void cleanup_cap_releases(struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+
+ spin_lock(&session->s_cap_lock);
+ while (!list_empty(&session->s_cap_releases)) {
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+ }
+ while (!list_empty(&session->s_cap_releases_done)) {
+ msg = list_first_entry(&session->s_cap_releases_done,
+ struct ceph_msg, list_head);
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+ }
+ spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * Helper to safely iterate over all caps associated with a session.
+ *
+ * caller must hold session s_mutex
+ */
+static int iterate_session_caps(struct ceph_mds_session *session,
+ int (*cb)(struct inode *, struct ceph_cap *,
+ void *), void *arg)
+{
+ struct list_head *p;
+ struct ceph_cap *cap;
+ struct inode *inode, *last_inode = NULL;
+ struct ceph_cap *old_cap = NULL;
+ int ret;
+
+ dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
+ spin_lock(&session->s_cap_lock);
+ p = session->s_caps.next;
+ while (p != &session->s_caps) {
+ cap = list_entry(p, struct ceph_cap, session_caps);
+ inode = igrab(&cap->ci->vfs_inode);
+ if (!inode) {
+ p = p->next;
+ continue;
+ }
+ session->s_cap_iterator = cap;
+ spin_unlock(&session->s_cap_lock);
+
+ if (last_inode) {
+ iput(last_inode);
+ last_inode = NULL;
+ }
+ if (old_cap) {
+ ceph_put_cap(old_cap);
+ old_cap = NULL;
+ }
+
+ ret = cb(inode, cap, arg);
+ last_inode = inode;
+
+ spin_lock(&session->s_cap_lock);
+ p = p->next;
+ if (cap->ci == NULL) {
+ dout("iterate_session_caps finishing cap %p removal\n",
+ cap);
+ BUG_ON(cap->session != session);
+ list_del_init(&cap->session_caps);
+ session->s_nr_caps--;
+ cap->session = NULL;
+ old_cap = cap; /* put_cap it w/o locks held */
+ }
+ if (ret < 0)
+ goto out;
+ }
+ ret = 0;
+out:
+ session->s_cap_iterator = NULL;
+ spin_unlock(&session->s_cap_lock);
+
+ if (last_inode)
+ iput(last_inode);
+ if (old_cap)
+ ceph_put_cap(old_cap);
+
+ return ret;
+}
+
+static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ dout("removing cap %p, ci is %p, inode is %p\n",
+ cap, ci, &ci->vfs_inode);
+ ceph_remove_cap(cap);
+ return 0;
+}
+
+/*
+ * caller must hold session s_mutex
+ */
+static void remove_session_caps(struct ceph_mds_session *session)
+{
+ dout("remove_session_caps on %p\n", session);
+ iterate_session_caps(session, remove_session_caps_cb, NULL);
+ BUG_ON(session->s_nr_caps > 0);
+ cleanup_cap_releases(session);
+}
+
+/*
+ * wake up any threads waiting on this session's caps. if the cap is
+ * old (didn't get renewed on the client reconnect), remove it now.
+ *
+ * caller must hold s_mutex.
+ */
+static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
+ void *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ wake_up(&ci->i_cap_wq);
+ if (arg) {
+ spin_lock(&inode->i_lock);
+ ci->i_wanted_max_size = 0;
+ ci->i_requested_max_size = 0;
+ spin_unlock(&inode->i_lock);
+ }
+ return 0;
+}
+
+static void wake_up_session_caps(struct ceph_mds_session *session,
+ int reconnect)
+{
+ dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
+ iterate_session_caps(session, wake_up_session_cb,
+ (void *)(unsigned long)reconnect);
+}
+
+/*
+ * Send periodic message to MDS renewing all currently held caps. The
+ * ack will reset the expiration for all caps from this session.
+ *
+ * caller holds s_mutex
+ */
+static int send_renew_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int state;
+
+ if (time_after_eq(jiffies, session->s_cap_ttl) &&
+ time_after_eq(session->s_cap_ttl, session->s_renew_requested))
+ pr_info("mds%d caps stale\n", session->s_mds);
+
+ /* do not try to renew caps until a recovering mds has reconnected
+ * with its clients. */
+ state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
+ if (state < CEPH_MDS_STATE_RECONNECT) {
+ dout("send_renew_caps ignoring mds%d (%s)\n",
+ session->s_mds, ceph_mds_state_name(state));
+ return 0;
+ }
+
+ dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
+ ceph_mds_state_name(state));
+ session->s_renew_requested = jiffies;
+ msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ ++session->s_renew_seq);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+/*
+ * Note new cap ttl, and any transition from stale -> not stale (fresh?).
+ *
+ * Called under session->s_mutex
+ */
+static void renewed_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session, int is_renew)
+{
+ int was_stale;
+ int wake = 0;
+
+ spin_lock(&session->s_cap_lock);
+ was_stale = is_renew && (session->s_cap_ttl == 0 ||
+ time_after_eq(jiffies, session->s_cap_ttl));
+
+ session->s_cap_ttl = session->s_renew_requested +
+ mdsc->mdsmap->m_session_timeout*HZ;
+
+ if (was_stale) {
+ if (time_before(jiffies, session->s_cap_ttl)) {
+ pr_info("mds%d caps renewed\n", session->s_mds);
+ wake = 1;
+ } else {
+ pr_info("mds%d caps still stale\n", session->s_mds);
+ }
+ }
+ dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
+ session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
+ time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
+ spin_unlock(&session->s_cap_lock);
+
+ if (wake)
+ wake_up_session_caps(session, 0);
+}
+
+/*
+ * send a session close request
+ */
+static int request_close_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ int err = 0;
+
+ dout("request_close_session mds%d state %s seq %lld\n",
+ session->s_mds, session_state_name(session->s_state),
+ session->s_seq);
+ msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+ if (IS_ERR(msg))
+ err = PTR_ERR(msg);
+ else
+ ceph_con_send(&session->s_con, msg);
+ return err;
+}
+
+/*
+ * Called with s_mutex held.
+ */
+static int __close_session(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
+ return 0;
+ session->s_state = CEPH_MDS_SESSION_CLOSING;
+ return request_close_session(mdsc, session);
+}
+
+/*
+ * Trim old(er) caps.
+ *
+ * Because we can't cache an inode without one or more caps, we do
+ * this indirectly: if a cap is unused, we prune its aliases, at which
+ * point the inode will hopefully get dropped to.
+ *
+ * Yes, this is a bit sloppy. Our only real goal here is to respond to
+ * memory pressure from the MDS, though, so it needn't be perfect.
+ */
+static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
+{
+ struct ceph_mds_session *session = arg;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int used, oissued, mine;
+
+ if (session->s_trim_caps <= 0)
+ return -1;
+
+ spin_lock(&inode->i_lock);
+ mine = cap->issued | cap->implemented;
+ used = __ceph_caps_used(ci);
+ oissued = __ceph_caps_issued_other(ci, cap);
+
+ dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
+ inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
+ ceph_cap_string(used));
+ if (ci->i_dirty_caps)
+ goto out; /* dirty caps */
+ if ((used & ~oissued) & mine)
+ goto out; /* we need these caps */
+
+ session->s_trim_caps--;
+ if (oissued) {
+ /* we aren't the only cap.. just remove us */
+ __ceph_remove_cap(cap);
+ } else {
+ /* try to drop referring dentries */
+ spin_unlock(&inode->i_lock);
+ d_prune_aliases(inode);
+ dout("trim_caps_cb %p cap %p pruned, count now %d\n",
+ inode, cap, atomic_read(&inode->i_count));
+ return 0;
+ }
+
+out:
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
+/*
+ * Trim session cap count down to some max number.
+ */
+static int trim_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int max_caps)
+{
+ int trim_caps = session->s_nr_caps - max_caps;
+
+ dout("trim_caps mds%d start: %d / %d, trim %d\n",
+ session->s_mds, session->s_nr_caps, max_caps, trim_caps);
+ if (trim_caps > 0) {
+ session->s_trim_caps = trim_caps;
+ iterate_session_caps(session, trim_caps_cb, session);
+ dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
+ session->s_mds, session->s_nr_caps, max_caps,
+ trim_caps - session->s_trim_caps);
+ session->s_trim_caps = 0;
+ }
+ return 0;
+}
+
+/*
+ * Allocate cap_release messages. If there is a partially full message
+ * in the queue, try to allocate enough to cover it's remainder, so that
+ * we can send it immediately.
+ *
+ * Called under s_mutex.
+ */
+static int add_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ int extra)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_cap_release *head;
+ int err = -ENOMEM;
+
+ if (extra < 0)
+ extra = mdsc->client->mount_args->cap_release_safety;
+
+ spin_lock(&session->s_cap_lock);
+
+ if (!list_empty(&session->s_cap_releases)) {
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg,
+ list_head);
+ head = msg->front.iov_base;
+ extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+ }
+
+ while (session->s_num_cap_releases < session->s_nr_caps + extra) {
+ spin_unlock(&session->s_cap_lock);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
+ 0, 0, NULL);
+ if (!msg)
+ goto out_unlocked;
+ dout("add_cap_releases %p msg %p now %d\n", session, msg,
+ (int)msg->front.iov_len);
+ head = msg->front.iov_base;
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+ spin_lock(&session->s_cap_lock);
+ list_add(&msg->list_head, &session->s_cap_releases);
+ session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
+ }
+
+ if (!list_empty(&session->s_cap_releases)) {
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg,
+ list_head);
+ head = msg->front.iov_base;
+ if (head->num) {
+ dout(" queueing non-full %p (%d)\n", msg,
+ le32_to_cpu(head->num));
+ list_move_tail(&msg->list_head,
+ &session->s_cap_releases_done);
+ session->s_num_cap_releases -=
+ CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+ }
+ }
+ err = 0;
+ spin_unlock(&session->s_cap_lock);
+out_unlocked:
+ return err;
+}
+
+/*
+ * flush all dirty inode data to disk.
+ *
+ * returns true if we've flushed through want_flush_seq
+ */
+static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+{
+ int mds, ret = 1;
+
+ dout("check_cap_flush want %lld\n", want_flush_seq);
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+ struct ceph_mds_session *session = mdsc->sessions[mds];
+
+ if (!session)
+ continue;
+ get_session(session);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&session->s_mutex);
+ if (!list_empty(&session->s_cap_flushing)) {
+ struct ceph_inode_info *ci =
+ list_entry(session->s_cap_flushing.next,
+ struct ceph_inode_info,
+ i_flushing_item);
+ struct inode *inode = &ci->vfs_inode;
+
+ spin_lock(&inode->i_lock);
+ if (ci->i_cap_flush_seq <= want_flush_seq) {
+ dout("check_cap_flush still flushing %p "
+ "seq %lld <= %lld to mds%d\n", inode,
+ ci->i_cap_flush_seq, want_flush_seq,
+ session->s_mds);
+ ret = 0;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+
+ if (!ret)
+ return ret;
+ mutex_lock(&mdsc->mutex);
+ }
+
+ mutex_unlock(&mdsc->mutex);
+ dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
+ return ret;
+}
+
+/*
+ * called under s_mutex
+ */
+static void send_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+
+ dout("send_cap_releases mds%d\n", session->s_mds);
+ while (1) {
+ spin_lock(&session->s_cap_lock);
+ if (list_empty(&session->s_cap_releases_done))
+ break;
+ msg = list_first_entry(&session->s_cap_releases_done,
+ struct ceph_msg, list_head);
+ list_del_init(&msg->list_head);
+ spin_unlock(&session->s_cap_lock);
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
+ ceph_con_send(&session->s_con, msg);
+ }
+ spin_unlock(&session->s_cap_lock);
+}
+
+/*
+ * requests
+ */
+
+/*
+ * Create an mds request.
+ */
+struct ceph_mds_request *
+ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
+{
+ struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
+
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ req->r_started = jiffies;
+ req->r_resend_mds = -1;
+ INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+ req->r_fmode = -1;
+ kref_init(&req->r_kref);
+ INIT_LIST_HEAD(&req->r_wait);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+
+ req->r_op = op;
+ req->r_direct_mode = mode;
+ return req;
+}
+
+/*
+ * return oldest (lowest) request, tid in request tree, 0 if none.
+ *
+ * called under mdsc->mutex.
+ */
+static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
+{
+ if (RB_EMPTY_ROOT(&mdsc->request_tree))
+ return NULL;
+ return rb_entry(rb_first(&mdsc->request_tree),
+ struct ceph_mds_request, r_node);
+}
+
+static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_request *req = __get_oldest_r