aboutsummaryrefslogtreecommitdiff
path: root/fs/ceph
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/Kconfig13
-rw-r--r--fs/ceph/Makefile1
-rw-r--r--fs/ceph/acl.c194
-rw-r--r--fs/ceph/addr.c110
-rw-r--r--fs/ceph/cache.c8
-rw-r--r--fs/ceph/cache.h23
-rw-r--r--fs/ceph/caps.c554
-rw-r--r--fs/ceph/debugfs.c11
-rw-r--r--fs/ceph/dir.c131
-rw-r--r--fs/ceph/export.c267
-rw-r--r--fs/ceph/file.c456
-rw-r--r--fs/ceph/inode.c527
-rw-r--r--fs/ceph/ioctl.c19
-rw-r--r--fs/ceph/locks.c93
-rw-r--r--fs/ceph/mds_client.c305
-rw-r--r--fs/ceph/mds_client.h8
-rw-r--r--fs/ceph/mdsmap.c2
-rw-r--r--fs/ceph/strings.c3
-rw-r--r--fs/ceph/super.c36
-rw-r--r--fs/ceph/super.h72
-rw-r--r--fs/ceph/xattr.c163
21 files changed, 1997 insertions, 999 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index ac9a2ef5bb9..264e9bf83ff 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -25,3 +25,16 @@ config CEPH_FSCACHE
caching support for Ceph clients using FS-Cache
endif
+
+config CEPH_FS_POSIX_ACL
+ bool "Ceph POSIX Access Control Lists"
+ depends on CEPH_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 32e30106a2f..85a4230b9bf 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -10,3 +10,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
debugfs.o
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
+ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
new file mode 100644
index 00000000000..469f2e8657e
--- /dev/null
+++ b/fs/ceph/acl.c
@@ -0,0 +1,194 @@
+/*
+ * linux/fs/ceph/acl.c
+ *
+ * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "super.h"
+
+static inline void ceph_set_cached_acl(struct inode *inode,
+ int type, struct posix_acl *acl)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
+ set_cached_acl(inode, type, acl);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
+static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
+ int type)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct posix_acl *acl = ACL_NOT_CACHED;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
+ acl = get_cached_acl(inode, type);
+ spin_unlock(&ci->i_ceph_lock);
+
+ return acl;
+}
+
+struct posix_acl *ceph_get_acl(struct inode *inode, int type)
+{
+ int size;
+ const char *name;
+ char *value = NULL;
+ struct posix_acl *acl;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = POSIX_ACL_XATTR_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+
+ size = __ceph_getxattr(inode, name, "", 0);
+ if (size > 0) {
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = __ceph_getxattr(inode, name, value, size);
+ }
+
+ if (size > 0)
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ else if (size == -ERANGE || size == -ENODATA || size == 0)
+ acl = NULL;
+ else
+ acl = ERR_PTR(-EIO);
+
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ ceph_set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ int ret = 0, size = 0;
+ const char *name = NULL;
+ char *value = NULL;
+ struct iattr newattrs;
+ umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+ struct dentry *dentry;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = POSIX_ACL_XATTR_ACCESS;
+ if (acl) {
+ ret = posix_acl_equiv_mode(acl, &new_mode);
+ if (ret < 0)
+ goto out;
+ if (ret == 0)
+ acl = NULL;
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode)) {
+ ret = acl ? -EINVAL : 0;
+ goto out;
+ }
+ name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmalloc(size, GFP_NOFS);
+ if (!value) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (ret < 0)
+ goto out_free;
+ }
+
+ dentry = d_find_alias(inode);
+ if (new_mode != old_mode) {
+ newattrs.ia_mode = new_mode;
+ newattrs.ia_valid = ATTR_MODE;
+ ret = ceph_setattr(dentry, &newattrs);
+ if (ret)
+ goto out_dput;
+ }
+
+ ret = __ceph_setxattr(dentry, name, value, size, 0);
+ if (ret) {
+ if (new_mode != old_mode) {
+ newattrs.ia_mode = old_mode;
+ newattrs.ia_valid = ATTR_MODE;
+ ceph_setattr(dentry, &newattrs);
+ }
+ goto out_dput;
+ }
+
+ ceph_set_cached_acl(inode, type, acl);
+
+out_dput:
+ dput(dentry);
+out_free:
+ kfree(value);
+out:
+ return ret;
+}
+
+int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *default_acl, *acl;
+ int error;
+
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
+
+ if (!default_acl && !acl)
+ cache_no_acl(inode);
+
+ if (default_acl) {
+ error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
+ return error;
+}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd48142..90b3954d48e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -209,15 +209,17 @@ static int readpage_nounlock(struct file *filp, struct page *page)
err = 0;
if (err < 0) {
SetPageError(page);
+ ceph_fscache_readpage_cancel(inode, page);
goto out;
- } else if (err < PAGE_CACHE_SIZE) {
+ }
+ if (err < PAGE_CACHE_SIZE)
/* zero fill remainder of page */
zero_user_segment(page, err, PAGE_CACHE_SIZE);
- }
- SetPageUptodate(page);
+ else
+ flush_dcache_page(page);
- if (err == 0)
- ceph_readpage_to_fscache(inode, page);
+ SetPageUptodate(page);
+ ceph_readpage_to_fscache(inode, page);
out:
return err < 0 ? err : 0;
@@ -252,6 +254,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i];
+ if (rc < 0)
+ goto unlock;
if (bytes < (int)PAGE_CACHE_SIZE) {
/* zero (remainder of) page */
int s = bytes < 0 ? 0 : bytes;
@@ -262,6 +266,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
flush_dcache_page(page);
SetPageUptodate(page);
ceph_readpage_to_fscache(inode, page);
+unlock:
unlock_page(page);
page_cache_release(page);
bytes -= PAGE_CACHE_SIZE;
@@ -686,7 +691,7 @@ static int ceph_writepages_start(struct address_space *mapping,
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
- pr_warning("writepage_start %p on forced umount\n", inode);
+ pr_warn("writepage_start %p on forced umount\n", inode);
return -EIO; /* we're in a forced umount, don't write! */
}
if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
@@ -1179,8 +1184,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
* never get called.
*/
static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t pos, unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t pos)
{
WARN_ON(1);
return -EINVAL;
@@ -1203,6 +1208,41 @@ const struct address_space_operations ceph_aops = {
/*
* vm ops
*/
+static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = vma->vm_file->private_data;
+ loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
+ int want, got, ret;
+
+ dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
+ inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_CACHE;
+ while (1) {
+ got = 0;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
+ if (ret == 0)
+ break;
+ if (ret != -ERESTARTSYS) {
+ WARN_ON(1);
+ return VM_FAULT_SIGBUS;
+ }
+ }
+ dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
+ inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
+
+ ret = filemap_fault(vma, vmf);
+
+ dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
+ inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
+ ceph_put_cap_refs(ci, got);
+
+ return ret;
+}
/*
* Reuse write_begin here for simplicity.
@@ -1210,23 +1250,41 @@ const struct address_space_operations ceph_aops = {
static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
- struct page *page = vmf->page;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = vma->vm_file->private_data;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct page *page = vmf->page;
loff_t off = page_offset(page);
- loff_t size, len;
- int ret;
+ loff_t size = i_size_read(inode);
+ size_t len;
+ int want, got, ret;
- /* Update time before taking page lock */
- file_update_time(vma->vm_file);
-
- size = i_size_read(inode);
if (off + PAGE_CACHE_SIZE <= size)
len = PAGE_CACHE_SIZE;
else
len = size & ~PAGE_CACHE_MASK;
- dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
- off, len, page, page->index);
+ dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
+ inode, ceph_vinop(inode), off, len, size);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+ while (1) {
+ got = 0;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len);
+ if (ret == 0)
+ break;
+ if (ret != -ERESTARTSYS) {
+ WARN_ON(1);
+ return VM_FAULT_SIGBUS;
+ }
+ }
+ dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
+ inode, off, len, ceph_cap_string(got));
+
+ /* Update time before taking page lock */
+ file_update_time(vma->vm_file);
lock_page(page);
@@ -1248,14 +1306,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
}
out:
- dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
- if (ret != VM_FAULT_LOCKED)
+ if (ret != VM_FAULT_LOCKED) {
unlock_page(page);
+ } else {
+ int dirty;
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
+ inode, off, len, ceph_cap_string(got), ret);
+ ceph_put_cap_refs(ci, got);
+
return ret;
}
static struct vm_operations_struct ceph_vmops = {
- .fault = filemap_fault,
+ .fault = ceph_filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 6bfe65e0b03..834f9f3723f 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -68,7 +68,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
{
fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
&ceph_fscache_fsid_object_def,
- fsc);
+ fsc, true);
if (fsc->fscache == NULL) {
pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
@@ -204,7 +204,8 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
ci->fscache = fscache_acquire_cookie(fsc->fscache,
&ceph_fscache_inode_object_def,
- ci);
+ ci, true);
+ fscache_check_consistency(ci->fscache);
done:
mutex_unlock(&inode->i_mutex);
@@ -324,6 +325,9 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ if (!PageFsCache(page))
+ return;
+
fscache_wait_on_page_write(ci->fscache, page);
fscache_uncache_page(ci->fscache, page);
}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index ba949408a33..5ac591bd012 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
void ceph_queue_revalidate(struct inode *inode);
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ fscache_attr_changed(ci->fscache);
+}
+
static inline void ceph_fscache_invalidate(struct inode *inode)
{
fscache_invalidate(ceph_inode(inode)->fscache);
@@ -67,6 +73,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
return fscache_maybe_release_page(ci->fscache, page, gfp);
}
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+ struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
+ __fscache_uncache_page(ci->fscache, page);
+}
+
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
struct list_head *pages)
{
@@ -127,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
{
}
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+}
+
static inline void ceph_fscache_invalidate(struct inode *inode)
{
}
@@ -145,6 +163,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
return 1;
}
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+ struct page *page)
+{
+}
+
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
struct list_head *pages)
{
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 13976c33332..1fde164b74b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -221,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
return 0;
}
-static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx)
+struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx)
{
struct ceph_cap *cap = NULL;
@@ -508,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* it is < 0. (This is so we can atomically add the cap and add an
* open file reference to it.)
*/
-int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned seq, unsigned mseq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation)
+void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned seq, unsigned mseq, u64 realmino, int flags,
+ struct ceph_cap **new_cap)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *new_cap = NULL;
struct ceph_cap *cap;
int mds = session->s_mds;
int actual_wanted;
@@ -531,20 +530,10 @@ int ceph_add_cap(struct inode *inode,
if (fmode >= 0)
wanted |= ceph_caps_for_mode(fmode);
-retry:
- spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
- if (new_cap) {
- cap = new_cap;
- new_cap = NULL;
- } else {
- spin_unlock(&ci->i_ceph_lock);
- new_cap = get_cap(mdsc, caps_reservation);
- if (new_cap == NULL)
- return -ENOMEM;
- goto retry;
- }
+ cap = *new_cap;
+ *new_cap = NULL;
cap->issued = 0;
cap->implemented = 0;
@@ -555,21 +544,31 @@ retry:
cap->ci = ci;
__insert_cap_node(ci, cap);
- /* clear out old exporting info? (i.e. on cap import) */
- if (ci->i_cap_exporting_mds == mds) {
- ci->i_cap_exporting_issued = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_mds = -1;
- }
-
/* add to session cap list */
cap->session = session;
spin_lock(&session->s_cap_lock);
list_add_tail(&cap->session_caps, &session->s_caps);
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
- } else if (new_cap)
- ceph_put_cap(mdsc, new_cap);
+ } else {
+ /*
+ * auth mds of the inode changed. we received the cap export
+ * message, but still haven't received the cap import message.
+ * handle_cap_export() updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
+ * a message that was send before the cap import message. So
+ * don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != cap_id);
+ seq = cap->seq;
+ mseq = cap->mseq;
+ issued |= cap->issued;
+ flags |= CEPH_CAP_FLAG_AUTH;
+ }
+ }
if (!ci->i_snap_realm) {
/*
@@ -609,17 +608,12 @@ retry:
if (flags & CEPH_CAP_FLAG_AUTH) {
if (ci->i_auth_cap == NULL ||
- ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
ci->i_auth_cap = cap;
- } else if (ci->i_auth_cap == cap) {
- ci->i_auth_cap = NULL;
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p to cap_dirty_migrating\n", inode);
- list_move(&ci->i_dirty_item,
- &mdsc->cap_dirty_migrating);
+ cap->mds_wanted = wanted;
}
- spin_unlock(&mdsc->cap_dirty_lock);
+ } else {
+ WARN_ON(ci->i_auth_cap == cap);
}
dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
@@ -628,7 +622,7 @@ retry:
cap->cap_id = cap_id;
cap->issued = issued;
cap->implemented |= issued;
- if (mseq > cap->mseq)
+ if (ceph_seq_cmp(mseq, cap->mseq) > 0)
cap->mds_wanted = wanted;
else
cap->mds_wanted |= wanted;
@@ -639,9 +633,6 @@ retry:
if (fmode >= 0)
__ceph_get_fmode(ci, fmode);
- spin_unlock(&ci->i_ceph_lock);
- wake_up_all(&ci->i_cap_wq);
- return 0;
}
/*
@@ -676,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
*/
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
- int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+ int have = ci->i_snap_caps;
struct ceph_cap *cap;
struct rb_node *p;
@@ -816,7 +807,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (cap != ocap && __cap_is_valid(cap) &&
+ if (cap != ocap &&
(cap->implemented & ~cap->issued & mask))
return 1;
}
@@ -878,7 +869,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
cap = rb_entry(p, struct ceph_cap, ci_node);
if (!__cap_is_valid(cap))
continue;
- mds_wanted |= cap->mds_wanted;
+ if (cap == ci->i_auth_cap)
+ mds_wanted |= cap->mds_wanted;
+ else
+ mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
}
return mds_wanted;
}
@@ -888,7 +882,19 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
*/
static int __ceph_is_any_caps(struct ceph_inode_info *ci)
{
- return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
+ return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+int ceph_is_any_caps(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_is_any_caps(ci);
+ spin_unlock(&ci->i_ceph_lock);
+
+ return ret;
}
/*
@@ -897,7 +903,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
* caller should hold i_ceph_lock.
* caller will not hold session s_mutex if called from destroy_inode.
*/
-void __ceph_remove_cap(struct ceph_cap *cap)
+void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
struct ceph_mds_session *session = cap->session;
struct ceph_inode_info *ci = cap->ci;
@@ -909,6 +915,16 @@ void __ceph_remove_cap(struct ceph_cap *cap)
/* remove from session list */
spin_lock(&session->s_cap_lock);
+ /*
+ * s_cap_reconnect is protected by s_cap_lock. no one changes
+ * s_cap_gen while session is in the reconnect state.
+ */
+ if (queue_release &&
+ (!session->s_cap_reconnect ||
+ cap->cap_gen == session->s_cap_gen))
+ __queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
+ cap->mseq, cap->issue_seq);
+
if (session->s_cap_iterator == cap) {
/* not yet, we are iterating over this very cap */
dout("__ceph_remove_cap delaying %p removal from session %p\n",
@@ -1023,7 +1039,6 @@ void __queue_cap_release(struct ceph_mds_session *session,
struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item;
- spin_lock(&session->s_cap_lock);
BUG_ON(!session->s_num_cap_releases);
msg = list_first_entry(&session->s_cap_releases,
struct ceph_msg, list_head);
@@ -1052,7 +1067,6 @@ void __queue_cap_release(struct ceph_mds_session *session,
(int)CEPH_CAPS_PER_RELEASE,
(int)msg->front.iov_len);
}
- spin_unlock(&session->s_cap_lock);
}
/*
@@ -1067,12 +1081,8 @@ void ceph_queue_caps_release(struct inode *inode)
p = rb_first(&ci->i_caps);
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
- struct ceph_mds_session *session = cap->session;
-
- __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
- cap->mseq, cap->issue_seq);
p = rb_next(p);
- __ceph_remove_cap(cap);
+ __ceph_remove_cap(cap, true);
}
}
@@ -1379,13 +1389,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
ci->i_snap_realm->cached_context);
dout(" inode %p now dirty snapc %p auth cap %p\n",
&ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ WARN_ON(!ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
- if (ci->i_auth_cap)
- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
- else
- list_add(&ci->i_dirty_item,
- &mdsc->cap_dirty_migrating);
+ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
spin_unlock(&mdsc->cap_dirty_lock);
if (ci->i_flushing_caps == 0) {
ihold(inode);
@@ -1731,13 +1738,12 @@ ack:
/*
* Try to flush dirty caps back to the auth mds.
*/
-static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
- unsigned *flush_tid)
+static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- int unlock_session = session ? 0 : 1;
int flushing = 0;
+ struct ceph_mds_session *session = NULL;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1751,13 +1757,14 @@ retry:
int want = __ceph_caps_wanted(ci);
int delayed;
- if (!session) {
+ if (!session || session != cap->session) {
spin_unlock(&ci->i_ceph_lock);
+ if (session)
+ mutex_unlock(&session->s_mutex);
session = cap->session;
mutex_lock(&session->s_mutex);
goto retry;
}
- BUG_ON(session != cap->session);
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
@@ -1776,7 +1783,7 @@ retry:
out:
spin_unlock(&ci->i_ceph_lock);
out_unlocked:
- if (session && unlock_session)
+ if (session)
mutex_unlock(&session->s_mutex);
return flushing;
}
@@ -1861,7 +1868,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
mutex_lock(&inode->i_mutex);
- dirty = try_flush_caps(inode, NULL, &flush_tid);
+ dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
/*
@@ -1896,7 +1903,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
dout("write_inode %p wait=%d\n", inode, wait);
if (wait) {
- dirty = try_flush_caps(inode, NULL, &flush_tid);
+ dirty = try_flush_caps(inode, &flush_tid);
if (dirty)
err = wait_event_interruptible(ci->i_cap_wq,
caps_are_flushed(inode, flush_tid));
@@ -2346,11 +2353,11 @@ static void invalidate_aliases(struct inode *inode)
d_prune_aliases(inode);
/*
* For non-directory inode, d_find_alias() only returns
- * connected dentry. After calling d_invalidate(), the
- * dentry become disconnected.
+ * hashed dentry. After calling d_invalidate(), the
+ * dentry becomes unhashed.
*
* For directory inode, d_find_alias() can return
- * disconnected dentry. But directory inode should have
+ * unhashed dentry. But directory inode should have
* one alias at most.
*/
while ((dn = d_find_alias(inode))) {
@@ -2372,38 +2379,52 @@ static void invalidate_aliases(struct inode *inode)
* actually be a revocation if it specifies a smaller cap set.)
*
* caller holds s_mutex and i_ceph_lock, we drop both.
- *
- * return value:
- * 0 - ok
- * 1 - check_caps on auth cap only (writeback)
- * 2 - check_caps (ack revoke)
*/
-static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+static void handle_cap_grant(struct ceph_mds_client *mdsc,
+ struct inode *inode, struct ceph_mds_caps *grant,
+ void *snaptrace, int snaptrace_len,
+ struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap,
- struct ceph_buffer *xattr_buf)
- __releases(ci->i_ceph_lock)
+ struct ceph_cap *cap, int issued)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
int seq = le32_to_cpu(grant->seq);
int newcaps = le32_to_cpu(grant->caps);
- int issued, implemented, used, wanted, dirty;
+ int used, wanted, dirty;
u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size);
struct timespec mtime, atime, ctime;
int check_caps = 0;
- int wake = 0;
- int writeback = 0;
- int queue_invalidate = 0;
- int deleted_inode = 0;
- int queue_revalidate = 0;
+ bool wake = 0;
+ bool writeback = 0;
+ bool queue_trunc = 0;
+ bool queue_invalidate = 0;
+ bool queue_revalidate = 0;
+ bool deleted_inode = 0;
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, mds, seq, ceph_cap_string(newcaps));
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
inode->i_size);
+
+ /*
+ * auth mds of the inode changed. we received the cap export message,
+ * but still haven't received the cap import message. handle_cap_export
+ * updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
+ * that was sent before the cap import message. So don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
+ seq = cap->seq;
+ newcaps |= cap->issued;
+ }
+
/*
* If CACHE is being revoked, and we have no dirty buffers,
* try to invalidate (once). (If there are dirty buffers, we
@@ -2425,15 +2446,13 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
/* side effects now are allowed */
-
- issued = __ceph_caps_issued(ci, &implemented);
- issued |= implemented | __ceph_caps_dirty(ci);
-
cap->cap_gen = session->s_cap_gen;
+ cap->seq = seq;
__check_cap_issue(ci, cap, newcaps);
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+ (issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(grant->mode);
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
@@ -2442,7 +2461,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
from_kgid(&init_user_ns, inode->i_gid));
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
+ if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+ (issued & CEPH_CAP_LINK_EXCL) == 0) {
set_nlink(inode, le32_to_cpu(grant->nlink));
if (inode->i_nlink == 0 &&
(newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
@@ -2460,6 +2480,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_buffer_put(ci->i_xattrs.blob);
ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
ci->i_xattrs.version = version;
+ ceph_forget_all_cached_acls(inode);
}
}
@@ -2468,26 +2489,35 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
queue_revalidate = 1;
- /* size/ctime/mtime/atime? */
- ceph_fill_file_size(inode, issued,
- le32_to_cpu(grant->truncate_seq),
- le64_to_cpu(grant->truncate_size), size);
- ceph_decode_timespec(&mtime, &grant->mtime);
- ceph_decode_timespec(&atime, &grant->atime);
- ceph_decode_timespec(&ctime, &grant->ctime);
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
- &atime);
-
- /* max size increase? */
- if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
- dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
- ci->i_max_size = max_size;
- if (max_size >= ci->i_wanted_max_size) {
- ci->i_wanted_max_size = 0; /* reset */
- ci->i_requested_max_size = 0;
+ if (newcaps & CEPH_CAP_ANY_RD) {
+ /* ctime/mtime/atime? */
+ ceph_decode_timespec(&mtime, &grant->mtime);
+ ceph_decode_timespec(&atime, &grant->atime);
+ ceph_decode_timespec(&ctime, &grant->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(grant->time_warp_seq),
+ &ctime, &mtime, &atime);
+ }
+
+ if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
+ /* file layout may have changed */
+ ci->i_layout = grant->layout;
+ /* size/truncate_seq? */
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(grant->truncate_seq),
+ le64_to_cpu(grant->truncate_size),
+ size);
+ /* max size increase? */
+ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
+ dout("max_size %lld -> %llu\n",
+ ci->i_max_size, max_size);
+ ci->i_max_size = max_size;
+ if (max_size >= ci->i_wanted_max_size) {
+ ci->i_wanted_max_size = 0; /* reset */
+ ci->i_requested_max_size = 0;
+ }
+ wake = 1;
}
- wake = 1;
}
/* check cap bits */
@@ -2507,11 +2537,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
check_caps = 1;
}
- cap->seq = seq;
-
- /* file layout may have changed */
- ci->i_layout = grant->layout;
-
/* revocation, grant, or no-op? */
if (cap->issued & ~newcaps) {
int revoking = cap->issued & ~newcaps;
@@ -2553,6 +2578,23 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
spin_unlock(&ci->i_ceph_lock);
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace,
+ snaptrace + snaptrace_len, false);
+ downgrade_write(&mdsc->snap_rwsem);
+ kick_flushing_inode_caps(mdsc, session, inode);
+ up_read(&mdsc->snap_rwsem);
+ if (newcaps & ~issued)
+ wake = 1;
+ }
+
+ if (queue_trunc) {
+ ceph_queue_vmtruncate(inode);
+ ceph_queue_revalidate(inode);
+ } else if (queue_revalidate)
+ ceph_queue_revalidate(inode);
+
if (writeback)
/*
* queue inode for writeback: we can't actually call
@@ -2564,8 +2606,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_queue_invalidate(inode);
if (deleted_inode)
invalidate_aliases(inode);
- if (queue_revalidate)
- ceph_queue_revalidate(inode);
if (wake)
wake_up_all(&ci->i_cap_wq);
@@ -2737,123 +2777,200 @@ static void handle_cap_trunc(struct inode *inode,
* caller holds s_mutex
*/
static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
- struct ceph_mds_session *session,
- int *open_target_sessions)
+ struct ceph_mds_cap_peer *ph,
+ struct ceph_mds_session *session)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *tsession = NULL;
+ struct ceph_cap *cap, *tcap, *new_cap = NULL;
struct ceph_inode_info *ci = ceph_inode(inode);
- int mds = session->s_mds;
+ u64 t_cap_id;
unsigned mseq = le32_to_cpu(ex->migrate_seq);
- struct ceph_cap *cap = NULL, *t;
- struct rb_node *p;
- int remember = 1;
+ unsigned t_seq, t_mseq;
+ int target, issued;
+ int mds = session->s_mds;
- dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
- inode, ci, mds, mseq);
+ if (ph) {
+ t_cap_id = le64_to_cpu(ph->cap_id);
+ t_seq = le32_to_cpu(ph->seq);
+ t_mseq = le32_to_cpu(ph->mseq);
+ target = le32_to_cpu(ph->mds);
+ } else {
+ t_cap_id = t_seq = t_mseq = 0;
+ target = -1;
+ }
+ dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
+ inode, ci, mds, mseq, target);
+retry:
spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
+ goto out_unlock;
- /* make sure we haven't seen a higher mseq */
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- t = rb_entry(p, struct ceph_cap, ci_node);
- if (ceph_seq_cmp(t->mseq, mseq) > 0) {
- dout(" higher mseq on cap from mds%d\n",
- t->session->s_mds);
- remember = 0;
- }
- if (t->session->s_mds == mds)
- cap = t;
+ if (target < 0) {
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
}
- if (cap) {
- if (remember) {
- /* make note */
- ci->i_cap_exporting_mds = mds;
- ci->i_cap_exporting_mseq = mseq;
- ci->i_cap_exporting_issued = cap->issued;
-
- /*
- * make sure we have open sessions with all possible
- * export targets, so that we get the matching IMPORT
- */
- *open_target_sessions = 1;
+ /*
+ * now we know we haven't received the cap import message yet
+ * because the exported cap still exist.
+ */
- /*
- * we can't flush dirty caps that we've seen the
- * EXPORT but no IMPORT for
- */
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p to cap_dirty_migrating\n",
- inode);
- list_move(&ci->i_dirty_item,
- &mdsc->cap_dirty_migrating);
+ issued = cap->issued;
+ WARN_ON(issued != cap->implemented);
+
+ tcap = __get_cap_for_mds(ci, target);
+ if (tcap) {
+ /* already have caps from the target */
+ if (tcap->cap_id != t_cap_id ||
+ ceph_seq_cmp(tcap->seq, t_seq) < 0) {
+ dout(" updating import cap %p mds%d\n", tcap, target);
+ tcap->cap_id = t_cap_id;
+ tcap->seq = t_seq - 1;
+ tcap->issue_seq = t_seq - 1;
+ tcap->mseq = t_mseq;
+ tcap->issued |= issued;
+ tcap->implemented |= issued;
+ if (cap == ci->i_auth_cap)
+ ci->i_auth_cap = tcap;
+ if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &tcap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
}
- spin_unlock(&mdsc->cap_dirty_lock);
}
- __ceph_remove_cap(cap);
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
+ } else if (tsession) {
+ /* add placeholder for the export tagert */
+ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
+ ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+ t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
}
- /* else, we already released it */
spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&session->s_mutex);
+
+ /* open target session */
+ tsession = ceph_mdsc_open_export_target_session(mdsc, target);
+ if (!IS_ERR(tsession)) {
+ if (mds > target) {
+ mutex_lock(&session->s_mutex);
+ mutex_lock_nested(&tsession->s_mutex,
+ SINGLE_DEPTH_NESTING);
+ } else {
+ mutex_lock(&tsession->s_mutex);
+ mutex_lock_nested(&session->s_mutex,
+ SINGLE_DEPTH_NESTING);
+ }
+ ceph_add_cap_releases(mdsc, tsession);
+ new_cap = ceph_get_cap(mdsc, NULL);
+ } else {
+ WARN_ON(1);
+ tsession = NULL;
+ target = -1;
+ }
+ goto retry;
+
+out_unlock:
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&session->s_mutex);
+ if (tsession) {
+ mutex_unlock(&tsession->s_mutex);
+ ceph_put_mds_session(tsession);
+ }
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
}
/*
- * Handle cap IMPORT. If there are temp bits from an older EXPORT,
- * clean them up.
+ * Handle cap IMPORT.
*
- * caller holds s_mutex.
+ * caller holds s_mutex. acquires i_ceph_lock
*/
static void handle_cap_import(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *im,
+ struct ceph_mds_cap_peer *ph,
struct ceph_mds_session *session,
- void *snaptrace, int snaptrace_len)
+ struct ceph_cap **target_cap, int *old_issued)
+ __acquires(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *cap, *ocap, *new_cap = NULL;
int mds = session->s_mds;
- unsigned issued = le32_to_cpu(im->caps);
+ int issued;
+ unsigned caps = le32_to_cpu(im->caps);
unsigned wanted = le32_to_cpu(im->wanted);
unsigned seq = le32_to_cpu(im->seq);
unsigned mseq = le32_to_cpu(im->migrate_seq);
u64 realmino = le64_to_cpu(im->realm);
u64 cap_id = le64_to_cpu(im->cap_id);
+ u64 p_cap_id;
+ int peer;
- if (ci->i_cap_exporting_mds >= 0 &&
- ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
- dout("handle_cap_import inode %p ci %p mds%d mseq %d"
- " - cleared exporting from mds%d\n",
- inode, ci, mds, mseq,
- ci->i_cap_exporting_mds);
- ci->i_cap_exporting_issued = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_mds = -1;
+ if (ph) {
+ p_cap_id = le64_to_cpu(ph->cap_id);
+ peer = le32_to_cpu(ph->mds);
+ } else {
+ p_cap_id = 0;
+ peer = -1;
+ }
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p back to cap_dirty\n", inode);
- list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
+ dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
+ inode, ci, mds, mseq, peer);
+
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ if (!new_cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ new_cap = ceph_get_cap(mdsc, NULL);
+ goto retry;
}
- spin_unlock(&mdsc->cap_dirty_lock);
+ cap = new_cap;
} else {
- dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
- inode, ci, mds, mseq);
+ if (new_cap) {
+ ceph_put_cap(mdsc, new_cap);
+ new_cap = NULL;
+ }
}
- down_write(&mdsc->snap_rwsem);
- ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
- false);
- downgrade_write(&mdsc->snap_rwsem);
- ceph_add_cap(inode, session, cap_id, -1,
- issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
- NULL /* no caps context */);
- kick_flushing_inode_caps(mdsc, session, inode);
- up_read(&mdsc->snap_rwsem);
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+
+ ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+ realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
+
+ ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
+ if (ocap && ocap->cap_id == p_cap_id) {
+ dout(" remove export cap %p mds%d flags %d\n",
+ ocap, peer, ph->flags);
+ if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
+ (ocap->seq != le32_to_cpu(ph->seq) ||
+ ocap->mseq != le32_to_cpu(ph->mseq))) {
+ pr_err("handle_cap_import: mismatched seq/mseq: "
+ "ino (%llx.%llx) mds%d seq %d mseq %d "
+ "importer mds%d has peer seq %d mseq %d\n",
+ ceph_vinop(inode), peer, ocap->seq,
+ ocap->mseq, mds, le32_to_cpu(ph->seq),
+ le32_to_cpu(ph->mseq));
+ }
+ __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ }
/* make sure we re-request max_size, if necessary */
- spin_lock(&ci->i_ceph_lock);
- ci->i_wanted_max_size = 0; /* reset */
+ ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
- spin_unlock(&ci->i_ceph_lock);
+
+ *old_issued = issued;
+ *target_cap = cap;
}
/*
@@ -2871,8 +2988,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_inode_info *ci;
struct ceph_cap *cap;
struct ceph_mds_caps *h;
+ struct ceph_mds_cap_peer *peer = NULL;
int mds = session->s_mds;
- int op;
+ int op, issued;
u32 seq, mseq;
struct ceph_vino vino;
u64 cap_id;
@@ -2881,12 +2999,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
void *snaptrace;
size_t snaptrace_len;
void *flock;
+ void *end;
u32 flock_len;
- int open_target_sessions = 0;
dout("handle_caps from mds%d\n", mds);
/* decode */
+ end = msg->front.iov_base + msg->front.iov_len;
tid = le64_to_cpu(msg->hdr.tid);
if (msg->front.iov_len < sizeof(*h))
goto bad;
@@ -2904,17 +3023,28 @@ void ceph_handle_caps(struct ceph_mds_session *session,
snaptrace_len = le32_to_cpu(h->snap_trace_len);
if (le16_to_cpu(msg->hdr.version) >= 2) {
- void *p, *end;
-
- p = snaptrace + snaptrace_len;
- end = msg->front.iov_base + msg->front.iov_len;
+ void *p = snaptrace + snaptrace_len;
ceph_decode_32_safe(&p, end, flock_len, bad);
+ if (p + flock_len > end)
+ goto bad;
flock = p;
} else {
flock = NULL;
flock_len = 0;
}
+ if (le16_to_cpu(msg->hdr.version) >= 3) {
+ if (op == CEPH_CAP_OP_IMPORT) {
+ void *p = flock + flock_len;
+ if (p + sizeof(*peer) > end)
+ goto bad;
+ peer = p;
+ } else if (op == CEPH_CAP_OP_EXPORT) {
+ /* recorded in unused fields */
+ peer = (void *)&h->size;
+ }
+ }
+
mutex_lock(&session->s_mutex);
session->s_seq++;
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
@@ -2931,9 +3061,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
if (!inode) {
dout(" i don't have ino %llx\n", vino.ino);
- if (op == CEPH_CAP_OP_IMPORT)
+ if (op == CEPH_CAP_OP_IMPORT) {
+ spin_lock(&session->s_cap_lock);
__queue_cap_release(session, vino.ino, cap_id,
mseq, seq);
+ spin_unlock(&session->s_cap_lock);
+ }
goto flush_cap_releases;
}
@@ -2944,12 +3077,15 @@ void ceph_handle_caps(struct ceph_mds_session *session,
goto done;
case CEPH_CAP_OP_EXPORT:
- handle_cap_export(inode, h, session, &open_target_sessions);
- goto done;
+ handle_cap_export(inode, h, peer, session);
+ goto done_unlocked;
case CEPH_CAP_OP_IMPORT:
- handle_cap_import(mdsc, inode, h, session,
- snaptrace, snaptrace_len);
+ handle_cap_import(mdsc, inode, h, peer, session,
+ &cap, &issued);
+ handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len,
+ msg->middle, session, cap, issued);
+ goto done_unlocked;
}
/* the rest require a cap */
@@ -2966,8 +3102,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
switch (op) {
case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT:
- case CEPH_CAP_OP_IMPORT:
- handle_cap_grant(inode, h, session, cap, msg->middle);
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+ handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle,
+ session, cap, issued);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
@@ -3000,8 +3138,6 @@ done:
done_unlocked:
if (inode)
iput(inode);
- if (open_target_sessions)
- ceph_mdsc_open_export_target_sessions(mdsc, session);
return;
bad:
@@ -3143,7 +3279,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
rel->seq = cpu_to_le32(cap->seq);
rel->issue_seq = cpu_to_le32(cap->issue_seq),
rel->mseq = cpu_to_le32(cap->mseq);
- rel->caps = cpu_to_le32(cap->issued);
+ rel->caps = cpu_to_le32(cap->implemented);
rel->wanted = cpu_to_le32(cap->mds_wanted);
rel->dname_len = 0;
rel->dname_seq = 0;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6d59006bfa2..5a743ac141a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -71,9 +71,9 @@ static int mdsc_show(struct seq_file *s, void *p)
seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
if (req->r_got_unsafe)
- seq_printf(s, "\t(unsafe)");
+ seq_puts(s, "\t(unsafe)");
else
- seq_printf(s, "\t");
+ seq_puts(s, "\t");
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
@@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p)
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
+ } else {
+ seq_printf(s, " #%llx", req->r_ino1.ino);
}
if (req->r_old_dentry) {
@@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p)
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
seq_printf(s, " #%llx/%.*s (%s)",
- ceph_ino(req->r_old_dentry_dir),
+ req->r_old_dentry_dir ?
+ ceph_ino(req->r_old_dentry_dir) : 0,
req->r_old_dentry->d_name.len,
req->r_old_dentry->d_name.name,
path ? path : "");
@@ -116,7 +119,7 @@ static int mdsc_show(struct seq_file *s, void *p)
seq_printf(s, " %s", req->r_path2);
}
- seq_printf(s, "\n");
+ seq_puts(s, "\n");
}
mutex_unlock(&mdsc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 868b61d56ca..c29d6ae6887 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -100,6 +100,14 @@ static unsigned fpos_off(loff_t p)
return p & 0xffffffff;
}
+static int fpos_cmp(loff_t l, loff_t r)
+{
+ int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
+ if (v)
+ return v;
+ return (int)(fpos_off(l) - fpos_off(r));
+}
+
/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
@@ -111,7 +119,8 @@ static unsigned fpos_off(loff_t p)
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
-static int __dcache_readdir(struct file *file, struct dir_context *ctx)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx,
+ u32 shared_gen)
{
struct ceph_file_info *fi = file->private_data;
struct dentry *parent = file->f_dentry;
@@ -125,14 +134,14 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx)
last = fi->dentry;
fi->dentry = NULL;
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
- last);
+ dout("__dcache_readdir %p v%u at %llu (last %p)\n",
+ dir, shared_gen, ctx->pos, last);
spin_lock(&parent->d_lock);
/* start at beginning? */
if (ctx->pos == 2 || last == NULL ||
- ctx->pos < ceph_dentry(last)->offset) {
+ fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
if (list_empty(&parent->d_subdirs))
goto out_unlock;
p = parent->d_subdirs.prev;
@@ -153,10 +162,11 @@ more:
goto out_unlock;
}
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!d_unhashed(dentry) && dentry->d_inode &&
+ if (di->lease_shared_gen == shared_gen &&
+ !d_unhashed(dentry) && dentry->d_inode &&
ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
- ctx->pos <= di->offset)
+ fpos_cmp(ctx->pos, di->offset) <= 0)
break;
dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
dentry->d_name.len, dentry->d_name.name, di->offset,
@@ -172,9 +182,16 @@ more:
spin_unlock(&dentry->d_lock);
spin_unlock(&parent->d_lock);
+ /* make sure a dentry wasn't dropped while we didn't have parent lock */
+ if (!ceph_dir_is_complete(dir)) {
+ dout(" lost dir complete on %p; falling back to mds\n", dir);
+ dput(dentry);
+ err = -EAGAIN;
+ goto out;
+ }
+
dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
dentry->d_name.len,
ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
@@ -182,25 +199,18 @@ more:
if (last) {
/* remember our position */
fi->dentry = last;
- fi->next_offset = di->offset;
+ fi->next_offset = fpos_off(di->offset);
}
dput(dentry);
return 0;
}
+ ctx->pos = di->offset + 1;
+
if (last)
dput(last);
last = dentry;
- ctx->pos++;
-
- /* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_dir_is_complete(dir)) {
- dout(" lost dir complete on %p; falling back to mds\n", dir);
- err = -EAGAIN;
- goto out;
- }
-
spin_lock(&parent->d_lock);
p = p->prev; /* advance to next dentry */
goto more;
@@ -244,8 +254,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
- const int max_entries = fsc->mount_options->max_readdir;
- const int max_bytes = fsc->mount_options->max_readdir_bytes;
dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
if (fi->flags & CEPH_F_ATEND)
@@ -283,10 +291,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ceph_snap(inode) != CEPH_SNAPDIR &&
__ceph_dir_is_complete(ci) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ u32 shared_gen = ci->i_shared_gen;
spin_unlock(&ci->i_ceph_lock);
- err = __dcache_readdir(file, ctx);
+ err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN)
return err;
+ frag = fpos_frag(ctx->pos);
+ off = fpos_off(ctx->pos);
} else {
spin_unlock(&ci->i_ceph_lock);
}
@@ -314,14 +325,16 @@ more:
fi->last_readdir = NULL;
}
- /* requery frag tree, as the frag topology may have changed */
- frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
-
dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
ceph_vinop(inode), frag, fi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ err = ceph_alloc_readdir_reply_buffer(req, inode);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
req->r_inode = inode;
ihold(inode);
req->r_dentry = dget(file->f_dentry);
@@ -332,9 +345,6 @@ more:
req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
- req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
- req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
- req->r_num_caps = max_entries + 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0) {
ceph_mdsc_put_request(req);
@@ -352,6 +362,16 @@ more:
}
/* note next offset and last dentry name */
+ rinfo = &req->r_reply_info;
+ if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+ frag = le32_to_cpu(rinfo->dir_dir->frag);
+ if (ceph_frag_is_leftmost(frag))
+ fi->next_offset = 2;
+ else
+ fi->next_offset = 0;
+ off = fi->next_offset;
+ }
+ fi->frag = frag;
fi->offset = fi->next_offset;
fi->last_readdir = req;
@@ -363,7 +383,6 @@ more:
else
fi->next_offset = 0;
} else {
- rinfo = &req->r_reply_info;
err = note_last_dentry(fi,
rinfo->dir_dname[rinfo->dir_nr-1],
rinfo->dir_dname_len[rinfo->dir_nr-1]);
@@ -429,7 +448,6 @@ more:
if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
dout(" marking %p complete\n", inode);
__ceph_dir_set_complete(ci, fi->dir_release_count);
- ci->i_max_offset = ctx->pos;
}
spin_unlock(&ci->i_ceph_lock);
@@ -437,7 +455,7 @@ more:
return 0;
}
-static void reset_readdir(struct ceph_file_info *fi)
+static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
{
if (fi->last_readdir) {
ceph_mdsc_put_request(fi->last_readdir);
@@ -445,7 +463,10 @@ static void reset_readdir(struct ceph_file_info *fi)
}
kfree(fi->last_name);
fi->last_name = NULL;
- fi->next_offset = 2; /* compensate for . and .. */
+ if (ceph_frag_is_leftmost(frag))
+ fi->next_offset = 2; /* compensate for . and .. */
+ else
+ fi->next_offset = 0;
if (fi->dentry) {
dput(fi->dentry);
fi->dentry = NULL;
@@ -457,7 +478,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
- loff_t old_offset = offset;
+ loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
loff_t retval;
mutex_lock(&inode->i_mutex);
@@ -474,7 +495,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
goto out;
}
- if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+ if (offset >= 0) {
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
@@ -487,14 +508,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
* seek to new frag, or seek prior to current chunk.
*/
if (offset == 0 ||
- fpos_frag(offset) != fpos_frag(old_offset) ||
+ fpos_frag(offset) != fi->frag ||
fpos_off(offset) < fi->offset) {
dout("dir_llseek dropping %p content\n", file);
- reset_readdir(fi);
+ reset_readdir(fi, fpos_frag(offset));
}
/* bump dir_release_count if we did a forward seek */
- if (offset > old_offset)
+ if (fpos_cmp(offset, old_offset) > 0)
fi->dir_release_count--;
}
out:
@@ -684,7 +705,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
- if (err)
+
+ if (!err)
+ ceph_init_acl(dentry, dentry->d_inode, dir);
+ else
d_drop(dentry);
return err;
}
@@ -722,7 +746,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
- if (err)
+ if (!err)
+ ceph_init_acl(dentry, dentry->d_inode, dir);
+ else
d_drop(dentry);
return err;
}
@@ -763,7 +789,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
out:
- if (err < 0)
+ if (!err)
+ ceph_init_acl(dentry, dentry->d_inode, dir);
+ else
d_drop(dentry);
return err;
}
@@ -788,8 +816,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+ req->r_old_dentry = dget(old_dentry);
req->r_locked_dir = dir;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -887,10 +914,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ ihold(old_dir);
req->r_dentry = dget(new_dentry);
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+ req->r_old_dentry_dir = old_dir;
req->r_locked_dir = new_dir;
req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -908,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
* to do it here.
*/
- /* d_move screws up d_subdirs order */
- ceph_dir_clear_complete(new_dir);
-
d_move(old_dentry, new_dentry);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(new_dentry);
+
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_complete(old_dir);
+ ceph_dir_clear_complete(new_dir);
+
}
ceph_mdsc_put_request(req);
return err;
@@ -1028,14 +1058,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
valid = 1;
} else if (dentry_lease_is_valid(dentry) ||
dir_lease_is_valid(dir, dentry)) {
- valid = 1;
+ if (dentry->d_inode)
+ valid = ceph_is_any_caps(dentry->d_inode);
+ else
+ valid = 1;
}
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
- if (valid)
+ if (valid) {
ceph_dentry_lru_touch(dentry);
- else
+ } else {
+ ceph_dir_clear_complete(dir);
d_drop(dentry);
+ }
iput(dir);
return valid;
}
@@ -1284,6 +1319,8 @@ const struct inode_operations ceph_dir_iops = {
.getxattr = ceph_getxattr,
.listxattr = ceph_listxattr,
.removexattr = ceph_removexattr,
+ .get_acl = ceph_get_acl,
+ .set_acl = ceph_set_acl,
.mknod = ceph_mknod,
.symlink = ceph_symlink,
.mkdir = ceph_mkdir,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 16796be53ca..8d7d782f438 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -8,23 +8,6 @@
#include "mds_client.h"
/*
- * NFS export support
- *
- * NFS re-export of a ceph mount is, at present, only semireliable.
- * The basic issue is that the Ceph architectures doesn't lend itself
- * well to generating filehandles that will remain valid forever.
- *
- * So, we do our best. If you're lucky, your inode will be in the
- * client's cache. If it's not, and you have a connectable fh, then
- * the MDS server may be able to find it for you. Otherwise, you get
- * ESTALE.
- *
- * There are ways to this more reliable, but in the non-connectable fh
- * case, we won't every work perfectly, and in the connectable case,
- * some changes are needed on the MDS side to work better.
- */
-
-/*
* Basic fh
*/
struct ceph_nfs_fh {
@@ -32,22 +15,12 @@ struct ceph_nfs_fh {
} __attribute__ ((packed));
/*
- * Larger 'connectable' fh that includes parent ino and name hash.
- * Use this whenever possible, as it works more reliably.
+ * Larger fh that includes parent ino.
*/
struct ceph_nfs_confh {
u64 ino, parent_ino;
- u32 parent_name_hash;
} __attribute__ ((packed));
-/*
- * The presence of @parent_inode here tells us whether NFS wants a
- * connectable file handle. However, we want to make a connectionable
- * file handle unconditionally so that the MDS gets as much of a hint
- * as possible. That means we only use @parent_dentry to indicate
- * whether nfsd wants a connectable fh, and whether we should indicate
- * failure from a too-small @max_len.
- */
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
@@ -56,54 +29,36 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct ceph_nfs_confh *cfh = (void *)rawfh;
int connected_handle_length = sizeof(*cfh)/4;
int handle_length = sizeof(*fh)/4;
- struct dentry *dentry;
- struct dentry *parent;
/* don't re-export snaps */
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EINVAL;
- dentry = d_find_alias(inode);
+ if (parent_inode && (*max_len < connected_handle_length)) {
+ *max_len = connected_handle_length;
+ return FILEID_INVALID;
+ } else if (*max_len < handle_length) {
+ *max_len = handle_length;
+ return FILEID_INVALID;
+ }
- /* if we found an alias, generate a connectable fh */
- if (*max_len >= connected_handle_length && dentry) {
- dout("encode_fh %p connectable\n", dentry);
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent;
+ if (parent_inode) {
+ dout("encode_fh %llx with parent %llx\n",
+ ceph_ino(inode), ceph_ino(parent_inode));
cfh->ino = ceph_ino(inode);
- cfh->parent_ino = ceph_ino(parent->d_inode);
- cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
- dentry);
+ cfh->parent_ino = ceph_ino(parent_inode);
*max_len = connected_handle_length;
- type = 2;
- spin_unlock(&dentry->d_lock);
- } else if (*max_len >= handle_length) {
- if (parent_inode) {
- /* nfsd wants connectable */
- *max_len = connected_handle_length;
- type = FILEID_INVALID;
- } else {
- dout("encode_fh %p\n", dentry);
- fh->ino = ceph_ino(inode);
- *max_len = handle_length;
- type = 1;
- }
+ type = FILEID_INO32_GEN_PARENT;
} else {
+ dout("encode_fh %llx\n", ceph_ino(inode));
+ fh->ino = ceph_ino(inode);
*max_len = handle_length;
- type = FILEID_INVALID;
+ type = FILEID_INO32_GEN;
}
- if (dentry)
- dput(dentry);
return type;
}
-/*
- * convert regular fh to dentry
- *
- * FIXME: we should try harder by querying the mds for the ino.
- */
-static struct dentry *__fh_to_dentry(struct super_block *sb,
- struct ceph_nfs_fh *fh, int fh_len)
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
@@ -111,11 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
struct ceph_vino vino;
int err;
- if (fh_len < sizeof(*fh) / 4)
- return ERR_PTR(-ESTALE);
-
- dout("__fh_to_dentry %llx\n", fh->ino);
- vino.ino = fh->ino;
+ vino.ino = ino;
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
if (!inode) {
@@ -139,139 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
- pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
- fh->ino, inode);
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
- iput(inode);
+ dput(dentry);
return ERR_PTR(err);
}
- dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
+ dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
return dentry;
}
/*
- * convert connectable fh to dentry
+ * convert regular fh to dentry
*/
-static struct dentry *__cfh_to_dentry(struct super_block *sb,
- struct ceph_nfs_confh *cfh, int fh_len)
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct ceph_nfs_fh *fh = (void *)fid->raw;
+
+ if (fh_type != FILEID_INO32_GEN &&
+ fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
+ if (fh_len < sizeof(*fh) / 4)
+ return NULL;
+
+ dout("fh_to_dentry %llx\n", fh->ino);
+ return __fh_to_dentry(sb, fh->ino);
+}
+
+static struct dentry *__get_parent(struct super_block *sb,
+ struct dentry *child, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_request *req;
struct inode *inode;
struct dentry *dentry;
- struct ceph_vino vino;
int err;
- if (fh_len < sizeof(*cfh) / 4)
- return ERR_PTR(-ESTALE);
-
- dout("__cfh_to_dentry %llx (%llx/%x)\n",
- cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
-
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode) {
- struct ceph_mds_request *req;
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
- USE_ANY_MDS);
- if (IS_ERR(req))
- return ERR_CAST(req);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
- req->r_ino1 = vino;
- req->r_ino2.ino = cfh->parent_ino;
- req->r_ino2.snap = CEPH_NOSNAP;
- req->r_path2 = kmalloc(16, GFP_NOFS);
- snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
- req->r_num_caps = 1;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- inode = req->r_target_inode;
- if (inode)
- ihold(inode);
- ceph_mdsc_put_request(req);
- if (!inode)
- return ERR_PTR(err ? err : -ESTALE);
+ if (child) {
+ req->r_inode = child->d_inode;
+ ihold(child->d_inode);
+ } else {
+ req->r_ino1 = (struct ceph_vino) {
+ .ino = ino,
+ .snap = CEPH_NOSNAP,
+ };
}
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ ihold(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
- pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
- iput(inode);
+ dput(dentry);
return ERR_PTR(err);
}
- dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
+ dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
+ child ? ceph_ino(child->d_inode) : ino,
+ dentry, ceph_vinop(inode));
return dentry;
}
-static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+static struct dentry *ceph_get_parent(struct dentry *child)
{
- if (fh_type == 1)
- return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw,
- fh_len);
- else
- return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw,
- fh_len);
+ /* don't re-export snaps */
+ if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
+ return ERR_PTR(-EINVAL);
+
+ dout("get_parent %p ino %llx.%llx\n",
+ child, ceph_vinop(child->d_inode));
+ return __get_parent(child->d_sb, child, 0);
}
/*
- * get parent, if possible.
- *
- * FIXME: we could do better by querying the mds to discover the
- * parent.
+ * convert regular fh to parent
*/
static struct dentry *ceph_fh_to_parent(struct super_block *sb,
- struct fid *fid,
+ struct fid *fid,
int fh_len, int fh_type)
{
struct ceph_nfs_confh *cfh = (void *)fid->raw;
- struct ceph_vino vino;
- struct inode *inode;
struct dentry *dentry;
- int err;
- if (fh_type == 1)
- return ERR_PTR(-ESTALE);
+ if (fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
if (fh_len < sizeof(*cfh) / 4)
- return ERR_PTR(-ESTALE);
+ return NULL;
- pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
- cfh->parent_name_hash);
+ dout("fh_to_parent %llx\n", cfh->parent_ino);
+ dentry = __get_parent(sb, NULL, cfh->ino);
+ if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
+ dentry = __fh_to_dentry(sb, cfh->parent_ino);
+ return dentry;
+}
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode)
- return ERR_PTR(-ESTALE);
+static int ceph_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct ceph_mds_client *mdsc;
+ struct ceph_mds_request *req;
+ int err;
- dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry)) {
- pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
- iput(inode);
- return dentry;
- }
- err = ceph_init_dentry(dentry);
- if (err < 0) {
- iput(inode);
- return ERR_PTR(err);
+ mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ mutex_lock(&parent->d_inode->i_mutex);
+
+ req->r_inode = child->d_inode;
+ ihold(child->d_inode);
+ req->r_ino2 = ceph_vino(parent->d_inode);
+ req->r_locked_dir = parent->d_inode;
+ req->r_num_caps = 2;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+
+ mutex_unlock(&parent->d_inode->i_mutex);
+
+ if (!err) {
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ memcpy(name, rinfo->dname, rinfo->dname_len);
+ name[rinfo->dname_len] = 0;
+ dout("get_name %p ino %llx.%llx name %s\n",
+ child, ceph_vinop(child->d_inode), name);
+ } else {
+ dout("get_name %p ino %llx.%llx err %d\n",
+ child, ceph_vinop(child->d_inode), err);
}
- dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
- return dentry;
+
+ ceph_mdsc_put_request(req);
+ return err;
}
const struct export_operations ceph_export_ops = {
.encode_fh = ceph_encode_fh,
.fh_to_dentry = ceph_fh_to_dentry,
.fh_to_parent = ceph_fh_to_parent,
+ .get_parent = ceph_get_parent,
+ .get_name = ceph_get_name,
};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3de89829e2a..302085100c2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -210,7 +210,7 @@ int ceph_open(struct inode *inode, struct file *file)
ihold(inode);
req->r_num_caps = 1;
- if (flags & (O_CREAT|O_TRUNC))
+ if (flags & O_CREAT)
parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
err = ceph_mdsc_do_request(mdsc, parent_inode, req);
iput(parent_inode);
@@ -286,12 +286,14 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+ ceph_init_acl(dentry, dentry->d_inode, dir);
*opened |= FILE_CREATED;
}
err = finish_open(file, dentry, ceph_open, opened);
}
-
out_err:
+ if (!req->r_err && req->r_target_inode)
+ ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
dout("atomic_open result=%d\n", err);
return err;
@@ -408,51 +410,82 @@ more:
*
* If the read spans object boundary, just do multiple reads.
*/
-static ssize_t ceph_sync_read(struct file *file, char __user *data,
- unsigned len, loff_t *poff, int *checkeof)
+static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
+ int *checkeof)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct page **pages;
- u64 off = *poff;
+ u64 off = iocb->ki_pos;
int num_pages, ret;
+ size_t len = iov_iter_count(i);
- dout("sync_read on file %p %llu~%u %s\n", file, off, len,
+ dout("sync_read on file %p %llu~%u %s\n", file, off,
+ (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
-
- if (file->f_flags & O_DIRECT) {
- num_pages = calc_pages_for((unsigned long)data, len);
- pages = ceph_get_direct_page_vector(data, num_pages, true);
- } else {
- num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
- }
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
/*
* flush any page cache pages in this range. this
* will make concurrent normal and sync io slow,
* but it will at least behave sensibly when they are
* in sequence.
*/
- ret = filemap_write_and_wait(inode->i_mapping);
+ ret = filemap_write_and_wait_range(inode->i_mapping, off,
+ off + len);
if (ret < 0)
- goto done;
+ return ret;
- ret = striped_read(inode, off, len, pages, num_pages, checkeof,
- file->f_flags & O_DIRECT,
- (unsigned long)data & ~PAGE_MASK);
+ if (file->f_flags & O_DIRECT) {
+ while (iov_iter_count(i)) {
+ size_t start;
+ ssize_t n;
- if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
- ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
- if (ret >= 0)
- *poff = off + ret;
+ n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
+ if (n < 0)
+ return n;
-done:
- if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages, true);
- else
+ num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ ret = striped_read(inode, off, n,
+ pages, num_pages, checkeof,
+ 1, start);
+
+ ceph_put_page_vector(pages, num_pages, true);
+
+ if (ret <= 0)
+ break;
+ off += ret;
+ iov_iter_advance(i, ret);
+ if (ret < n)
+ break;
+ }
+ } else {
+ num_pages = calc_pages_for(off, len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ ret = striped_read(inode, off, len, pages,
+ num_pages, checkeof, 0, 0);
+ if (ret > 0) {
+ int l, k = 0;
+ size_t left = ret;
+
+ while (left) {
+ int copy = min_t(size_t, PAGE_SIZE, left);
+ l = copy_page_to_iter(pages[k++], 0, copy, i);
+ off += l;
+ left -= l;
+ if (l < copy)
+ break;
+ }
+ }
ceph_release_page_vector(pages, num_pages);
+ }
+
+ if (off > iocb->ki_pos) {
+ ret = off - iocb->ki_pos;
+ iocb->ki_pos = off;
+ }
+
dout("sync_read result %d\n", ret);
return ret;
}
@@ -489,148 +522,249 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
}
}
+
/*
- * Synchronous write, straight from __user pointer or user pages (if
- * O_DIRECT).
+ * Synchronous write, straight from __user pointer or user pages.
*
* If write spans object boundary, just do multiple writes. (For a
* correct atomic write, we should e.g. take write locks on all
* objects, rollback on failure, etc.)
*/
-static ssize_t ceph_sync_write(struct file *file, const char __user *data,
- size_t left, loff_t pos, loff_t *ppos)
+static ssize_t
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
- int num_ops = 1;
struct page **pages;
int num_pages;
- u64 len;
int written = 0;
int flags;
int check_caps = 0;
- int page_align, io_align;
- unsigned long buf_align;
int ret;
struct timespec mtime = CURRENT_TIME;
- bool own_pages = false;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_write on file %p %lld~%u %s\n", file, pos,
- (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+ dout("sync_direct_write on file %p %lld~%u\n", file, pos,
+ (unsigned)count);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
+ ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
if (ret < 0)
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_CACHE_SHIFT,
- (pos + left) >> PAGE_CACHE_SHIFT);
+ (pos + count) >> PAGE_CACHE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
- if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
- flags |= CEPH_OSD_FLAG_ACK;
- else
- num_ops++; /* Also include a 'startsync' command. */
-
- /*
- * we may need to do multiple writes here if we span an object
- * boundary. this isn't atomic, unfortunately. :(
- */
-more:
- io_align = pos & ~PAGE_MASK;
- buf_align = (unsigned long)data & ~PAGE_MASK;
- len = left;
- snapc = ci->i_snap_realm->cached_context;
- vino = ceph_vino(inode);
- req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, pos, &len, num_ops,
- CEPH_OSD_OP_WRITE, flags, snapc,
- ci->i_truncate_seq, ci->i_truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ while (iov_iter_count(from) > 0) {
+ u64 len = iov_iter_single_seg_count(from);
+ size_t start;
+ ssize_t n;
+
+ snapc = ci->i_snap_realm->cached_context;
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ vino, pos, &len,
+ 2,/*include a 'startsync' command*/
+ CEPH_OSD_OP_WRITE, flags, snapc,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
- /* write from beginning of first page, regardless of io alignment */
- page_align = file->f_flags & O_DIRECT ? buf_align : io_align;
- num_pages = calc_pages_for(page_align, len);
- if (file->f_flags & O_DIRECT) {
- pages = ceph_get_direct_page_vector(data, num_pages, false);
- if (IS_ERR(pages)) {
- ret = PTR_ERR(pages);
- goto out;
+ n = iov_iter_get_pages_alloc(from, &pages, len, &start);
+ if (unlikely(n < 0)) {
+ ret = n;
+ ceph_osdc_put_request(req);
+ break;
}
+ num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* throw out any page cache pages in this range. this
* may block.
*/
truncate_inode_pages_range(inode->i_mapping, pos,
- (pos+len) | (PAGE_CACHE_SIZE-1));
- } else {
+ (pos+n) | (PAGE_CACHE_SIZE-1));
+ osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
+ false, false);
+
+ /* BUG_ON(vino.snap != CEPH_NOSNAP); */
+ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
+
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+ ceph_put_page_vector(pages, num_pages, false);
+
+ ceph_osdc_put_request(req);
+ if (ret)
+ break;
+ pos += n;
+ written += n;
+ iov_iter_advance(from, n);
+
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+ }
+
+ if (ret != -EOLDSNAPC && written > 0) {
+ iocb->ki_pos = pos;
+ ret = written;
+ }
+ return ret;
+}
+
+
+/*
+ * Synchronous write, straight from __user pointer or user pages.
+ *
+ * If write spans object boundary, just do multiple writes. (For a
+ * correct atomic write, we should e.g. take write locks on all
+ * objects, rollback on failure, etc.)
+ */
+static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_snap_context *snapc;
+ struct ceph_vino vino;
+ struct ceph_osd_request *req;
+ struct page **pages;
+ u64 len;
+ int num_pages;
+ int written = 0;
+ int flags;
+ int check_caps = 0;
+ int ret;
+ struct timespec mtime = CURRENT_TIME;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
+
+ if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count);
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ if (ret < 0)
+ return ret;
+
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ (pos + count) >> PAGE_CACHE_SHIFT);
+ if (ret < 0)
+ dout("invalidate_inode_pages2_range returned %d\n", ret);
+
+ flags = CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ACK;
+
+ while ((len = iov_iter_count(from)) > 0) {
+ size_t left;
+ int n;
+
+ snapc = ci->i_snap_realm->cached_context;
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ vino, pos, &len, 1,
+ CEPH_OSD_OP_WRITE, flags, snapc,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ /*
+ * write from beginning of first page,
+ * regardless of io alignment
+ */
+ num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
}
- ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
+
+ left = len;
+ for (n = 0; n < num_pages; n++) {
+ size_t plen = min_t(size_t, left, PAGE_SIZE);
+ ret = copy_page_from_iter(pages[n], 0, plen, from);
+ if (ret != plen) {
+ ret = -EFAULT;
+ break;
+ }
+ left -= ret;
+ }
+
if (ret < 0) {
ceph_release_page_vector(pages, num_pages);
goto out;
}
- if ((file->f_flags & O_SYNC) == 0) {
- /* get a second commit callback */
- req->r_unsafe_callback = ceph_sync_write_unsafe;
- req->r_inode = inode;
- own_pages = true;
- }
- }
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
- false, own_pages);
+ /* get a second commit callback */
+ req->r_unsafe_callback = ceph_sync_write_unsafe;
+ req->r_inode = inode;
- /* BUG_ON(vino.snap != CEPH_NOSNAP); */
- ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+ false, true);
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ /* BUG_ON(vino.snap != CEPH_NOSNAP); */
+ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
- if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages, false);
- else if (file->f_flags & O_SYNC)
- ceph_release_page_vector(pages, num_pages);
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
out:
- ceph_osdc_put_request(req);
- if (ret == 0) {
- pos += len;
- written += len;
- left -= len;
- data += len;
- if (left)
- goto more;
+ ceph_osdc_put_request(req);
+ if (ret == 0) {
+ pos += len;
+ written += len;
+
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+ } else
+ break;
+ }
+ if (ret != -EOLDSNAPC && written > 0) {
ret = written;
- *ppos = pos;
- if (pos > i_size_read(inode))
- check_caps = ceph_inode_set_size(inode, pos);
- if (check_caps)
- ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
- NULL);
- } else if (ret != -EOLDSNAPC && written > 0) {
- ret = written;
+ iocb->ki_pos = pos;
}
return ret;
}
@@ -642,60 +776,69 @@ out:
*
* Hmm, the sync read case isn't actually async... should it be?
*/
-static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *filp = iocb->ki_filp;
struct ceph_file_info *fi = filp->private_data;
- loff_t *ppos = &iocb->ki_pos;
- size_t len = iov->iov_len;
+ size_t len = iocb->ki_nbytes;
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
- void __user *base = iov->iov_base;
ssize_t ret;
int want, got = 0;
int checkeof = 0, read = 0;
- dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
- inode, ceph_vinop(inode), pos, (unsigned)len, inode);
again:
+ dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
if (ret < 0)
- goto out;
- dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)len,
- ceph_cap_string(got));
+ return ret;
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_filp->f_flags & O_DIRECT) ||
- (fi->flags & CEPH_F_SYNC))
+ (fi->flags & CEPH_F_SYNC)) {
+
+ dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
+
/* hmm, this isn't really async... */
- ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
- else
- ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ ret = ceph_sync_read(iocb, to, &checkeof);
+ } else {
+ dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
-out:
+ ret = generic_file_read_iter(iocb, to);
+ }
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
ceph_put_cap_refs(ci, got);
if (checkeof && ret >= 0) {
- int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+ int statret = ceph_do_getattr(inode,
+ CEPH_STAT_CAP_SIZE);
/* hit EOF or hole? */
- if (statret == 0 && *ppos < inode->i_size) {
- dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size);
+ if (statret == 0 && iocb->ki_pos < inode->i_size &&
+ ret < len) {
+ dout("sync_read hit hole, ppos %lld < size %lld"
+ ", reading more\n", iocb->ki_pos,
+ inode->i_size);
+
+ iov_iter_advance(to, ret);
read += ret;
- base += ret;
len -= ret;
checkeof = 0;
goto again;
}
}
+
if (ret >= 0)
ret += read;
@@ -712,8 +855,7 @@ out:
*
* If we are near ENOSPC, write synchronously.
*/
-static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct ceph_file_info *fi = file->private_data;
@@ -721,18 +863,15 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
- ssize_t count, written = 0;
+ ssize_t count = iov_iter_count(from), written = 0;
int err, want, got;
+ loff_t pos = iocb->ki_pos;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
mutex_lock(&inode->i_mutex);
- err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
- if (err)
- goto out;
-
/* We can write back this queue in page reclaim */
current->backing_dev_info = file->f_mapping->backing_dev_info;
@@ -742,6 +881,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
+ iov_iter_truncate(from, count);
err = file_remove_suid(file);
if (err)
@@ -772,20 +912,27 @@ retry_snap:
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
- (fi->flags & CEPH_F_SYNC)) {
+ (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ struct iov_iter data;
mutex_unlock(&inode->i_mutex);
- written = ceph_sync_write(file, iov->iov_base, count,
- pos, &iocb->ki_pos);
+ /* we might need to revert back to that point */
+ data = *from;
+ if (file->f_flags & O_DIRECT)
+ written = ceph_sync_direct_write(iocb, &data);
+ else
+ written = ceph_sync_write(iocb, &data);
if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n",
inode, ceph_vinop(inode),
- pos, (unsigned)iov->iov_len);
+ pos, (unsigned)count);
mutex_lock(&inode->i_mutex);
goto retry_snap;
}
+ if (written > 0)
+ iov_iter_advance(from, written);
} else {
+ loff_t old_size = inode->i_size;
/*
* No need to acquire the i_truncate_mutex. Because
* the MDS revokes Fwb caps before sending truncate
@@ -793,9 +940,11 @@ retry_snap:
* are pending vmtruncate. So write and vmtruncate
* can not run at the same time
*/
- written = generic_file_buffered_write(iocb, iov, nr_segs,
- pos, &iocb->ki_pos,
- count, 0);
+ written = generic_perform_write(file, from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
+ if (inode->i_size > old_size)
+ ceph_fscache_update_objectsize(inode);
mutex_unlock(&inode->i_mutex);
}
@@ -809,7 +958,7 @@ retry_snap:
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ inode, ceph_vinop(inode), pos, (unsigned)count,
ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
@@ -1018,7 +1167,7 @@ static long ceph_fallocate(struct file *file, int mode,
loff_t offset, loff_t length)
{
struct ceph_file_info *fi = file->private_data;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
@@ -1031,9 +1180,6 @@ static long ceph_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
-
mutex_lock(&inode->i_mutex);
if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -1089,16 +1235,16 @@ const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
.llseek = ceph_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = ceph_aio_read,
- .aio_write = ceph_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = ceph_read_iter,
+ .write_iter = ceph_write_iter,
.mmap = ceph_mmap,
.fsync = ceph_fsync,
.lock = ceph_lock,
.flock = ceph_flock,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = ceph_ioctl,
.compat_ioctl = ceph_ioctl,
.fallocate = ceph_fallocate,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8549a48115f..04c89c266ce 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -9,6 +9,8 @@
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/vmalloc.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
#include "super.h"
#include "mds_client.h"
@@ -95,6 +97,8 @@ const struct inode_operations ceph_file_iops = {
.getxattr = ceph_getxattr,
.listxattr = ceph_listxattr,
.removexattr = ceph_removexattr,
+ .get_acl = ceph_get_acl,
+ .set_acl = ceph_set_acl,
};
@@ -176,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
* specified, copy the frag delegation info to the caller if
* it is present.
*/
-u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
- struct ceph_inode_frag *pfrag,
- int *found)
+static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
{
u32 t = ceph_frag_make(0, 0);
struct ceph_inode_frag *frag;
@@ -188,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
if (found)
*found = 0;
- mutex_lock(&ci->i_fragtree_mutex);
while (1) {
WARN_ON(!ceph_frag_contains_value(t, v));
frag = __ceph_find_frag(ci, t);
@@ -217,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
}
dout("choose_frag(%x) = %x\n", v, t);
- mutex_unlock(&ci->i_fragtree_mutex);
return t;
}
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
+{
+ u32 ret;
+ mutex_lock(&ci->i_fragtree_mutex);
+ ret = __ceph_choose_frag(ci, v, pfrag, found);
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return ret;
+}
+
/*
* Process dirfrag (delegation) info from the mds. Include leaf
* fragment in tree ONLY if ndist > 0. Otherwise, only
@@ -234,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode,
u32 id = le32_to_cpu(dirinfo->frag);
int mds = le32_to_cpu(dirinfo->auth);
int ndist = le32_to_cpu(dirinfo->ndist);
+ int diri_auth = -1;
int i;
int err = 0;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ diri_auth = ci->i_auth_cap->mds;
+ spin_unlock(&ci->i_ceph_lock);
+
mutex_lock(&ci->i_fragtree_mutex);
- if (ndist == 0) {
+ if (ndist == 0 && mds == diri_auth) {
/* no delegation info needed. */
frag = __ceph_find_frag(ci, id);
if (!frag)
@@ -283,6 +300,75 @@ out:
return err;
}
+static int ceph_fill_fragtree(struct inode *inode,
+ struct ceph_frag_tree_head *fragtree,
+ struct ceph_mds_reply_dirfrag *dirinfo)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag;
+ struct rb_node *rb_node;
+ int i;
+ u32 id, nsplits;
+ bool update = false;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ nsplits = le32_to_cpu(fragtree->nsplits);
+ if (nsplits) {
+ i = prandom_u32() % nsplits;
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ if (!__ceph_find_frag(ci, id))
+ update = true;
+ } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
+ rb_node = rb_first(&ci->i_fragtree);
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
+ update = true;
+ }
+ if (!update && dirinfo) {
+ id = le32_to_cpu(dirinfo->frag);
+ if (id != __ceph_choose_frag(ci, id, NULL, NULL))
+ update = true;
+ }
+ if (!update)
+ goto out_unlock;
+
+ dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
+ rb_node = rb_first(&ci->i_fragtree);
+ for (i = 0; i < nsplits; i++) {
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ frag = NULL;
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (ceph_frag_compare(frag->frag, id) >= 0) {
+ if (frag->frag != id)
+ frag = NULL;
+ else
+ rb_node = rb_next(rb_node);
+ break;
+ }
+ rb_node = rb_next(rb_node);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ frag = NULL;
+ }
+ if (!frag) {
+ frag = __get_or_create_frag(ci, id);
+ if (IS_ERR(frag))
+ continue;
+ }
+ frag->split_by = le32_to_cpu(fragtree->splits[i].by);
+ dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ }
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ rb_node = rb_next(rb_node);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ }
+out_unlock:
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return 0;
+}
/*
* initialize a newly allocated inode.
@@ -335,9 +421,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
INIT_LIST_HEAD(&ci->i_cap_delay_list);
- ci->i_cap_exporting_mds = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_issued = 0;
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
@@ -406,7 +489,7 @@ void ceph_destroy_inode(struct inode *inode)
/*
* we may still have a snap_realm reference if there are stray
- * caps in i_cap_exporting_issued or i_snap_caps.
+ * caps in i_snap_caps.
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
@@ -436,6 +519,16 @@ void ceph_destroy_inode(struct inode *inode)
call_rcu(&inode->i_rcu, ceph_i_callback);
}
+int ceph_drop_inode(struct inode *inode)
+{
+ /*
+ * Positve dentry and corresponding inode are always accompanied
+ * in MDS reply. So no need to keep inode in the cache after
+ * dropping all its aliases.
+ */
+ return 1;
+}
+
/*
* Helpers to fill in size, ctime, mtime, and atime. We have to be
* careful because either the client or MDS may have more up to date
@@ -571,20 +664,26 @@ static int fill_inode(struct inode *inode,
unsigned long ttl_from, int cap_fmode,
struct ceph_cap_reservation *caps_reservation)
{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
- int i;
- int issued = 0, implemented;
+ int issued = 0, implemented, new_issued;
struct timespec mtime, atime, ctime;
- u32 nsplits;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_cap *new_cap = NULL;
int err = 0;
- int queue_trunc = 0;
+ bool wake = false;
+ bool queue_trunc = false;
+ bool new_version = false;
dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
+ /* prealloc new cap struct */
+ if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP)
+ new_cap = ceph_get_cap(mdsc, caps_reservation);
+
/*
* prealloc xattr data, if it looks like we'll need it. only
* if len > 4 (meaning there are actually xattrs; the first 4
@@ -610,19 +709,23 @@ static int fill_inode(struct inode *inode,
* 3 2 skip
* 3 3 update
*/
- if (le64_to_cpu(info->version) > 0 &&
- (ci->i_version & ~1) >= le64_to_cpu(info->version))
- goto no_change;
-
+ if (ci->i_version == 0 ||
+ ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ le64_to_cpu(info->version) > (ci->i_version & ~1)))
+ new_version = true;
+
issued = __ceph_caps_issued(ci, &implemented);
issued |= implemented | __ceph_caps_dirty(ci);
+ new_issued = ~issued & le32_to_cpu(info->cap.caps);
/* update inode */
ci->i_version = le64_to_cpu(info->version);
inode->i_version++;
inode->i_rdev = le32_to_cpu(info->rdev);
+ inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
+ (issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(info->mode);
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
@@ -631,31 +734,35 @@ static int fill_inode(struct inode *inode,
from_kgid(&init_user_ns, inode->i_gid));
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
+ (issued & CEPH_CAP_LINK_EXCL) == 0)
set_nlink(inode, le32_to_cpu(info->nlink));
- /* be careful with mtime, atime, size */
- ceph_decode_timespec(&atime, &info->atime);
- ceph_decode_timespec(&mtime, &info->mtime);
- ceph_decode_timespec(&ctime, &info->ctime);
- queue_trunc = ceph_fill_file_size(inode, issued,
- le32_to_cpu(info->truncate_seq),
- le64_to_cpu(info->truncate_size),
- le64_to_cpu(info->size));
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(info->time_warp_seq),
- &ctime, &mtime, &atime);
-
- /* only update max_size on auth cap */
- if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
- ci->i_max_size != le64_to_cpu(info->max_size)) {
- dout("max_size %lld -> %llu\n", ci->i_max_size,
- le64_to_cpu(info->max_size));
- ci->i_max_size = le64_to_cpu(info->max_size);
+ if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
+ /* be careful with mtime, atime, size */
+ ceph_decode_timespec(&atime, &info->atime);
+ ceph_decode_timespec(&mtime, &info->mtime);
+ ceph_decode_timespec(&ctime, &info->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(info->time_warp_seq),
+ &ctime, &mtime, &atime);
}
- ci->i_layout = info->layout;
- inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ if (new_version ||
+ (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ ci->i_layout = info->layout;
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(info->truncate_seq),
+ le64_to_cpu(info->truncate_size),
+ le64_to_cpu(info->size));
+ /* only update max_size on auth cap */
+ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ ci->i_max_size != le64_to_cpu(info->max_size)) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size,
+ le64_to_cpu(info->max_size));
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ }
+ }
/* xattrs */
/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
@@ -668,6 +775,7 @@ static int fill_inode(struct inode *inode,
memcpy(ci->i_xattrs.blob->vec.iov_base,
iinfo->xattr_data, iinfo->xattr_len);
ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+ ceph_forget_all_cached_acls(inode);
xattr_blob = NULL;
}
@@ -738,29 +846,7 @@ static int fill_inode(struct inode *inode,
!__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode);
__ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
- ci->i_max_offset = 2;
- }
-no_change:
- spin_unlock(&ci->i_ceph_lock);
-
- /* queue truncate if we saw i_size decrease */
- if (queue_trunc)
- ceph_queue_vmtruncate(inode);
-
- /* populate frag tree */
- /* FIXME: move me up, if/when version reflects fragtree changes */
- nsplits = le32_to_cpu(info->fragtree.nsplits);
- mutex_lock(&ci->i_fragtree_mutex);
- for (i = 0; i < nsplits; i++) {
- u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
- struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
-
- if (IS_ERR(frag))
- continue;
- frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
- dout(" frag %x split by %d\n", frag->frag, frag->split_by);
}
- mutex_unlock(&ci->i_fragtree_mutex);
/* were we issued a capability? */
if (info->cap.caps) {
@@ -773,30 +859,41 @@ no_change:
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
le64_to_cpu(info->cap.realm),
- info->cap.flags,
- caps_reservation);
+ info->cap.flags, &new_cap);
+ wake = true;
} else {
- spin_lock(&ci->i_ceph_lock);
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(le32_to_cpu(info->cap.caps)));
ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
if (cap_fmode >= 0)
__ceph_get_fmode(ci, cap_fmode);
- spin_unlock(&ci->i_ceph_lock);
}
} else if (cap_fmode >= 0) {
- pr_warning("mds issued no caps on %llx.%llx\n",
+ pr_warn("mds issued no caps on %llx.%llx\n",
ceph_vinop(inode));
__ceph_get_fmode(ci, cap_fmode);
}
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (wake)
+ wake_up_all(&ci->i_cap_wq);
+
+ /* queue truncate if we saw i_size decrease */
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+
+ /* populate frag tree */
+ if (S_ISDIR(inode->i_mode))
+ ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
/* update delegation info? */
if (dirinfo)
ceph_fill_dirfrag(inode, dirinfo);
err = 0;
-
out:
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
if (xattr_blob)
ceph_buffer_put(xattr_blob);
return err;
@@ -853,41 +950,6 @@ out_unlock:
}
/*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- *
- * Always called under directory's i_mutex.
- */
-static void ceph_set_dentry_offset(struct dentry *dn)
-{
- struct dentry *dir = dn->d_parent;
- struct inode *inode = dir->d_inode;
- struct ceph_inode_info *ci;
- struct ceph_dentry_info *di;
-
- BUG_ON(!inode);
-
- ci = ceph_inode(inode);
- di = ceph_dentry(dn);
-
- spin_lock(&ci->i_ceph_lock);
- if (!__ceph_dir_is_complete(ci)) {
- spin_unlock(&ci->i_ceph_lock);
- return;
- }
- di->offset = ceph_inode(inode)->i_max_offset++;
- spin_unlock(&ci->i_ceph_lock);
-
- spin_lock(&dir->d_lock);
- spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_u.d_child, &dir->d_subdirs);
- dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
- dn->d_u.d_child.prev, dn->d_u.d_child.next);
- spin_unlock(&dn->d_lock);
- spin_unlock(&dir->d_lock);
-}
-
-/*
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
*
@@ -896,7 +958,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
* the caller) if we fail.
*/
static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
- bool *prehash, bool set_offset)
+ bool *prehash)
{
struct dentry *realdn;
@@ -928,8 +990,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
}
if ((!prehash || *prehash) && d_unhashed(dn))
d_rehash(dn);
- if (set_offset)
- ceph_set_dentry_offset(dn);
out:
return dn;
}
@@ -950,10 +1010,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
{
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct inode *in = NULL;
- struct ceph_mds_reply_inode *ininfo;
struct ceph_vino vino;
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
- int i = 0;
int err = 0;
dout("fill_trace %p is_dentry %d is_target %d\n", req,
@@ -1008,10 +1066,82 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
session, req->r_request_started, -1,
&req->r_caps_reservation);
if (err < 0)
- return err;
+ goto done;
} else {
WARN_ON_ONCE(1);
}
+
+ if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+ struct qstr dname;
+ struct dentry *dn, *parent;
+
+ BUG_ON(!rinfo->head->is_target);
+ BUG_ON(req->r_dentry);
+
+ parent = d_find_any_alias(dir);
+ BUG_ON(!parent);
+
+ dname.name = rinfo->dname;
+ dname.len = rinfo->dname_len;
+ dname.hash = full_name_hash(dname.name, dname.len);
+ vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+retry_lookup:
+ dn = d_lookup(parent, &dname);
+ dout("d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
+
+ if (!dn) {
+ dn = d_alloc(parent, &dname);
+ dout("d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
+ if (dn == NULL) {
+ dput(parent);
+ err = -ENOMEM;
+ goto done;
+ }
+ err = ceph_init_dentry(dn);
+ if (err < 0) {
+ dput(dn);
+ dput(parent);
+ goto done;
+ }
+ } else if (dn->d_inode &&
+ (ceph_ino(dn->d_inode) != vino.ino ||
+ ceph_snap(dn->d_inode) != vino.snap)) {
+ dout(" dn %p points to wrong inode %p\n",
+ dn, dn->d_inode);
+ d_delete(dn);
+ dput(dn);
+ goto retry_lookup;
+ }
+
+ req->r_dentry = dn;
+ dput(parent);
+ }
+ }
+
+ if (rinfo->head->is_target) {
+ vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+ in = ceph_get_inode(sb, vino);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ goto done;
+ }
+ req->r_target_inode = in;
+
+ err = fill_inode(in, &rinfo->targeti, NULL,
+ session, req->r_request_started,
+ (!req->r_aborted && rinfo->head->result == 0) ?
+ req->r_fmode : -1,
+ &req->r_caps_reservation);
+ if (err < 0) {
+ pr_err("fill_inode badness %p %llx.%llx\n",
+ in, ceph_vinop(in));
+ goto done;
+ }
}
/*
@@ -1053,6 +1183,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
/* rename? */
if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+ struct inode *olddir = req->r_old_dentry_dir;
+ BUG_ON(!olddir);
+
dout(" src %p '%.*s' dst %p '%.*s'\n",
req->r_old_dentry,
req->r_old_dentry->d_name.len,
@@ -1072,18 +1205,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
- /*
- * d_move() puts the renamed dentry at the end of
- * d_subdirs. We need to assign it an appropriate
- * directory offset so we can behave when dir is
- * complete.
- */
- ceph_set_dentry_offset(req->r_old_dentry);
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_complete(dir);
+ ceph_dir_clear_complete(olddir);
+
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
dn = req->r_old_dentry; /* use old_dentry */
- in = dn->d_inode;
}
/* null dentry? */
@@ -1105,99 +1234,46 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
}
/* attach proper inode */
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = dn->d_inode;
- if (!in) {
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- pr_err("fill_trace bad get_inode "
- "%llx.%llx\n", vino.ino, vino.snap);
- err = PTR_ERR(in);
- d_drop(dn);
- goto done;
- }
- dn = splice_dentry(dn, in, &have_lease, true);
+ if (!dn->d_inode) {
+ ceph_dir_clear_complete(dir);
+ ihold(in);
+ dn = splice_dentry(dn, in, &have_lease);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
}
req->r_dentry = dn; /* may have spliced */
- ihold(in);
- } else if (ceph_ino(in) == vino.ino &&
- ceph_snap(in) == vino.snap) {
- ihold(in);
- } else {
+ } else if (dn->d_inode && dn->d_inode != in) {
dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
- dn, in, ceph_ino(in), ceph_snap(in),
- vino.ino, vino.snap);
+ dn, dn->d_inode, ceph_vinop(dn->d_inode),
+ ceph_vinop(in));
have_lease = false;
- in = NULL;
}
if (have_lease)
update_dentry_lease(dn, rinfo->dlease, session,
req->r_request_started);
dout(" final dn %p\n", dn);
- i++;
- } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
- req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) {
+ } else if (!req->r_aborted &&
+ (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+ req->r_op == CEPH_MDS_OP_MKSNAP)) {
struct dentry *dn = req->r_dentry;
+ struct inode *dir = req->r_locked_dir;
/* fill out a snapdir LOOKUPSNAP dentry */
BUG_ON(!dn);
- BUG_ON(!req->r_locked_dir);
- BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- pr_err("fill_inode get_inode badness %llx.%llx\n",
- vino.ino, vino.snap);
- err = PTR_ERR(in);
- d_delete(dn);
- goto done;
- }
+ BUG_ON(!dir);
+ BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
dout(" linking snapped dir %p to dn %p\n", in, dn);
- dn = splice_dentry(dn, in, NULL, true);
+ ceph_dir_clear_complete(dir);
+ ihold(in);
+ dn = splice_dentry(dn, in, NULL);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
}
req->r_dentry = dn; /* may have spliced */
- ihold(in);
- rinfo->head->is_dentry = 1; /* fool notrace handlers */
- }
-
- if (rinfo->head->is_target) {
- vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-
- if (in == NULL || ceph_ino(in) != vino.ino ||
- ceph_snap(in) != vino.snap) {
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- err = PTR_ERR(in);
- goto done;
- }
- }
- req->r_target_inode = in;
-
- err = fill_inode(in,
- &rinfo->targeti, NULL,
- session, req->r_request_started,
- (le32_to_cpu(rinfo->head->result) == 0) ?
- req->r_fmode : -1,
- &req->r_caps_reservation);
- if (err < 0) {
- pr_err("fill_inode badness %p %llx.%llx\n",
- in, ceph_vinop(in));
- goto done;
- }
}
-
done:
dout("fill_trace done err=%d\n", err);
return err;
@@ -1247,11 +1323,23 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct qstr dname;
struct dentry *dn;
struct inode *in;
- int err = 0, i;
+ int err = 0, ret, i;
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
- u64 frag = le32_to_cpu(rhead->args.readdir.frag);
struct ceph_dentry_info *di;
+ u64 r_readdir_offset = req->r_readdir_offset;
+ u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+
+ if (rinfo->dir_dir &&
+ le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+ dout("readdir_prepopulate got new frag %x -> %x\n",
+ frag, le32_to_cpu(rinfo->dir_dir->frag));
+ frag = le32_to_cpu(rinfo->dir_dir->frag);
+ if (ceph_frag_is_leftmost(frag))
+ r_readdir_offset = 2;
+ else
+ r_readdir_offset = 0;
+ }
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
@@ -1268,6 +1356,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
}
+ /* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_vino vino;
@@ -1292,9 +1381,10 @@ retry_lookup:
err = -ENOMEM;
goto out;
}
- err = ceph_init_dentry(dn);
- if (err < 0) {
+ ret = ceph_init_dentry(dn);
+ if (ret < 0) {
dput(dn);
+ err = ret;
goto out;
}
} else if (dn->d_inode &&
@@ -1314,9 +1404,6 @@ retry_lookup:
spin_unlock(&parent->d_lock);
}
- di = dn->d_fsdata;
- di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
-
/* inode */
if (dn->d_inode) {
in = dn->d_inode;
@@ -1329,26 +1416,39 @@ retry_lookup:
err = PTR_ERR(in);
goto out;
}
- dn = splice_dentry(dn, in, NULL, false);
- if (IS_ERR(dn))
- dn = NULL;
}
if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation) < 0) {
pr_err("fill_inode badness on %p\n", in);
+ if (!dn->d_inode)
+ iput(in);
+ d_drop(dn);
goto next_item;
}
- if (dn)
- update_dentry_lease(dn, rinfo->dir_dlease[i],
- req->r_session,
- req->r_request_started);
+
+ if (!dn->d_inode) {
+ dn = splice_dentry(dn, in, NULL);
+ if (IS_ERR(dn)) {
+ err = PTR_ERR(dn);
+ dn = NULL;
+ goto next_item;
+ }
+ }
+
+ di = dn->d_fsdata;
+ di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+
+ update_dentry_lease(dn, rinfo->dir_dlease[i],
+ req->r_session,
+ req->r_request_started);
next_item:
if (dn)
dput(dn);
}
- req->r_did_prepopulate = true;
+ if (err == 0)
+ req->r_did_prepopulate = true;
out:
if (snapdir) {
@@ -1437,7 +1537,8 @@ static void ceph_invalidate_work(struct work_struct *work)
dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking);
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
- /* nevermind! */
+ if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+ check = 1;
spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
goto out;
@@ -1445,7 +1546,7 @@ static void ceph_invalidate_work(struct work_struct *work)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- truncate_inode_pages(inode->i_mapping, 0);
+ truncate_pagecache(inode, 0);
spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
@@ -1458,13 +1559,14 @@ static void ceph_invalidate_work(struct work_struct *work)
dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
inode, orig_gen, ci->i_rdcache_gen,
ci->i_rdcache_revoking);
+ if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+ check = 1;
}
spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
-
+out:
if (check)
ceph_check_caps(ci, 0, NULL);
-out:
iput(inode);
}
@@ -1547,7 +1649,7 @@ retry:
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
- truncate_inode_pages(inode->i_mapping, to);
+ truncate_pagecache(inode, to);
spin_lock(&ci->i_ceph_lock);
if (to == ci->i_truncate_size) {
@@ -1594,7 +1696,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode;
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
@@ -1658,6 +1759,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
attr->ia_mode != inode->i_mode) {
+ inode->i_mode = attr->ia_mode;
req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
mask |= CEPH_SETATTR_MODE;
release |= CEPH_CAP_AUTH_SHARED;
@@ -1773,15 +1875,19 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
+ if (ia_valid & ATTR_MODE) {
+ err = posix_acl_chmod(inode, attr->ia_mode);
+ if (err)
+ goto out_put;
+ }
+
if (mask) {
req->r_inode = inode;
ihold(inode);
req->r_inode_drop = release;
req->r_args.setattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1;
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
}
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
ceph_cap_string(dirtied), mask);
@@ -1792,6 +1898,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
return err;
out:
spin_unlock(&ci->i_ceph_lock);
+out_put:
ceph_mdsc_put_request(req);
return err;
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 669622fd1ae..a822a6e5829 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,9 +1,8 @@
+#include <linux/ceph/ceph_debug.h>
#include <linux/in.h>
#include "super.h"
#include "mds_client.h"
-#include <linux/ceph/ceph_debug.h>
-
#include "ioctl.h"
@@ -64,7 +63,6 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct inode *parent_inode;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
@@ -111,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
+
req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
req->r_args.setlayout.layout.fl_stripe_unit =
@@ -121,9 +121,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
cpu_to_le32(l.object_size);
req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
- parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
}
@@ -157,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
req->r_args.setlayout.layout.fl_stripe_unit =
cpu_to_le32(l.stripe_unit);
@@ -183,6 +182,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
+ struct ceph_object_locator oloc;
+ struct ceph_object_id oid;
u64 len = 1, olen;
u64 tmp;
struct ceph_pg pgid;
@@ -211,8 +212,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
- ceph_file_layout_pg_pool(ci->i_layout));
+ oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
+ ceph_oid_set_name(&oid, dl.object_name);
+
+ r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
if (r < 0) {
up_read(&osdc->map_sem);
return r;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae6d14e82b0..fbc39c47bac 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -2,11 +2,31 @@
#include <linux/file.h>
#include <linux/namei.h>
+#include <linux/random.h>
#include "super.h"
#include "mds_client.h"
#include <linux/ceph/pagelist.h>
+static u64 lock_secret;
+
+static inline u64 secure_addr(void *addr)
+{
+ u64 v = lock_secret ^ (u64)(unsigned long)addr;
+ /*
+ * Set the most significant bit, so that MDS knows the 'owner'
+ * is sufficient to identify the owner of lock. (old code uses
+ * both 'owner' and 'pid')
+ */
+ v |= (1ULL << 63);
+ return v;
+}
+
+void __init ceph_flock_init(void)
+{
+ get_random_bytes(&lock_secret, sizeof(lock_secret));
+}
+
/**
* Implement fcntl and flock locking functions.
*/
@@ -14,17 +34,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
int cmd, u8 wait, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
int err;
u64 length = 0;
+ u64 owner;
req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
/* mds requires start and length rather than start and end */
if (LLONG_MAX == fl->fl_end)
@@ -32,25 +53,24 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
else
length = fl->fl_end - fl->fl_start + 1;
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type);
+ owner = secure_addr(fl->fl_owner);
+
+ dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
+ "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
+ (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
+ wait, fl->fl_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
+ req->r_args.filelock_change.owner = cpu_to_le64(owner);
req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
- /* This should be adjusted, but I'm not sure if
- namespaces actually get id numbers*/
- req->r_args.filelock_change.pid_namespace =
- cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
err = ceph_mdsc_do_request(mdsc, inode, req);
- if ( operation == CEPH_MDS_OP_GETFILELOCK){
+ if (operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
fl->fl_type = F_RDLCK;
@@ -87,14 +107,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
u8 wait = 0;
u16 op = CEPH_MDS_OP_SETFILELOCK;
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_lock, fl_pid:%d", fl->fl_pid);
+ if (!(fl->fl_flags & FL_POSIX))
+ return -ENOLCK;
+ /* No mandatory locks */
+ if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+ return -ENOLCK;
+
+ dout("ceph_lock, fl_owner: %p", fl->fl_owner);
/* set wait bit as appropriate, then make command as Ceph expects it*/
- if (F_SETLKW == cmd)
- wait = 1;
- if (F_GETLK == cmd)
+ if (IS_GETLK(cmd))
op = CEPH_MDS_OP_GETFILELOCK;
+ else if (IS_SETLKW(cmd))
+ wait = 1;
if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
@@ -105,7 +130,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
if (!err) {
- if ( op != CEPH_MDS_OP_GETFILELOCK ){
+ if (op != CEPH_MDS_OP_GETFILELOCK) {
dout("mds locked, locking locally");
err = posix_lock_file(file, fl, NULL);
if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
@@ -131,20 +156,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
u8 lock_cmd;
int err;
- u8 wait = 1;
-
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_flock, fl_pid:%d", fl->fl_pid);
-
- /* set wait bit, then clear it out of cmd*/
- if (cmd & LOCK_NB)
- wait = 0;
- cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
- /* set command sequence that Ceph wants to see:
- shared lock, exclusive lock, or unlock */
- if (LOCK_SH == cmd)
+ u8 wait = 0;
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
+ /* No mandatory locks */
+ if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+ return -ENOLCK;
+
+ dout("ceph_flock, fl_file: %p", fl->fl_file);
+
+ if (IS_SETLKW(cmd))
+ wait = 1;
+
+ if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
- else if (LOCK_EX == cmd)
+ else if (F_WRLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
@@ -280,13 +307,11 @@ int lock_to_ceph_filelock(struct file_lock *lock,
struct ceph_filelock *cephlock)
{
int err = 0;
-
cephlock->start = cpu_to_le64(lock->fl_start);
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
- cephlock->pid = cpu_to_le64(lock->fl_pid);
- cephlock->pid_namespace =
- cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
+ cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
+ cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
switch (lock->fl_type) {
case F_RDLCK:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b7bda5d9611..92a2548278f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
#include <linux/fs.h>
#include <linux/wait.h>
#include <linux/slab.h>
+#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -43,6 +44,7 @@
*/
struct ceph_reconnect_state {
+ int nr_caps;
struct ceph_pagelist *pagelist;
bool flock;
};
@@ -62,7 +64,7 @@ static const struct ceph_connection_operations mds_con_ops;
*/
static int parse_reply_info_in(void **p, void *end,
struct ceph_mds_reply_info_in *info,
- int features)
+ u64 features)
{
int err = -EIO;
@@ -97,7 +99,7 @@ bad:
*/
static int parse_reply_info_trace(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
int err;
@@ -144,7 +146,7 @@ out_bad:
*/
static int parse_reply_info_dir(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
u32 num, i = 0;
int err;
@@ -164,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end,
if (num == 0)
goto done;
- /* alloc large array */
- info->dir_nr = num;
- info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
- sizeof(*info->dir_dname) +
- sizeof(*info->dir_dname_len) +
- sizeof(*info->dir_dlease),
- GFP_NOFS);
- if (info->dir_in == NULL) {
- err = -ENOMEM;
- goto out_bad;
- }
+ BUG_ON(!info->dir_in);
info->dir_dname = (void *)(info->dir_in + num);
info->dir_dname_len = (void *)(info->dir_dname + num);
info->dir_dlease = (void *)(info->dir_dname_len + num);
+ if ((unsigned long)(info->dir_dlease + num) >
+ (unsigned long)info->dir_in + info->dir_buf_size) {
+ pr_err("dir contents are larger than expected\n");
+ WARN_ON(1);
+ goto bad;
+ }
+ info->dir_nr = num;
while (num) {
/* dentry */
ceph_decode_need(p, end, sizeof(u32)*2, bad);
@@ -216,7 +215,7 @@ out_bad:
*/
static int parse_reply_info_filelock(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
if (*p + sizeof(*info->filelock_reply) > end)
goto bad;
@@ -237,7 +236,7 @@ bad:
*/
static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
if (*p == end) {
@@ -261,7 +260,7 @@ bad:
*/
static int parse_reply_info_extra(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
@@ -279,7 +278,7 @@ static int parse_reply_info_extra(void **p, void *end,
*/
static int parse_reply_info(struct ceph_msg *msg,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
void *p, *end;
u32 len;
@@ -326,7 +325,9 @@ out_bad:
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{
- kfree(info->dir_in);
+ if (!info->dir_in)
+ return;
+ free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
}
@@ -443,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
s->s_num_cap_releases = 0;
+ s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
INIT_LIST_HEAD(&s->s_cap_releases_done);
@@ -510,12 +512,11 @@ void ceph_mdsc_release_request(struct kref *kref)
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
+ destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
- if (req->r_reply) {
+ if (req->r_reply)
ceph_msg_put(req->r_reply);
- destroy_reply_info(&req->r_reply_info);
- }
if (req->r_inode) {
ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
iput(req->r_inode);
@@ -526,7 +527,9 @@ void ceph_mdsc_release_request(struct kref *kref)
iput(req->r_target_inode);
if (req->r_dentry)
dput(req->r_dentry);
- if (req->r_old_dentry) {
+ if (req->r_old_dentry)
+ dput(req->r_old_dentry);
+ if (req->r_old_dentry_dir) {
/*
* track (and drop pins for) r_old_dentry_dir
* separately, since r_old_dentry's d_parent may have
@@ -535,7 +538,6 @@ void ceph_mdsc_release_request(struct kref *kref)
*/
ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- dput(req->r_old_dentry);
iput(req->r_old_dentry_dir);
}
kfree(req->r_path1);
@@ -642,6 +644,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
req->r_unsafe_dir = NULL;
}
+ complete_all(&req->r_safe_completion);
+
ceph_mdsc_put_request(req);
}
@@ -709,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
struct dentry *dn = get_nonsnap_parent(parent);
inode = dn->d_inode;
dout("__choose_mds using nonsnap parent %p\n", inode);
- } else if (req->r_dentry->d_inode) {
+ } else {
/* dentry target */
inode = req->r_dentry->d_inode;
- } else {
- /* dir + name */
- inode = dir;
- hash = ceph_dentry_hash(dir, req->r_dentry);
- is_hash = true;
+ if (!inode || mode == USE_AUTH_MDS) {
+ /* dir + name */
+ inode = dir;
+ hash = ceph_dentry_hash(dir, req->r_dentry);
+ is_hash = true;
+ }
}
}
@@ -842,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc,
*
* called under mdsc->mutex
*/
+static struct ceph_mds_session *
+__open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+ struct ceph_mds_session *session;
+
+ session = __ceph_lookup_mds_session(mdsc, target);
+ if (!session) {
+ session = register_session(mdsc, target);
+ if (IS_ERR(session))
+ return session;
+ }
+ if (session->s_state == CEPH_MDS_SESSION_NEW ||
+ session->s_state == CEPH_MDS_SESSION_CLOSING)
+ __open_session(mdsc, session);
+
+ return session;
+}
+
+struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+ struct ceph_mds_session *session;
+
+ dout("open_export_target_session to mds%d\n", target);
+
+ mutex_lock(&mdsc->mutex);
+ session = __open_export_target_session(mdsc, target);
+ mutex_unlock(&mdsc->mutex);
+
+ return session;
+}
+
static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_mds_info *mi;
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
- int target;
if (mds >= mdsc->mdsmap->m_max_mds)
return;
+
mi = &mdsc->mdsmap->m_info[mds];
dout("open_export_target_sessions for mds%d (%d targets)\n",
session->s_mds, mi->num_export_targets);
for (i = 0; i < mi->num_export_targets; i++) {
- target = mi->export_targets[i];
- ts = __ceph_lookup_mds_session(mdsc, target);
- if (!ts) {
- ts = register_session(mdsc, target);
- if (IS_ERR(ts))
- return;
- }
- if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
- __open_session(mdsc, session);
- else
- dout(" mds%d target mds%d %p is %s\n", session->s_mds,
- i, ts, session_state_name(ts->s_state));
- ceph_put_mds_session(ts);
+ ts = __open_export_target_session(mdsc, mi->export_targets[i]);
+ if (!IS_ERR(ts))
+ ceph_put_mds_session(ts);
}
}
@@ -986,7 +1012,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
- __ceph_remove_cap(cap);
+ __ceph_remove_cap(cap, false);
if (!__ceph_is_any_real_caps(ci)) {
struct ceph_mds_client *mdsc =
ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1132,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
return 0;
}
+static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session, u64 seq)
+{
+ struct ceph_msg *msg;
+
+ dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
+ session->s_mds, session_state_name(session->s_state), seq);
+ msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+ if (!msg)
+ return -ENOMEM;
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+
/*
* Note new cap ttl, and any transition from stale -> not stale (fresh?).
*
@@ -1210,7 +1251,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
{
struct ceph_mds_session *session = arg;
struct ceph_inode_info *ci = ceph_inode(inode);
- int used, oissued, mine;
+ int used, wanted, oissued, mine;
if (session->s_trim_caps <= 0)
return -1;
@@ -1218,22 +1259,25 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
spin_lock(&ci->i_ceph_lock);
mine = cap->issued | cap->implemented;
used = __ceph_caps_used(ci);
+ wanted = __ceph_caps_file_wanted(ci);
oissued = __ceph_caps_issued_other(ci, cap);
- dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
+ dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
- ceph_cap_string(used));
- if (ci->i_dirty_caps)
- goto out; /* dirty caps */
- if ((used & ~oissued) & mine)
+ ceph_cap_string(used), ceph_cap_string(wanted));
+ if (cap == ci->i_auth_cap) {
+ if (ci->i_dirty_caps | ci->i_flushing_caps)
+ goto out;
+ if ((used | wanted) & CEPH_CAP_ANY_WR)
+ goto out;
+ }
+ if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
session->s_trim_caps--;
if (oissued) {
/* we aren't the only cap.. just remove us */
- __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
- cap->mseq, cap->issue_seq);
- __ceph_remove_cap(cap);
+ __ceph_remove_cap(cap, true);
} else {
/* try to drop referring dentries */
spin_unlock(&ci->i_ceph_lock);
@@ -1267,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
trim_caps - session->s_trim_caps);
session->s_trim_caps = 0;
}
+
+ ceph_add_cap_releases(mdsc, session);
+ ceph_send_cap_releases(mdsc, session);
return 0;
}
@@ -1416,17 +1463,19 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
unsigned num;
dout("discard_cap_releases mds%d\n", session->s_mds);
- spin_lock(&session->s_cap_lock);
- /* zero out the in-progress message */
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- session->s_num_cap_releases += num;
+ if (!list_empty(&session->s_cap_releases)) {
+ /* zero out the in-progress message */
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+ head = msg->front.iov_base;
+ num = le32_to_cpu(head->num);
+ dout("discard_cap_releases mds%d %p %u\n",
+ session->s_mds, msg, num);
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+ session->s_num_cap_releases += num;
+ }
/* requeue completed messages */
while (!list_empty(&session->s_cap_releases_done)) {
@@ -1443,14 +1492,49 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
msg->front.iov_len = sizeof(*head);
list_add(&msg->list_head, &session->s_cap_releases);
}
-
- spin_unlock(&session->s_cap_lock);
}
/*
* requests
*/
+int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
+ size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
+ sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
+ int order, num_entries;
+
+ spin_lock(&ci->i_ceph_lock);
+ num_entries = ci->i_files + ci->i_subdirs;
+ spin_unlock(&ci->i_ceph_lock);
+ num_entries = max(num_entries, 1);
+ num_entries = min(num_entries, opt->max_readdir);
+
+ order = get_order(size * num_entries);
+ while (order >= 0) {
+ rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
+ order);
+ if (rinfo->dir_in)
+ break;
+ order--;
+ }
+ if (!rinfo->dir_in)
+ return -ENOMEM;
+
+ num_entries = (PAGE_SIZE << order) / size;
+ num_entries = min(num_entries, opt->max_readdir);
+
+ rinfo->dir_buf_size = PAGE_SIZE << order;
+ req->r_num_caps = num_entries + 1;
+ req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
+ req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
+ return 0;
+}
+
/*
* Create an mds request.
*/
@@ -1474,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
+ req->r_stamp = CURRENT_TIME;
+
req->r_op = op;
req->r_direct_mode = mode;
return req;
@@ -1699,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
}
len = sizeof(*head) +
- pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
+ pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
+ sizeof(struct timespec);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -1716,6 +1803,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
goto out_free2;
}
+ msg->hdr.version = 2;
msg->hdr.tid = cpu_to_le64(req->r_tid);
head = msg->front.iov_base;
@@ -1752,6 +1840,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
head->num_releases = cpu_to_le16(releases);
+ /* time stamp */
+ ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+
BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -1875,8 +1966,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
int mds = -1;
int err = -EAGAIN;
- if (req->r_err || req->r_got_result)
+ if (req->r_err || req->r_got_result) {
+ if (req->r_aborted)
+ __unregister_request(mdsc, req);
goto out;
+ }
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
@@ -2009,7 +2103,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
if (req->r_locked_dir)
ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
- if (req->r_old_dentry)
+ if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
@@ -2131,13 +2225,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* dup? */
if ((req->r_got_unsafe && !head->safe) ||
(req->r_got_safe && head->safe)) {
- pr_warning("got a dup %s reply on %llu from mds%d\n",
+ pr_warn("got a dup %s reply on %llu from mds%d\n",
head->safe ? "safe" : "unsafe", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
if (req->r_got_safe && !head->safe) {
- pr_warning("got unsafe after safe on %llu from mds%d\n",
+ pr_warn("got unsafe after safe on %llu from mds%d\n",
tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
@@ -2154,26 +2248,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
*/
if (result == -ESTALE) {
dout("got ESTALE on request %llu", req->r_tid);
- if (!req->r_inode) {
- /* do nothing; not an authority problem */
- } else if (req->r_direct_mode != USE_AUTH_MDS) {
+ if (req->r_direct_mode != USE_AUTH_MDS) {
dout("not using auth, setting for that now");
req->r_direct_mode = USE_AUTH_MDS;
__do_request(mdsc, req);
mutex_unlock(&mdsc->mutex);
goto out;
} else {
- struct ceph_inode_info *ci = ceph_inode(req->r_inode);
- struct ceph_cap *cap = NULL;
-
- if (req->r_session)
- cap = ceph_get_cap_for_mds(ci,
- req->r_session->s_mds);
-
- dout("already using auth");
- if ((!cap || cap != ci->i_auth_cap) ||
- (cap->mseq != req->r_sent_on_mseq)) {
- dout("but cap changed, so resending");
+ int mds = __choose_mds(mdsc, req);
+ if (mds >= 0 && mds != req->r_session->s_mds) {
+ dout("but auth changed, so resending");
__do_request(mdsc, req);
mutex_unlock(&mdsc->mutex);
goto out;
@@ -2186,7 +2270,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (head->safe) {
req->r_got_safe = true;
__unregister_request(mdsc, req);
- complete_all(&req->r_safe_completion);
if (req->r_got_unsafe) {
/*
@@ -2238,8 +2321,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
- req->r_op == CEPH_MDS_OP_LSSNAP) &&
- rinfo->dir_nr)
+ req->r_op == CEPH_MDS_OP_LSSNAP))
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
@@ -2400,6 +2482,10 @@ static void handle_session(struct ceph_mds_session *session,
trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
break;
+ case CEPH_SESSION_FLUSHMSG:
+ send_flushmsg_ack(mdsc, session, seq);
+ break;
+
default:
pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
WARN_ON(1);
@@ -2490,6 +2576,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
cap->mseq = 0; /* and migrate_seq */
+ cap->cap_gen = cap->session->s_cap_gen;
if (recon_state->flock) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -2552,6 +2639,8 @@ encode_again:
} else {
err = ceph_pagelist_append(pagelist, &rec, reclen);
}
+
+ recon_state->nr_caps++;
out_free:
kfree(path);
out_dput:
@@ -2579,6 +2668,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct rb_node *p;
int mds = session->s_mds;
int err = -ENOMEM;
+ int s_nr_caps;
struct ceph_pagelist *pagelist;
struct ceph_reconnect_state recon_state;
@@ -2610,20 +2700,38 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
dout("session %p state %s\n", session,
session_state_name(session->s_state));
+ spin_lock(&session->s_gen_ttl_lock);
+ session->s_cap_gen++;
+ spin_unlock(&session->s_gen_ttl_lock);
+
+ spin_lock(&session->s_cap_lock);
+ /*
+ * notify __ceph_remove_cap() that we are composing cap reconnect.
+ * If a cap get released before being added to the cap reconnect,
+ * __ceph_remove_cap() should skip queuing cap release.
+ */
+ session->s_cap_reconnect = 1;
/* drop old cap expires; we're about to reestablish that state */
discard_cap_releases(mdsc, session);
+ spin_unlock(&session->s_cap_lock);
/* traverse this session's caps */
- err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
+ s_nr_caps = session->s_nr_caps;
+ err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
if (err)
goto fail;
+ recon_state.nr_caps = 0;
recon_state.pagelist = pagelist;
recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
if (err < 0)
goto fail;
+ spin_lock(&session->s_cap_lock);
+ session->s_cap_reconnect = 0;
+ spin_unlock(&session->s_cap_lock);
+
/*
* snaprealms. we provide mds with the ino, seq (version), and
* parent for all of our realms. If the mds has any newer info,
@@ -2646,11 +2754,18 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
if (recon_state.flock)
reply->hdr.version = cpu_to_le16(2);
- if (pagelist->length) {
- /* set up outbound data if we have any */
- reply->hdr.data_len = cpu_to_le32(pagelist->length);
- ceph_msg_data_add_pagelist(reply, pagelist);
+
+ /* raced with cap release? */
+ if (s_nr_caps != recon_state.nr_caps) {
+ struct page *page = list_first_entry(&pagelist->head,
+ struct page, lru);
+ __le32 *addr = kmap_atomic(page);
+ *addr = cpu_to_le32(recon_state.nr_caps);
+ kunmap_atomic(addr);
}
+
+ reply->hdr.data_len = cpu_to_le32(pagelist->length);
+ ceph_msg_data_add_pagelist(reply, pagelist);
ceph_con_send(&session->s_con, reply);
mutex_unlock(&session->s_mutex);
@@ -3417,7 +3532,7 @@ static void peer_reset(struct ceph_connection *con)
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- pr_warning("mds%d closed our session\n", s->s_mds);
+ pr_warn("mds%d closed our session\n", s->s_mds);
send_mds_reconnect(mdsc, s);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c2a19fbbe51..e00737cf523 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -67,6 +67,7 @@ struct ceph_mds_reply_info_parsed {
/* for readdir results */
struct {
struct ceph_mds_reply_dirfrag *dir_dir;
+ size_t dir_buf_size;
int dir_nr;
char **dir_dname;
u32 *dir_dname_len;
@@ -132,6 +133,7 @@ struct ceph_mds_session {
struct list_head s_caps; /* all caps issued by this session */
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
+ int s_cap_reconnect;
struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator;
@@ -192,6 +194,7 @@ struct ceph_mds_request {
int r_fmode; /* file mode, if expecting cap */
kuid_t r_uid;
kgid_t r_gid;
+ struct timespec r_stamp;
/* for choosing which mds to send this request to */
int r_direct_mode;
@@ -345,7 +348,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
struct dentry *dn);
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
-
+extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir);
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
@@ -382,6 +386,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
struct ceph_msg *msg);
+extern struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 132b64eeecd..261531e55e9 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -62,7 +62,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_decode_16_safe(p, end, version, bad);
if (version > 3) {
- pr_warning("got mdsmap version %d > 3, failing", version);
+ pr_warn("got mdsmap version %d > 3, failing", version);
goto bad;
}
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 89fa4a940a0..51cc23e4811 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op)
case CEPH_SESSION_RENEWCAPS: return "renewcaps";
case CEPH_SESSION_STALE: return "stale";
case CEPH_SESSION_RECALL_STATE: return "recall_state";
+ case CEPH_SESSION_FLUSHMSG: return "flushmsg";
+ case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
}
return "???";
}
@@ -52,6 +54,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
+ case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6a0951e4304..06150fd745a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -144,7 +144,11 @@ enum {
Opt_ino32,
Opt_noino32,
Opt_fscache,
- Opt_nofscache
+ Opt_nofscache,
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ Opt_acl,
+#endif
+ Opt_noacl
};
static match_table_t fsopt_tokens = {
@@ -172,6 +176,10 @@ static match_table_t fsopt_tokens = {
{Opt_noino32, "noino32"},
{Opt_fscache, "fsc"},
{Opt_nofscache, "nofsc"},
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ {Opt_acl, "acl"},
+#endif
+ {Opt_noacl, "noacl"},
{-1, NULL}
};
@@ -271,6 +279,14 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
break;
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ case Opt_acl:
+ fsopt->sb_flags |= MS_POSIXACL;
+ break;
+#endif
+ case Opt_noacl:
+ fsopt->sb_flags &= ~MS_POSIXACL;
+ break;
default:
BUG_ON(token);
}
@@ -438,6 +454,13 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
else
seq_puts(m, ",nofsc");
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ if (fsopt->sb_flags & MS_POSIXACL)
+ seq_puts(m, ",acl");
+ else
+ seq_puts(m, ",noacl");
+#endif
+
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -490,10 +513,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
- const unsigned supported_features =
+ const u64 supported_features =
CEPH_FEATURE_FLOCK |
CEPH_FEATURE_DIRLAYOUTHASH;
- const unsigned required_features = 0;
+ const u64 required_features = 0;
int page_count;
size_t size;
int err = -ENOMEM;
@@ -686,6 +709,7 @@ static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
.destroy_inode = ceph_destroy_inode,
.write_inode = ceph_write_inode,
+ .drop_inode = ceph_drop_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.show_options = ceph_show_options,
@@ -819,6 +843,7 @@ static int ceph_set_super(struct super_block *s, void *data)
s->s_flags = fsc->mount_options->sb_flags;
s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
+ s->s_xattr = ceph_xattr_handlers;
s->s_fs_info = fsc;
fsc->sb = s;
@@ -906,6 +931,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
struct ceph_options *opt = NULL;
dout("ceph_mount\n");
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ flags |= MS_POSIXACL;
+#endif
err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
if (err < 0) {
res = ERR_PTR(err);
@@ -997,6 +1026,7 @@ static int __init init_ceph(void)
if (ret)
goto out;
+ ceph_flock_init();
ceph_xattr_init();
ret = register_filesystem(&ceph_fs_type);
if (ret)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6014b0a3c40..12b20744e38 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -13,6 +13,7 @@
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/slab.h>
+#include <linux/posix_acl.h>
#include <linux/ceph/libceph.h>
@@ -265,7 +266,6 @@ struct ceph_inode_info {
struct timespec i_rctime;
u64 i_rbytes, i_rfiles, i_rsubdirs;
u64 i_files, i_subdirs;
- u64 i_max_offset; /* largest readdir offset, set with complete dir */
struct rb_root i_fragtree;
struct mutex i_fragtree_mutex;
@@ -287,9 +287,6 @@ struct ceph_inode_info {
unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
- int i_cap_exporting_mds; /* to handle cap migration between */
- unsigned i_cap_exporting_mseq; /* mds's. */
- unsigned i_cap_exporting_issued;
struct ceph_cap_reservation i_cap_migration_resv;
struct list_head i_cap_snaps; /* snapped state pending flush to mds */
struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
@@ -335,7 +332,6 @@ struct ceph_inode_info {
u32 i_fscache_gen; /* sequence, for delayed fscache validate */
struct work_struct i_revalidate_work;
#endif
-
struct inode vfs_inode; /* at end */
};
@@ -529,6 +525,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
}
extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask);
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);
@@ -577,7 +575,7 @@ struct ceph_file_info {
/* readdir: position within a frag */
unsigned offset; /* offset of last chunk, adjusted for . and .. */
- u64 next_offset; /* offset of next chunk (last_name's + 1) */
+ unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
struct dentry *dentry; /* next dentry (for dcache readdir) */
int dir_release_count;
@@ -691,6 +689,7 @@ extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_destroy_inode(struct inode *inode);
+extern int ceph_drop_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino);
@@ -724,6 +723,9 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
/* xattr.c */
extern int ceph_setxattr(struct dentry *, const char *, const void *,
size_t, int);
+int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int);
+ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
+int __ceph_removexattr(struct dentry *, const char *);
extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern int ceph_removexattr(struct dentry *, const char *);
@@ -732,24 +734,57 @@ extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
extern void __init ceph_xattr_init(void);
extern void ceph_xattr_exit(void);
+/* acl.c */
+extern const struct xattr_handler *ceph_xattr_handlers[];
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+
+struct posix_acl *ceph_get_acl(struct inode *, int);
+int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ceph_init_acl(struct dentry *, struct inode *, struct inode *);
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+ forget_all_cached_acls(inode);
+}
+
+#else
+
+#define ceph_get_acl NULL
+#define ceph_set_acl NULL
+
+static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode,
+ struct inode *dir)
+{
+ return 0;
+}
+
+static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
+{
+ return 0;
+}
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+}
+
+#endif
+
/* caps.c */
extern const char *ceph_cap_string(int c);
extern void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg);
-extern int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned cap, unsigned seq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation);
-extern void __ceph_remove_cap(struct ceph_cap *cap);
-static inline void ceph_remove_cap(struct ceph_cap *cap)
-{
- spin_lock(&cap->ci->i_ceph_lock);
- __ceph_remove_cap(cap);
- spin_unlock(&cap->ci->i_ceph_lock);
-}
+extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx);
+extern void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned cap, unsigned seq, u64 realmino, int flags,
+ struct ceph_cap **new_cap);
+extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
+extern int ceph_is_any_caps(struct inode *inode);
extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
u64 cap_id, u32 migrate_seq, u32 issue_seq);
@@ -836,6 +871,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
extern const struct export_operations ceph_export_ops;
/* locks.c */
+extern __init void ceph_flock_init(void);
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index be661d8f532..c9c2b887381 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -6,16 +6,33 @@
#include <linux/ceph/decode.h>
#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/slab.h>
#define XATTR_CEPH_PREFIX "ceph."
#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+static int __remove_xattr(struct ceph_inode_info *ci,
+ struct ceph_inode_xattr *xattr);
+
+/*
+ * List of handlers for synthetic system.* attributes. Other
+ * attributes are handled directly.
+ */
+const struct xattr_handler *ceph_xattr_handlers[] = {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ NULL,
+};
+
static bool ceph_is_valid_xattr(const char *name)
{
return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
!strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN) ||
+ !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
@@ -47,32 +64,48 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
}
static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
- size_t size)
+ size_t size)
{
int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
const char *pool_name;
+ char buf[128];
dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
down_read(&osdc->map_sem);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
- if (pool_name)
- ret = snprintf(val, size,
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
+ if (pool_name) {
+ size_t len = strlen(pool_name);
+ ret = snprintf(buf, sizeof(buf),
+ "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
(unsigned long long)ceph_file_layout_su(ci->i_layout),
(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
- pool_name);
- else
- ret = snprintf(val, size,
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ if (!size) {
+ ret += len;
+ } else if (ret + len > size) {
+ ret = -ERANGE;
+ } else {
+ memcpy(val, buf, ret);
+ memcpy(val + ret, pool_name, len);
+ ret += len;
+ }
+ } else {
+ ret = snprintf(buf, sizeof(buf),
"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
(unsigned long long)ceph_file_layout_su(ci->i_layout),
(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
(unsigned long long)ceph_file_layout_object_size(ci->i_layout),
(unsigned long long)pool);
-
+ if (size) {
+ if (ret <= size)
+ memcpy(val, buf, ret);
+ else
+ ret = -ERANGE;
+ }
+ }
up_read(&osdc->map_sem);
return ret;
}
@@ -198,7 +231,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
.name_size = sizeof("ceph.dir.layout"),
.getxattr_cb = ceph_vxattrcb_layout,
.readonly = false,
- .hidden = false,
+ .hidden = true,
.exists_cb = ceph_vxattrcb_layout_exists,
},
XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
@@ -225,7 +258,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
.name_size = sizeof("ceph.file.layout"),
.getxattr_cb = ceph_vxattrcb_layout,
.readonly = false,
- .hidden = false,
+ .hidden = true,
.exists_cb = ceph_vxattrcb_layout_exists,
},
XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
@@ -305,8 +338,7 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
static int __set_xattr(struct ceph_inode_info *ci,
const char *name, int name_len,
const char *val, int val_len,
- int dirty,
- int should_free_name, int should_free_val,
+ int flags, int update_xattr,
struct ceph_inode_xattr **newxattr)
{
struct rb_node **p;
@@ -335,12 +367,31 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr = NULL;
}
+ if (update_xattr) {
+ int err = 0;
+ if (xattr && (flags & XATTR_CREATE))
+ err = -EEXIST;
+ else if (!xattr && (flags & XATTR_REPLACE))
+ err = -ENODATA;
+ if (err) {
+ kfree(name);
+ kfree(val);
+ return err;
+ }
+ if (update_xattr < 0) {
+ if (xattr)
+ __remove_xattr(ci, xattr);
+ kfree(name);
+ return 0;
+ }
+ }
+
if (!xattr) {
new = 1;
xattr = *newxattr;
xattr->name = name;
xattr->name_len = name_len;
- xattr->should_free_name = should_free_name;
+ xattr->should_free_name = update_xattr;
ci->i_xattrs.count++;
dout("__set_xattr count=%d\n", ci->i_xattrs.count);
@@ -350,7 +401,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (xattr->should_free_val)
kfree((void *)xattr->val);
- if (should_free_name) {
+ if (update_xattr) {
kfree((void *)name);
name = xattr->name;
}
@@ -365,8 +416,8 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr->val = "";
xattr->val_len = val_len;
- xattr->dirty = dirty;
- xattr->should_free_val = (val && should_free_val);
+ xattr->dirty = update_xattr;
+ xattr->should_free_val = (val && update_xattr);
if (new) {
rb_link_node(&xattr->node, parent, p);
@@ -428,7 +479,7 @@ static int __remove_xattr(struct ceph_inode_info *ci,
struct ceph_inode_xattr *xattr)
{
if (!xattr)
- return -EOPNOTSUPP;
+ return -ENODATA;
rb_erase(&xattr->node, &ci->i_xattrs.index);
@@ -574,7 +625,7 @@ start:
p += len;
err = __set_xattr(ci, name, namelen, val, len,
- 0, 0, 0, &xattrs[numattr]);
+ 0, 0, &xattrs[numattr]);
if (err < 0)
goto bad;
@@ -663,10 +714,9 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
}
}
-ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
int err;
struct ceph_inode_xattr *xattr;
@@ -675,7 +725,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!ceph_is_valid_xattr(name))
return -ENODATA;
-
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
@@ -725,6 +774,15 @@ out:
return err;
}
+ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ size_t size)
+{
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_getxattr(dentry, name, value, size);
+
+ return __ceph_getxattr(dentry->d_inode, name, value, size);
+}
+
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{
struct inode *inode = dentry->d_inode;
@@ -800,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
int err;
@@ -829,6 +886,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
dout("setxattr value=%.*s\n", (int)size, value);
+ if (!value)
+ flags |= CEPH_XATTR_REMOVE;
+
/* do request */
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
USE_AUTH_MDS);
@@ -848,9 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
req->r_data_len = size;
dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
@@ -863,15 +921,15 @@ out:
return err;
}
-int ceph_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int __ceph_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
int issued;
int err;
- int dirty;
+ int dirty = 0;
int name_len = strlen(name);
int val_len = size;
char *newname = NULL;
@@ -879,9 +937,6 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
struct ceph_inode_xattr *xattr = NULL;
int required_blob_size;
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
@@ -935,12 +990,14 @@ retry:
goto retry;
}
- err = __set_xattr(ci, newname, name_len, newval,
- val_len, 1, 1, 1, &xattr);
+ err = __set_xattr(ci, newname, name_len, newval, val_len,
+ flags, value ? 1 : -1, &xattr);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
- ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
+ if (!err) {
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+ ci->i_xattrs.dirty = true;
+ inode->i_ctime = CURRENT_TIME;
+ }
spin_unlock(&ci->i_ceph_lock);
if (dirty)
@@ -958,12 +1015,23 @@ out:
return err;
}
+int ceph_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_setxattr(dentry, name, value, size, flags);
+
+ return __ceph_setxattr(dentry, name, value, size, flags);
+}
+
static int ceph_send_removexattr(struct dentry *dentry, const char *name)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = dentry->d_inode;
- struct inode *parent_inode;
struct ceph_mds_request *req;
int err;
@@ -977,14 +1045,12 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
req->r_num_caps = 1;
req->r_path2 = kstrdup(name, GFP_NOFS);
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
}
-int ceph_removexattr(struct dentry *dentry, const char *name)
+int __ceph_removexattr(struct dentry *dentry, const char *name)
{
struct inode *inode = dentry->d_inode;
struct ceph_vxattr *vxattr;
@@ -994,9 +1060,6 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
int required_blob_size;
int dirty;
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
@@ -1053,3 +1116,13 @@ out:
return err;
}
+int ceph_removexattr(struct dentry *dentry, const char *name)
+{
+ if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_removexattr(dentry, name);
+
+ return __ceph_removexattr(dentry, name);
+}