diff options
Diffstat (limited to 'fs/ceph')
| -rw-r--r-- | fs/ceph/Kconfig | 26 | ||||
| -rw-r--r-- | fs/ceph/Makefile | 2 | ||||
| -rw-r--r-- | fs/ceph/acl.c | 194 | ||||
| -rw-r--r-- | fs/ceph/addr.c | 512 | ||||
| -rw-r--r-- | fs/ceph/cache.c | 402 | ||||
| -rw-r--r-- | fs/ceph/cache.h | 182 | ||||
| -rw-r--r-- | fs/ceph/caps.c | 777 | ||||
| -rw-r--r-- | fs/ceph/debugfs.c | 12 | ||||
| -rw-r--r-- | fs/ceph/dir.c | 329 | ||||
| -rw-r--r-- | fs/ceph/export.c | 253 | ||||
| -rw-r--r-- | fs/ceph/file.c | 974 | ||||
| -rw-r--r-- | fs/ceph/inode.c | 704 | ||||
| -rw-r--r-- | fs/ceph/ioctl.c | 155 | ||||
| -rw-r--r-- | fs/ceph/ioctl.h | 2 | ||||
| -rw-r--r-- | fs/ceph/locks.c | 170 | ||||
| -rw-r--r-- | fs/ceph/mds_client.c | 588 | ||||
| -rw-r--r-- | fs/ceph/mds_client.h | 23 | ||||
| -rw-r--r-- | fs/ceph/mdsmap.c | 62 | ||||
| -rw-r--r-- | fs/ceph/snap.c | 23 | ||||
| -rw-r--r-- | fs/ceph/strings.c | 7 | ||||
| -rw-r--r-- | fs/ceph/super.c | 155 | ||||
| -rw-r--r-- | fs/ceph/super.h | 187 | ||||
| -rw-r--r-- | fs/ceph/xattr.c | 508 |
23 files changed, 4301 insertions, 1946 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 9eb134ea6eb..264e9bf83ff 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -1,6 +1,6 @@ config CEPH_FS - tristate "Ceph distributed file system (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "Ceph distributed file system" + depends on INET select CEPH_LIB select LIBCRC32C select CRYPTO_AES @@ -16,3 +16,25 @@ config CEPH_FS If unsure, say N. +if CEPH_FS +config CEPH_FSCACHE + bool "Enable Ceph client caching support" + depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y + help + Choose Y here to enable persistent, read-only local + caching support for Ceph clients using FS-Cache + +endif + +config CEPH_FS_POSIX_ACL + bool "Ceph POSIX Access Control Lists" + depends on CEPH_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index bd352125e82..85a4230b9bf 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -9,3 +9,5 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o +ceph-$(CONFIG_CEPH_FSCACHE) += cache.o +ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c new file mode 100644 index 00000000000..469f2e8657e --- /dev/null +++ b/fs/ceph/acl.c @@ -0,0 +1,194 @@ +/* + * linux/fs/ceph/acl.c + * + * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include "super.h" + +static inline void ceph_set_cached_acl(struct inode *inode, + int type, struct posix_acl *acl) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + set_cached_acl(inode, type, acl); + spin_unlock(&ci->i_ceph_lock); +} + +static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, + int type) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct posix_acl *acl = ACL_NOT_CACHED; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + acl = get_cached_acl(inode, type); + spin_unlock(&ci->i_ceph_lock); + + return acl; +} + +struct posix_acl *ceph_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __ceph_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __ceph_getxattr(inode, name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ERANGE || size == -ENODATA || size == 0) + acl = NULL; + else + acl = ERR_PTR(-EIO); + + kfree(value); + + if (!IS_ERR(acl)) + ceph_set_cached_acl(inode, type, acl); + + return acl; +} + +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret = 0, size = 0; + const char *name = NULL; + char *value = NULL; + struct iattr newattrs; + umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + struct dentry *dentry; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &new_mode); + if (ret < 0) + goto out; + if (ret == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + ret = acl ? -EINVAL : 0; + goto out; + } + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + ret = -EINVAL; + goto out; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) + goto out_free; + } + + dentry = d_find_alias(inode); + if (new_mode != old_mode) { + newattrs.ia_mode = new_mode; + newattrs.ia_valid = ATTR_MODE; + ret = ceph_setattr(dentry, &newattrs); + if (ret) + goto out_dput; + } + + ret = __ceph_setxattr(dentry, name, value, size, 0); + if (ret) { + if (new_mode != old_mode) { + newattrs.ia_mode = old_mode; + newattrs.ia_valid = ATTR_MODE; + ceph_setattr(dentry, &newattrs); + } + goto out_dput; + } + + ceph_set_cached_acl(inode, type, acl); + +out_dput: + dput(dentry); +out_free: + kfree(value); +out: + return ret; +} + +int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + if (!default_acl && !acl) + cache_no_acl(inode); + + if (default_acl) { + error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + return error; +} diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 173b1d22e59..90b3954d48e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -11,6 +11,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/osd_client.h> /* @@ -54,7 +55,12 @@ (CONGESTION_ON_THRESH(congestion_kb) - \ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) - +static inline struct ceph_snap_context *page_snap_context(struct page *page) +{ + if (PagePrivate(page)) + return (void *)page->private; + return NULL; +} /* * Dirty a page. Optimistically adjust accounting, on the assumption @@ -65,15 +71,16 @@ static int ceph_set_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct inode *inode; struct ceph_inode_info *ci; - int undo = 0; struct ceph_snap_context *snapc; + int ret; if (unlikely(!mapping)) return !TestSetPageDirty(page); - if (TestSetPageDirty(page)) { + if (PageDirty(page)) { dout("%p set_page_dirty %p idx %lu -- already dirty\n", mapping->host, page, page->index); + BUG_ON(!PagePrivate(page)); return 0; } @@ -102,35 +109,19 @@ static int ceph_set_page_dirty(struct page *page) snapc, snapc->seq, snapc->num_snaps); spin_unlock(&ci->i_ceph_lock); - /* now adjust page */ - spin_lock_irq(&mapping->tree_lock); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, page->mapping); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - - /* - * Reference snap context in page->private. Also set - * PagePrivate so that we get invalidatepage callback. - */ - page->private = (unsigned long)snapc; - SetPagePrivate(page); - } else { - dout("ANON set_page_dirty %p (raced truncate?)\n", page); - undo = 1; - } - - spin_unlock_irq(&mapping->tree_lock); - - if (undo) - /* whoops, we failed to dirty the page */ - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + /* + * Reference snap context in page->private. Also set + * PagePrivate so that we get invalidatepage callback. + */ + BUG_ON(PagePrivate(page)); + page->private = (unsigned long)snapc; + SetPagePrivate(page); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + ret = __set_page_dirty_nobuffers(page); + WARN_ON(!PageLocked(page)); + WARN_ON(!page->mapping); - BUG_ON(!PageDirty(page)); - return 1; + return ret; } /* @@ -138,18 +129,26 @@ static int ceph_set_page_dirty(struct page *page) * dirty page counters appropriately. Only called if there is private * data on the page. */ -static void ceph_invalidatepage(struct page *page, unsigned long offset) +static void ceph_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode; struct ceph_inode_info *ci; - struct ceph_snap_context *snapc = (void *)page->private; - - BUG_ON(!PageLocked(page)); - BUG_ON(!page->private); - BUG_ON(!PagePrivate(page)); - BUG_ON(!page->mapping); + struct ceph_snap_context *snapc = page_snap_context(page); inode = page->mapping->host; + ci = ceph_inode(inode); + + if (offset != 0 || length != PAGE_CACHE_SIZE) { + dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", + inode, page, page->index, offset, length); + return; + } + + ceph_invalidate_fscache_page(inode, page); + + if (!PagePrivate(page)) + return; /* * We can get non-dirty pages here due to races between @@ -159,32 +158,28 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset) if (!PageDirty(page)) pr_err("%p invalidatepage %p page not dirty\n", inode, page); - if (offset == 0) - ClearPageChecked(page); + ClearPageChecked(page); - ci = ceph_inode(inode); - if (offset == 0) { - dout("%p invalidatepage %p idx %lu full dirty page %lu\n", - inode, page, page->index, offset); - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); - ceph_put_snap_context(snapc); - page->private = 0; - ClearPagePrivate(page); - } else { - dout("%p invalidatepage %p idx %lu partial dirty page\n", - inode, page, page->index); - } + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); + + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + page->private = 0; + ClearPagePrivate(page); } -/* just a sanity check */ static int ceph_releasepage(struct page *page, gfp_t g) { struct inode *inode = page->mapping ? page->mapping->host : NULL; dout("%p releasepage %p idx %lu\n", inode, page, page->index); WARN_ON(PageDirty(page)); - WARN_ON(page->private); - WARN_ON(PagePrivate(page)); - return 0; + + /* Can we release the page from the cache? */ + if (!ceph_release_fscache_page(page, g)) + return 0; + + return !PagePrivate(page); } /* @@ -192,29 +187,39 @@ static int ceph_releasepage(struct page *page, gfp_t g) */ static int readpage_nounlock(struct file *filp, struct page *page) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = + struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; int err = 0; u64 len = PAGE_CACHE_SIZE; + err = ceph_readpage_from_fscache(inode, page); + + if (err == 0) + goto out; + dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - page->index << PAGE_CACHE_SHIFT, &len, + (u64) page_offset(page), &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) err = 0; if (err < 0) { SetPageError(page); + ceph_fscache_readpage_cancel(inode, page); goto out; - } else if (err < PAGE_CACHE_SIZE) { + } + if (err < PAGE_CACHE_SIZE) /* zero fill remainder of page */ zero_user_segment(page, err, PAGE_CACHE_SIZE); - } + else + flush_dcache_page(page); + SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); out: return err < 0 ? err : 0; @@ -233,22 +238,24 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; - struct ceph_osd_reply_head *replyhead; - int rc, bytes; + struct ceph_osd_data *osd_data; + int rc = req->r_result; + int bytes = le32_to_cpu(msg->hdr.data_len); + int num_pages; int i; - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - rc = le32_to_cpu(replyhead->result); - bytes = le32_to_cpu(msg->hdr.data_len); - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ - for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { - struct page *page = req->r_pages[i]; + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + for (i = 0; i < num_pages; i++) { + struct page *page = osd_data->pages[i]; + if (rc < 0) + goto unlock; if (bytes < (int)PAGE_CACHE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; @@ -258,10 +265,21 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) page->index); flush_dcache_page(page); SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); +unlock: unlock_page(page); page_cache_release(page); + bytes -= PAGE_CACHE_SIZE; } - kfree(req->r_pages); + kfree(osd_data->pages); +} + +static void ceph_unlock_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + unlock_page(pages[i]); } /* @@ -274,6 +292,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) &ceph_inode_to_client(inode)->client->osdc; struct ceph_inode_info *ci = ceph_inode(inode); struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_vino vino; struct ceph_osd_request *req; u64 off; u64 len; @@ -283,7 +302,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) int nr_pages = 0; int ret; - off = page->index << PAGE_CACHE_SHIFT; + off = (u64) page_offset(page); /* count pages */ next_index = page->index; @@ -298,18 +317,17 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) len = nr_pages << PAGE_CACHE_SHIFT; dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, off, len); - - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), - off, &len, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, + vino = ceph_vino(inode); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, + 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, - NULL, false, 1, 0); - if (!req) - return -ENOMEM; + false); + if (IS_ERR(req)) + return PTR_ERR(req); /* build page vector */ - nr_pages = len >> PAGE_CACHE_SHIFT; + nr_pages = calc_pages_for(0, len); pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); ret = -ENOMEM; if (!pages) @@ -318,11 +336,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) page = list_entry(page_list->prev, struct page, lru); BUG_ON(PageLocked(page)); list_del(&page->lru); - + dout("start_read %p adding %p idx %lu\n", inode, page, page->index); if (add_to_page_cache_lru(page, &inode->i_data, page->index, GFP_NOFS)) { + ceph_fscache_uncache_page(inode, page); page_cache_release(page); dout("start_read %p add_to_page_cache failed %p\n", inode, page); @@ -331,11 +350,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) } pages[i] = page; } - req->r_pages = pages; - req->r_num_pages = nr_pages; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); req->r_callback = finish_read; req->r_inode = inode; + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -344,6 +364,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) return nr_pages; out_pages: + ceph_unlock_page_vector(pages, nr_pages); ceph_release_page_vector(pages, nr_pages); out: ceph_osdc_put_request(req); @@ -358,16 +379,23 @@ out: static int ceph_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned nr_pages) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int rc = 0; int max = 0; + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, + &nr_pages); + + if (rc == 0) + goto out; + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, + dout("readpages %p file %p nr_pages %d max %d\n", inode, + file, nr_pages, max); while (!list_empty(page_list)) { rc = start_read(inode, page_list, max); @@ -376,6 +404,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, BUG_ON(rc == 0); } out: + ceph_fscache_readpages_cancel(inode, page_list); + dout("readpages %p file %p ret %d\n", inode, file, rc); return rc; } @@ -423,13 +453,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_inode_info *ci; struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; - loff_t page_off = page->index << PAGE_CACHE_SHIFT; - int len = PAGE_CACHE_SIZE; - loff_t i_size; - int err = 0; struct ceph_snap_context *snapc, *oldest; - u64 snap_size = 0; + loff_t page_off = page_offset(page); long writeback_stat; + u64 truncate_size, snap_size = 0; + u32 truncate_seq; + int err = 0, len = PAGE_CACHE_SIZE; dout("writepage %p idx %lu\n", page, page->index); @@ -443,7 +472,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) osdc = &fsc->client->osdc; /* verify this is a writeable snap context */ - snapc = (void *)page->private; + snapc = page_snap_context(page); if (snapc == NULL) { dout("writepage %p page %p not dirty?\n", inode, page); goto out; @@ -451,7 +480,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) oldest = get_oldest_context(inode, &snap_size); if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", - inode, page, (void *)page->private); + inode, page, snapc); /* we should only noop if called by kswapd */ WARN_ON((current->flags & PF_MEMALLOC) == 0); ceph_put_snap_context(oldest); @@ -459,13 +488,20 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + /* is this a partial page at end of file? */ - if (snap_size) - i_size = snap_size; - else - i_size = i_size_read(inode); - if (i_size < page_off + len) - len = i_size - page_off; + if (page_off >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); + goto out; + } + if (snap_size < page_off + len) + len = snap_size - page_off; dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", inode, page, page->index, page_off, len, snapc); @@ -475,13 +511,14 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + ceph_readpage_to_fscache(inode, page); + set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, - ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, - &page, 1, 0, 0, true); + truncate_seq, truncate_size, + &inode->i_mtime, &page, 1); if (err < 0) { dout("writepage setting page/mapping error %d %p\n", err, page); SetPageError(page); @@ -531,7 +568,6 @@ static void ceph_release_pages(struct page **pages, int num) pagevec_release(&pvec); } - /* * async writeback completion handler. * @@ -542,27 +578,24 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; - struct ceph_osd_reply_head *replyhead; - struct ceph_osd_op *op; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_data *osd_data; unsigned wrote; struct page *page; + int num_pages; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; - __s32 rc = -EIO; - u64 bytes = 0; + int rc = req->r_result; + u64 bytes = req->r_ops[0].extent.length; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - op = (void *)(replyhead + 1); - rc = le32_to_cpu(replyhead->result); - bytes = le64_to_cpu(op->extent.length); - + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -570,7 +603,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. */ - wrote = req->r_num_pages; + wrote = num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -579,8 +612,8 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_num_pages; i++) { - page = req->r_pages[i]; + for (i = 0; i < num_pages; i++) { + page = osd_data->pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -591,7 +624,7 @@ static void writepages_finish(struct ceph_osd_request *req, clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); - ceph_put_snap_context((void *)page->private); + ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); dout("unlocking %d %p\n", i, page); @@ -609,35 +642,18 @@ static void writepages_finish(struct ceph_osd_request *req, unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); - ceph_release_pages(req->r_pages, req->r_num_pages); - if (req->r_pages_from_pool) - mempool_free(req->r_pages, + ceph_release_pages(osd_data->pages, num_pages); + if (osd_data->pages_from_pool) + mempool_free(osd_data->pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else - kfree(req->r_pages); + kfree(osd_data->pages); ceph_osdc_put_request(req); } /* - * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_num_pages - * may be less than the maximum write size. - */ -static void alloc_page_vec(struct ceph_fs_client *fsc, - struct ceph_osd_request *req) -{ - req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, - GFP_NOFS); - if (!req->r_pages) { - req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); - req->r_pages_from_pool = 1; - WARN_ON(!req->r_pages); - } -} - -/* * initiate async writeback */ static int ceph_writepages_start(struct address_space *mapping, @@ -645,7 +661,8 @@ static int ceph_writepages_start(struct address_space *mapping, { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start, end; int range_whole = 0; int should_loop = 1; @@ -657,24 +674,24 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync; - u64 snap_size = 0; + u64 truncate_size, snap_size; + u32 truncate_seq; /* * Include a 'sync' in the OSD request if this is a data * integrity write (e.g., O_SYNC write or fsync()), or if our * cap is being revoked. */ - do_sync = wbc->sync_mode == WB_SYNC_ALL; - if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) + if ((wbc->sync_mode == WB_SYNC_ALL) || + ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) do_sync = 1; dout("writepages_start %p dosync=%d (mode=%s)\n", inode, do_sync, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - fsc = ceph_inode_to_client(inode); if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { - pr_warning("writepage_start %p on forced umount\n", inode); + pr_warn("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! */ } if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) @@ -703,6 +720,7 @@ static int ceph_writepages_start(struct address_space *mapping, retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); + snap_size = 0; snapc = get_oldest_context(inode, &snap_size); if (!snapc) { /* hmm, why does writepages get called when there @@ -710,8 +728,18 @@ retry: dout(" no snap context with dirty data?\n"); goto out; } + if (snap_size == 0) + snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); + + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + if (last_snapc && snapc != last_snapc) { /* if we switched to a newer snapc, restart our scan at the * start of the original file range. */ @@ -722,15 +750,16 @@ retry: last_snapc = snapc; while (!done && index <= end) { + int num_ops = do_sync ? 2 : 1; unsigned i; int first; pgoff_t next; int pvec_pages, locked_pages; + struct page **pages = NULL; + mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; int want; u64 offset, len; - struct ceph_osd_request_head *reqhead; - struct ceph_osd_op *op; long writeback_stat; next = 0; @@ -779,11 +808,8 @@ get_more_pages: dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if ((snap_size && page_offset(page) > snap_size) || - (!snap_size && - page_offset(page) > i_size_read(inode))) { - dout("%p page eof %llu\n", page, snap_size ? - snap_size : i_size_read(inode)); + if (page_offset(page) >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); done = 1; unlock_page(page); break; @@ -795,7 +821,7 @@ get_more_pages: } /* only if matching snap context */ - pgsnapc = (void *)page->private; + pgsnapc = page_snap_context(page); if (pgsnapc->seq > snapc->seq) { dout("page snapc %p %lld > oldest %p %lld\n", pgsnapc, pgsnapc->seq, snapc, snapc->seq); @@ -811,35 +837,42 @@ get_more_pages: break; } - /* ok */ + /* + * We have something to write. If this is + * the first locked page this time through, + * allocate an osd request and a page array + * that it will use. + */ if (locked_pages == 0) { + BUG_ON(pages); /* prepare async write request */ - offset = (unsigned long long)page->index - << PAGE_CACHE_SHIFT; + offset = (u64)page_offset(page); len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, - ceph_vino(inode), - offset, &len, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, - snapc, do_sync, - ci->i_truncate_seq, - ci->i_truncate_size, - &inode->i_mtime, true, 1, 0); - - if (!req) { - rc = -ENOMEM; + &ci->i_layout, vino, + offset, &len, num_ops, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, truncate_seq, + truncate_size, true); + if (IS_ERR(req)) { + rc = PTR_ERR(req); unlock_page(page); break; } - max_pages = req->r_num_pages; - - alloc_page_vec(fsc, req); req->r_callback = writepages_finish; req->r_inode = inode; + + max_pages = calc_pages_for(0, (u64)len); + pages = kmalloc(max_pages * sizeof (*pages), + GFP_NOFS); + if (!pages) { + pool = fsc->wb_pagevec_pool; + pages = mempool_alloc(pool, GFP_NOFS); + BUG_ON(!pages); + } } /* note position of first page in pvec */ @@ -857,7 +890,7 @@ get_more_pages: } set_page_writeback(page); - req->r_pages[locked_pages] = page; + pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -886,20 +919,27 @@ get_more_pages: pvec.nr -= i-first; } - /* submit the write */ - offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; - len = min((snap_size ? snap_size : i_size_read(inode)) - offset, + /* Format the osd request message and submit the write */ + + offset = page_offset(pages[0]); + len = min(snap_size - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - /* revise final length, page count */ - req->r_num_pages = locked_pages; - reqhead = req->r_request->front.iov_base; - op = (void *)(reqhead + 1); - op->extent.length = cpu_to_le64(len); - op->payload_len = cpu_to_le32(len); - req->r_request->hdr.data_len = cpu_to_le32(len); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + !!pool, false); + + pages = NULL; /* request message now owns the pages array */ + pool = NULL; + + /* Update the write op length in case we changed it */ + + osd_req_op_extent_update(req, 0, len); + + vino = ceph_vino(inode); + ceph_osdc_build_request(req, offset, snapc, vino.snap, + &inode->i_mtime); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); @@ -966,7 +1006,7 @@ static int ceph_update_writeable_page(struct file *file, loff_t pos, unsigned len, struct page *page) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t page_off = pos & PAGE_CACHE_MASK; @@ -984,7 +1024,7 @@ retry_locked: BUG_ON(!ci->i_snap_realm); down_read(&mdsc->snap_rwsem); BUG_ON(!ci->i_snap_realm->cached_context); - snapc = (void *)page->private; + snapc = page_snap_context(page); if (snapc && snapc != ci->i_head_snapc) { /* * this page is already dirty in another (older) snap @@ -1075,7 +1115,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct page *page; pgoff_t index = pos >> PAGE_CACHE_SHIFT; int r; @@ -1105,7 +1145,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); @@ -1144,8 +1184,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, * never get called. */ static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t pos, unsigned long nr_segs) + struct iov_iter *iter, + loff_t pos) { WARN_ON(1); return -EINVAL; @@ -1168,27 +1208,83 @@ const struct address_space_operations ceph_aops = { /* * vm ops */ +static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; + int want, got, ret; + + dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", + inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); + + ret = filemap_fault(vma, vmf); + + dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + + return ret; +} /* * Reuse write_begin here for simplicity. */ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; - struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - loff_t off = page->index << PAGE_CACHE_SHIFT; - loff_t size, len; - int ret; + struct page *page = vmf->page; + loff_t off = page_offset(page); + loff_t size = i_size_read(inode); + size_t len; + int want, got, ret; - size = i_size_read(inode); if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; else len = size & ~PAGE_CACHE_MASK; - dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, - off, len, page, page->index); + dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", + inode, ceph_vinop(inode), off, len, size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", + inode, off, len, ceph_cap_string(got)); + + /* Update time before taking page lock */ + file_update_time(vma->vm_file); lock_page(page); @@ -1210,15 +1306,28 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); - if (ret != VM_FAULT_LOCKED) + if (ret != VM_FAULT_LOCKED) { unlock_page(page); + } else { + int dirty; + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, len, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + return ret; } static struct vm_operations_struct ceph_vmops = { - .fault = filemap_fault, + .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) @@ -1229,6 +1338,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ceph_vmops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c new file mode 100644 index 00000000000..834f9f3723f --- /dev/null +++ b/fs/ceph/cache.c @@ -0,0 +1,402 @@ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include "super.h" +#include "cache.h" + +struct ceph_aux_inode { + struct timespec mtime; + loff_t size; +}; + +struct fscache_netfs ceph_cache_netfs = { + .name = "ceph", + .version = 0, +}; + +static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct ceph_fs_client* fsc = cookie_netfs_data; + uint16_t klen; + + klen = sizeof(fsc->client->fsid); + if (klen > maxbuf) + return 0; + + memcpy(buffer, &fsc->client->fsid, klen); + return klen; +} + +static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { + .name = "CEPH.fsid", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = ceph_fscache_session_get_key, +}; + +int ceph_fscache_register(void) +{ + return fscache_register_netfs(&ceph_cache_netfs); +} + +void ceph_fscache_unregister(void) +{ + fscache_unregister_netfs(&ceph_cache_netfs); +} + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ + fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, + &ceph_fscache_fsid_object_def, + fsc, true); + + if (fsc->fscache == NULL) { + pr_err("Unable to resgister fsid: %p fscache cookie", fsc); + return 0; + } + + fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1); + if (fsc->revalidate_wq == NULL) + return -ENOMEM; + + return 0; +} + +static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct ceph_inode_info* ci = cookie_netfs_data; + uint16_t klen; + + /* use ceph virtual inode (id + snaphot) */ + klen = sizeof(ci->i_vino); + if (klen > maxbuf) + return 0; + + memcpy(buffer, &ci->i_vino, klen); + return klen; +} + +static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct ceph_aux_inode aux; + const struct ceph_inode_info* ci = cookie_netfs_data; + const struct inode* inode = &ci->vfs_inode; + + memset(&aux, 0, sizeof(aux)); + aux.mtime = inode->i_mtime; + aux.size = inode->i_size; + + memcpy(buffer, &aux, sizeof(aux)); + + return sizeof(aux); +} + +static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct ceph_inode_info* ci = cookie_netfs_data; + const struct inode* inode = &ci->vfs_inode; + + *size = inode->i_size; +} + +static enum fscache_checkaux ceph_fscache_inode_check_aux( + void *cookie_netfs_data, const void *data, uint16_t dlen) +{ + struct ceph_aux_inode aux; + struct ceph_inode_info* ci = cookie_netfs_data; + struct inode* inode = &ci->vfs_inode; + + if (dlen != sizeof(aux)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&aux, 0, sizeof(aux)); + aux.mtime = inode->i_mtime; + aux.size = inode->i_size; + + if (memcmp(data, &aux, sizeof(aux)) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + dout("ceph inode 0x%p cached okay", ci); + return FSCACHE_CHECKAUX_OKAY; +} + +static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) +{ + struct ceph_inode_info* ci = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dout("ceph inode 0x%p now uncached", ci); + + while (1) { + nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +static const struct fscache_cookie_def ceph_fscache_inode_object_def = { + .name = "CEPH.inode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = ceph_fscache_inode_get_key, + .get_attr = ceph_fscache_inode_get_attr, + .get_aux = ceph_fscache_inode_get_aux, + .check_aux = ceph_fscache_inode_check_aux, + .now_uncached = ceph_fscache_inode_now_uncached, +}; + +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, + struct ceph_inode_info* ci) +{ + struct inode* inode = &ci->vfs_inode; + + /* No caching for filesystem */ + if (fsc->fscache == NULL) + return; + + /* Only cache for regular files that are read only */ + if ((ci->vfs_inode.i_mode & S_IFREG) == 0) + return; + + /* Avoid multiple racing open requests */ + mutex_lock(&inode->i_mutex); + + if (ci->fscache) + goto done; + + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + ci, true); + fscache_check_consistency(ci->fscache); +done: + mutex_unlock(&inode->i_mutex); + +} + +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ + struct fscache_cookie* cookie; + + if ((cookie = ci->fscache) == NULL) + return; + + ci->fscache = NULL; + + fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); + fscache_relinquish_cookie(cookie, 0); +} + +static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); +} + +static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); + + unlock_page(page); +} + +static inline int cache_valid(struct ceph_inode_info *ci) +{ + return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && + (ci->i_fscache_gen == ci->i_rdcache_gen)); +} + + +/* Atempt to read from the fscache, + * + * This function is called from the readpage_nounlock context. DO NOT attempt to + * unlock the page here (or in the callback). + */ +int ceph_readpage_from_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_page(ci->fscache, page, + ceph_vfs_readpage_complete, NULL, + GFP_KERNEL); + + switch (ret) { + case 0: /* Page found */ + dout("page read submitted\n"); + return 0; + case -ENOBUFS: /* Pages were not found, and can't be */ + case -ENODATA: /* Pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, + ceph_vfs_readpage_complete_unlock, + NULL, mapping_gfp_mask(mapping)); + + switch (ret) { + case 0: /* All pages found */ + dout("all-page read submitted\n"); + return 0; + case -ENOBUFS: /* Some pages were not found, and can't be */ + case -ENODATA: /* some pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +void ceph_readpage_to_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!PageFsCache(page)) + return; + + if (!cache_valid(ci)) + return; + + ret = fscache_write_page(ci->fscache, page, GFP_KERNEL); + if (ret) + fscache_uncache_page(ci->fscache, page); +} + +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!PageFsCache(page)) + return; + + fscache_wait_on_page_write(ci->fscache, page); + fscache_uncache_page(ci->fscache, page); +} + +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ + if (fsc->revalidate_wq) + destroy_workqueue(fsc->revalidate_wq); + + fscache_relinquish_cookie(fsc->fscache, 0); + fsc->fscache = NULL; +} + +static void ceph_revalidate_work(struct work_struct *work) +{ + int issued; + u32 orig_gen; + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_revalidate_work); + struct inode *inode = &ci->vfs_inode; + + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + orig_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); + + if (!(issued & CEPH_CAP_FILE_CACHE)) { + dout("revalidate_work lost cache before validation %p\n", + inode); + goto out; + } + + if (!fscache_check_consistency(ci->fscache)) + fscache_invalidate(ci->fscache); + + spin_lock(&ci->i_ceph_lock); + /* Update the new valid generation (backwards sanity check too) */ + if (orig_gen > ci->i_fscache_gen) { + ci->i_fscache_gen = orig_gen; + } + spin_unlock(&ci->i_ceph_lock); + +out: + iput(&ci->vfs_inode); +} + +void ceph_queue_revalidate(struct inode *inode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + + if (fsc->revalidate_wq == NULL || ci->fscache == NULL) + return; + + ihold(inode); + + if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq, + &ci->i_revalidate_work)) { + dout("ceph_queue_revalidate %p\n", inode); + } else { + dout("ceph_queue_revalidate %p failed\n)", inode); + iput(inode); + } +} + +void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ + ci->fscache = NULL; + /* The first load is verifed cookie open time */ + ci->i_fscache_gen = 1; + INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work); +} diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h new file mode 100644 index 00000000000..5ac591bd012 --- /dev/null +++ b/fs/ceph/cache.h @@ -0,0 +1,182 @@ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#ifndef _CEPH_CACHE_H +#define _CEPH_CACHE_H + +#ifdef CONFIG_CEPH_FSCACHE + +extern struct fscache_netfs ceph_cache_netfs; + +int ceph_fscache_register(void); +void ceph_fscache_unregister(void); + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc); +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); + +void ceph_fscache_inode_init(struct ceph_inode_info *ci); +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, + struct ceph_inode_info* ci); +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); + +int ceph_readpage_from_fscache(struct inode *inode, struct page *page); +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages); +void ceph_readpage_to_fscache(struct inode *inode, struct page *page); +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); +void ceph_queue_revalidate(struct inode *inode); + +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + fscache_attr_changed(ci->fscache); +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ + fscache_invalidate(ceph_inode(inode)->fscache); +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_uncache_page(ci->fscache, page); +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + struct inode* inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_maybe_release_page(ci->fscache, page, gfp); +} + +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) + __fscache_uncache_page(ci->fscache, page); +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_readpages_cancel(ci->fscache, pages); +} + +#else + +static inline int ceph_fscache_register(void) +{ + return 0; +} + +static inline void ceph_fscache_unregister(void) +{ +} + +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ + return 0; +} + +static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ +} + +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ +} + +static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, + struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *pages) +{ +} + +static inline int ceph_readpage_from_fscache(struct inode* inode, + struct page *page) +{ + return -ENOBUFS; +} + +static inline int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void ceph_readpage_to_fscache(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ +} + +static inline void ceph_invalidate_fscache_page(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + return 1; +} + +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ +} + +static inline void ceph_queue_revalidate(struct inode *inode) +{ +} + +#endif + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 620daad201d..1fde164b74b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -10,6 +10,7 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> @@ -147,7 +148,7 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) spin_unlock(&mdsc->caps_list_lock); } -int ceph_reserve_caps(struct ceph_mds_client *mdsc, +void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { int i; @@ -155,7 +156,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, int have; int alloc = 0; LIST_HEAD(newcaps); - int ret = 0; dout("reserve caps ctx=%p need=%d\n", ctx, need); @@ -174,14 +174,15 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, for (i = have; i < need; i++) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) { - ret = -ENOMEM; - goto out_alloc_count; - } + if (!cap) + break; list_add(&cap->caps_item, &newcaps); alloc++; } - BUG_ON(have + alloc != need); + /* we didn't manage to reserve as much as we needed */ + if (have + alloc != need) + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; @@ -197,13 +198,6 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); - return 0; - -out_alloc_count: - /* we didn't manage to reserve as much as we needed */ - pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have); - return ret; } int ceph_unreserve_caps(struct ceph_mds_client *mdsc, @@ -227,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc, return 0; } -static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, - struct ceph_cap_reservation *ctx) +struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { struct ceph_cap *cap = NULL; @@ -236,8 +230,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { + spin_lock(&mdsc->caps_list_lock); mdsc->caps_use_count++; mdsc->caps_total_count++; + spin_unlock(&mdsc->caps_list_lock); } return cap; } @@ -484,19 +480,22 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * i_rdcache_gen. */ if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; + } /* - * if we are newly issued FILE_SHARED, clear D_COMPLETE; we + * if we are newly issued FILE_SHARED, mark dir not complete; we * don't know what happened to this directory while we didn't * have the cap. */ if ((issued & CEPH_CAP_FILE_SHARED) && (had & CEPH_CAP_FILE_SHARED) == 0) { ci->i_shared_gen++; - if (S_ISDIR(ci->vfs_inode.i_mode)) - ceph_dir_clear_complete(&ci->vfs_inode); + if (S_ISDIR(ci->vfs_inode.i_mode)) { + dout(" marking %p NOT complete\n", &ci->vfs_inode); + __ceph_dir_clear_complete(ci); + } } } @@ -509,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * it is < 0. (This is so we can atomically add the cap and add an * open file reference to it.) */ -int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned seq, unsigned mseq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation) +void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap **new_cap) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap *new_cap = NULL; struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; @@ -532,44 +530,45 @@ int ceph_add_cap(struct inode *inode, if (fmode >= 0) wanted |= ceph_caps_for_mode(fmode); -retry: - spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { - if (new_cap) { - cap = new_cap; - new_cap = NULL; - } else { - spin_unlock(&ci->i_ceph_lock); - new_cap = get_cap(mdsc, caps_reservation); - if (new_cap == NULL) - return -ENOMEM; - goto retry; - } + cap = *new_cap; + *new_cap = NULL; cap->issued = 0; cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; + cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); - /* clear out old exporting info? (i.e. on cap import) */ - if (ci->i_cap_exporting_mds == mds) { - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; - } - /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; spin_unlock(&session->s_cap_lock); - } else if (new_cap) - ceph_put_cap(mdsc, new_cap); + } else { + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } + } if (!ci->i_snap_realm) { /* @@ -607,10 +606,15 @@ retry: __cap_delay_requeue(mdsc, ci); } - if (flags & CEPH_CAP_FLAG_AUTH) - ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) - ci->i_auth_cap = NULL; + if (flags & CEPH_CAP_FLAG_AUTH) { + if (ci->i_auth_cap == NULL || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { + ci->i_auth_cap = cap; + cap->mds_wanted = wanted; + } + } else { + WARN_ON(ci->i_auth_cap == cap); + } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), @@ -618,7 +622,10 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - cap->mds_wanted |= wanted; + if (ceph_seq_cmp(mseq, cap->mseq) > 0) + cap->mds_wanted = wanted; + else + cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; @@ -626,9 +633,6 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); - spin_unlock(&ci->i_ceph_lock); - wake_up_all(&ci->i_cap_wq); - return 0; } /* @@ -663,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap) */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { - int have = ci->i_snap_caps | ci->i_cap_exporting_issued; + int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; @@ -679,6 +683,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (implemented) *implemented |= cap->implemented; } + /* + * exclude caps issued by non-auth MDS, but are been revoking + * by the auth MDS. The non-auth MDS should be revoking/exporting + * these caps, but the message is delayed. + */ + if (ci->i_auth_cap) { + cap = ci->i_auth_cap; + have &= ~cap->implemented | cap->issued; + } return have; } @@ -786,22 +799,28 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* * Return true if mask caps are currently being revoked by an MDS. */ -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask) { - struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; struct rb_node *p; - int ret = 0; - spin_lock(&ci->i_ceph_lock); for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (__cap_is_valid(cap) && - (cap->implemented & ~cap->issued & mask)) { - ret = 1; - break; - } + if (cap != ocap && + (cap->implemented & ~cap->issued & mask)) + return 1; } + return 0; +} + +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_caps_revoking_other(ci, NULL, mask); spin_unlock(&ci->i_ceph_lock); dout("ceph_caps_revoking %p %s = %d\n", inode, ceph_cap_string(mask), ret); @@ -850,7 +869,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; - mds_wanted |= cap->mds_wanted; + if (cap == ci->i_auth_cap) + mds_wanted |= cap->mds_wanted; + else + mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } @@ -860,7 +882,19 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) */ static int __ceph_is_any_caps(struct ceph_inode_info *ci) { - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; + return !RB_EMPTY_ROOT(&ci->i_caps); +} + +int ceph_is_any_caps(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_is_any_caps(ci); + spin_unlock(&ci->i_ceph_lock); + + return ret; } /* @@ -869,7 +903,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci) * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ -void __ceph_remove_cap(struct ceph_cap *cap) +void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; @@ -881,6 +915,16 @@ void __ceph_remove_cap(struct ceph_cap *cap) /* remove from session list */ spin_lock(&session->s_cap_lock); + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || + cap->cap_gen == session->s_cap_gen)) + __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, + cap->mseq, cap->issue_seq); + if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ dout("__ceph_remove_cap delaying %p removal from session %p\n", @@ -928,7 +972,7 @@ static int send_cap_msg(struct ceph_mds_session *session, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, - uid_t uid, gid_t gid, umode_t mode, + kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, u64 follows) @@ -972,8 +1016,8 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_timespec(&fc->atime, atime); fc->time_warp_seq = cpu_to_le32(time_warp_seq); - fc->uid = cpu_to_le32(uid); - fc->gid = cpu_to_le32(gid); + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); fc->mode = cpu_to_le32(mode); fc->xattr_version = cpu_to_le64(xattr_version); @@ -987,15 +1031,14 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -static void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) +void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) { struct ceph_msg *msg; struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; - spin_lock(&session->s_cap_lock); BUG_ON(!session->s_num_cap_releases); msg = list_first_entry(&session->s_cap_releases, struct ceph_msg, list_head); @@ -1005,7 +1048,7 @@ static void __queue_cap_release(struct ceph_mds_session *session, BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); head = msg->front.iov_base; - head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + le32_add_cpu(&head->num, 1); item = msg->front.iov_base + msg->front.iov_len; item->ino = cpu_to_le64(ino); item->cap_id = cpu_to_le64(cap_id); @@ -1024,7 +1067,6 @@ static void __queue_cap_release(struct ceph_mds_session *session, (int)CEPH_CAPS_PER_RELEASE, (int)msg->front.iov_len); } - spin_unlock(&session->s_cap_lock); } /* @@ -1039,12 +1081,8 @@ void ceph_queue_caps_release(struct inode *inode) p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); - struct ceph_mds_session *session = cap->session; - - __queue_cap_release(session, ceph_ino(inode), cap->cap_id, - cap->mseq, cap->issue_seq); p = rb_next(p); - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, true); } } @@ -1079,8 +1117,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct timespec mtime, atime; int wake = 0; umode_t mode; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; struct ceph_mds_session *session; u64 xattr_version = 0; struct ceph_buffer *xattr_blob = NULL; @@ -1349,8 +1387,9 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) if (!ci->i_head_snapc) ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); - dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, - ci->i_head_snapc); + dout(" inode %p now dirty snapc %p auth cap %p\n", + &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &mdsc->cap_dirty); @@ -1454,7 +1493,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - int file_wanted, used; + int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list @@ -1557,9 +1596,14 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ + cap_used = used; + if (ci->i_auth_cap && cap != ci->i_auth_cap) + cap_used &= ~ci->i_auth_cap->issued; + revoking = cap->implemented & ~cap->issued; - dout(" mds%d cap %p issued %s implemented %s revoking %s\n", + dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap_used), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -1587,7 +1631,7 @@ retry_locked: } /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & used) == 0) { + if (revoking && (revoking & cap_used) == 0) { dout("completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; @@ -1664,8 +1708,8 @@ ack: sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, - retain, flushing, NULL); + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, + want, retain, flushing, NULL); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -1694,13 +1738,12 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, - unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, unsigned *flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int unlock_session = session ? 0 : 1; int flushing = 0; + struct ceph_mds_session *session = NULL; retry: spin_lock(&ci->i_ceph_lock); @@ -1714,13 +1757,14 @@ retry: int want = __ceph_caps_wanted(ci); int delayed; - if (!session) { + if (!session || session != cap->session) { spin_unlock(&ci->i_ceph_lock); + if (session) + mutex_unlock(&session->s_mutex); session = cap->session; mutex_lock(&session->s_mutex); goto retry; } - BUG_ON(session != cap->session); if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; @@ -1739,7 +1783,7 @@ retry: out: spin_unlock(&ci->i_ceph_lock); out_unlocked: - if (session && unlock_session) + if (session) mutex_unlock(&session->s_mutex); return flushing; } @@ -1824,7 +1868,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; mutex_lock(&inode->i_mutex); - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); /* @@ -1859,7 +1903,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) dout("write_inode %p wait=%d\n", inode, wait); if (wait) { - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); @@ -1955,8 +1999,15 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, cap = ci->i_auth_cap; dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + __ceph_flush_snaps(ci, &session, 1); + if (ci->i_flushing_caps) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &cap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, __ceph_caps_used(ci), __ceph_caps_wanted(ci), @@ -2027,11 +2078,20 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } - if (need & CEPH_CAP_FILE_WR) { + /* finish pending truncate */ + while (ci->i_truncate_pending) { + spin_unlock(&ci->i_ceph_lock); + __ceph_do_pending_vmtruncate(inode); + spin_lock(&ci->i_ceph_lock); + } + + have = __ceph_caps_issued(ci, &implemented); + + if (have & need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); - if (endoff > ci->i_wanted_max_size) { + if (endoff > ci->i_requested_max_size) { *check_max = 1; ret = 1; } @@ -2046,13 +2106,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } } - have = __ceph_caps_issued(ci, &implemented); - - /* - * disallow writes while a truncate is pending - */ - if (ci->i_truncate_pending) - have &= ~CEPH_CAP_FILE_WR; if ((have & need) == need) { /* @@ -2094,14 +2147,17 @@ static void check_max_size(struct inode *inode, loff_t endoff) /* do we need to explicitly request a larger max_size? */ spin_lock(&ci->i_ceph_lock); - if ((endoff >= ci->i_max_size || - endoff > (inode->i_size << 1)) && - endoff > ci->i_wanted_max_size) { + if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { dout("write %p at large endoff %llu, req max_size\n", inode, endoff); ci->i_wanted_max_size = endoff; - check = 1; } + /* duplicate ceph_check_caps()'s logic */ + if (ci->i_auth_cap && + (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && + ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) + check = 1; spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); @@ -2287,41 +2343,88 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } /* + * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. + */ +static void invalidate_aliases(struct inode *inode) +{ + struct dentry *dn, *prev = NULL; + + dout("invalidate_aliases inode %p\n", inode); + d_prune_aliases(inode); + /* + * For non-directory inode, d_find_alias() only returns + * hashed dentry. After calling d_invalidate(), the + * dentry becomes unhashed. + * + * For directory inode, d_find_alias() can return + * unhashed dentry. But directory inode should have + * one alias at most. + */ + while ((dn = d_find_alias(inode))) { + if (dn == prev) { + dput(dn); + break; + } + d_invalidate(dn); + if (prev) + dput(prev); + prev = dn; + } + if (prev) + dput(prev); +} + +/* * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * * caller holds s_mutex and i_ceph_lock, we drop both. - * - * return value: - * 0 - ok - * 1 - check_caps on auth cap only (writeback) - * 2 - check_caps (ack revoke) */ -static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, +static void handle_cap_grant(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *grant, + void *snaptrace, int snaptrace_len, + struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, - struct ceph_buffer *xattr_buf) - __releases(ci->i_ceph_lock) + struct ceph_cap *cap, int issued) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); - int issued, implemented, used, wanted, dirty; + int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); struct timespec mtime, atime, ctime; int check_caps = 0; - int wake = 0; - int writeback = 0; - int revoked_rdcache = 0; - int queue_invalidate = 0; + bool wake = 0; + bool writeback = 0; + bool queue_trunc = 0; + bool queue_invalidate = 0; + bool queue_revalidate = 0; + bool deleted_inode = 0; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); + + /* + * auth mds of the inode changed. we received the cap export message, + * but still haven't received the cap import message. handle_cap_export + * updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message + * that was sent before the cap import message. So don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); + seq = cap->seq; + newcaps |= cap->issued; + } + /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we @@ -2330,9 +2433,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { - if (try_nonblocking_invalidate(inode) == 0) { - revoked_rdcache = 1; - } else { + if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { @@ -2340,27 +2441,33 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ci->i_rdcache_revoking = ci->i_rdcache_gen; } } + + ceph_fscache_invalidate(inode); } /* side effects now are allowed */ - - issued = __ceph_caps_issued(ci, &implemented); - issued |= implemented | __ceph_caps_dirty(ci); - cap->cap_gen = session->s_cap_gen; + cap->seq = seq; __check_cap_issue(ci, cap, newcaps); - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); - inode->i_uid = le32_to_cpu(grant->uid); - inode->i_gid = le32_to_cpu(grant->gid); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - inode->i_uid, inode->i_gid); + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); + if (inode->i_nlink == 0 && + (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + deleted_inode = 1; + } if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); @@ -2373,29 +2480,44 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; + ceph_forget_all_cached_acls(inode); } } - /* size/ctime/mtime/atime? */ - ceph_fill_file_size(inode, issued, - le32_to_cpu(grant->truncate_seq), - le64_to_cpu(grant->truncate_size), size); - ceph_decode_timespec(&mtime, &grant->mtime); - ceph_decode_timespec(&atime, &grant->atime); - ceph_decode_timespec(&ctime, &grant->ctime); - ceph_fill_file_time(inode, issued, - le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, - &atime); - - /* max size increase? */ - if (max_size != ci->i_max_size) { - dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); - ci->i_max_size = max_size; - if (max_size >= ci->i_wanted_max_size) { - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; + /* Do we need to revalidate our fscache cookie. Don't bother on the + * first cache cap as we already validate at cookie creation time. */ + if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) + queue_revalidate = 1; + + if (newcaps & CEPH_CAP_ANY_RD) { + /* ctime/mtime/atime? */ + ceph_decode_timespec(&mtime, &grant->mtime); + ceph_decode_timespec(&atime, &grant->atime); + ceph_decode_timespec(&ctime, &grant->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(grant->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { + /* file layout may have changed */ + ci->i_layout = grant->layout; + /* size/truncate_seq? */ + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), + size); + /* max size increase? */ + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", + ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = 1; } - wake = 1; } /* check cap bits */ @@ -2410,14 +2532,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, dout("mds wanted %s -> %s\n", ceph_cap_string(le32_to_cpu(grant->wanted)), ceph_cap_string(wanted)); - grant->wanted = cpu_to_le32(wanted); + /* imported cap may not have correct mds_wanted */ + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) + check_caps = 1; } - cap->seq = seq; - - /* file layout may have changed */ - ci->i_layout = grant->layout; - /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; @@ -2444,6 +2563,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } else { dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + /* non-auth MDS is revoking the newly grant caps ? */ + if (cap == ci->i_auth_cap && + __ceph_caps_revoking_other(ci, cap, newcaps)) + check_caps = 2; + cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a @@ -2453,6 +2577,24 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, BUG_ON(cap->issued & ~cap->implemented); spin_unlock(&ci->i_ceph_lock); + + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, false); + downgrade_write(&mdsc->snap_rwsem); + kick_flushing_inode_caps(mdsc, session, inode); + up_read(&mdsc->snap_rwsem); + if (newcaps & ~issued) + wake = 1; + } + + if (queue_trunc) { + ceph_queue_vmtruncate(inode); + ceph_queue_revalidate(inode); + } else if (queue_revalidate) + ceph_queue_revalidate(inode); + if (writeback) /* * queue inode for writeback: we can't actually call @@ -2462,6 +2604,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); + if (deleted_inode) + invalidate_aliases(inode); if (wake) wake_up_all(&ci->i_cap_wq); @@ -2618,8 +2762,10 @@ static void handle_cap_trunc(struct inode *inode, truncate_seq, truncate_size, size); spin_unlock(&ci->i_ceph_lock); - if (queue_trunc) + if (queue_trunc) { ceph_queue_vmtruncate(inode); + ceph_fscache_invalidate(inode); + } } /* @@ -2631,122 +2777,200 @@ static void handle_cap_trunc(struct inode *inode, * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, - struct ceph_mds_session *session, - int *open_target_sessions) + struct ceph_mds_cap_peer *ph, + struct ceph_mds_session *session) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *tsession = NULL; + struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); - int mds = session->s_mds; + u64 t_cap_id; unsigned mseq = le32_to_cpu(ex->migrate_seq); - struct ceph_cap *cap = NULL, *t; - struct rb_node *p; - int remember = 1; + unsigned t_seq, t_mseq; + int target, issued; + int mds = session->s_mds; - dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (ph) { + t_cap_id = le64_to_cpu(ph->cap_id); + t_seq = le32_to_cpu(ph->seq); + t_mseq = le32_to_cpu(ph->mseq); + target = le32_to_cpu(ph->mds); + } else { + t_cap_id = t_seq = t_mseq = 0; + target = -1; + } + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", + inode, ci, mds, mseq, target); +retry: spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) + goto out_unlock; - /* make sure we haven't seen a higher mseq */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - t = rb_entry(p, struct ceph_cap, ci_node); - if (ceph_seq_cmp(t->mseq, mseq) > 0) { - dout(" higher mseq on cap from mds%d\n", - t->session->s_mds); - remember = 0; - } - if (t->session->s_mds == mds) - cap = t; + if (target < 0) { + __ceph_remove_cap(cap, false); + goto out_unlock; } - if (cap) { - if (remember) { - /* make note */ - ci->i_cap_exporting_mds = mds; - ci->i_cap_exporting_mseq = mseq; - ci->i_cap_exporting_issued = cap->issued; - - /* - * make sure we have open sessions with all possible - * export targets, so that we get the matching IMPORT - */ - *open_target_sessions = 1; + /* + * now we know we haven't received the cap import message yet + * because the exported cap still exist. + */ - /* - * we can't flush dirty caps that we've seen the - * EXPORT but no IMPORT for - */ - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p to cap_dirty_migrating\n", - inode); - list_move(&ci->i_dirty_item, - &mdsc->cap_dirty_migrating); + issued = cap->issued; + WARN_ON(issued != cap->implemented); + + tcap = __get_cap_for_mds(ci, target); + if (tcap) { + /* already have caps from the target */ + if (tcap->cap_id != t_cap_id || + ceph_seq_cmp(tcap->seq, t_seq) < 0) { + dout(" updating import cap %p mds%d\n", tcap, target); + tcap->cap_id = t_cap_id; + tcap->seq = t_seq - 1; + tcap->issue_seq = t_seq - 1; + tcap->mseq = t_mseq; + tcap->issued |= issued; + tcap->implemented |= issued; + if (cap == ci->i_auth_cap) + ci->i_auth_cap = tcap; + if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); } - spin_unlock(&mdsc->cap_dirty_lock); } - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, false); + goto out_unlock; + } else if (tsession) { + /* add placeholder for the export tagert */ + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, + t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + + __ceph_remove_cap(cap, false); + goto out_unlock; } - /* else, we already released it */ spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + + /* open target session */ + tsession = ceph_mdsc_open_export_target_session(mdsc, target); + if (!IS_ERR(tsession)) { + if (mds > target) { + mutex_lock(&session->s_mutex); + mutex_lock_nested(&tsession->s_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&tsession->s_mutex); + mutex_lock_nested(&session->s_mutex, + SINGLE_DEPTH_NESTING); + } + ceph_add_cap_releases(mdsc, tsession); + new_cap = ceph_get_cap(mdsc, NULL); + } else { + WARN_ON(1); + tsession = NULL; + target = -1; + } + goto retry; + +out_unlock: + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + if (tsession) { + mutex_unlock(&tsession->s_mutex); + ceph_put_mds_session(tsession); + } + if (new_cap) + ceph_put_cap(mdsc, new_cap); } /* - * Handle cap IMPORT. If there are temp bits from an older EXPORT, - * clean them up. + * Handle cap IMPORT. * - * caller holds s_mutex. + * caller holds s_mutex. acquires i_ceph_lock */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, - void *snaptrace, int snaptrace_len) + struct ceph_cap **target_cap, int *old_issued) + __acquires(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; - unsigned issued = le32_to_cpu(im->caps); + int issued; + unsigned caps = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); unsigned seq = le32_to_cpu(im->seq); unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + int peer; + + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); + } else { + p_cap_id = 0; + peer = -1; + } - if (ci->i_cap_exporting_mds >= 0 && - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { - dout("handle_cap_import inode %p ci %p mds%d mseq %d" - " - cleared exporting from mds%d\n", - inode, ci, mds, mseq, - ci->i_cap_exporting_mds); - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", + inode, ci, mds, mseq, peer); - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p back to cap_dirty\n", inode); - list_move(&ci->i_dirty_item, &mdsc->cap_dirty); +retry: + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (!new_cap) { + spin_unlock(&ci->i_ceph_lock); + new_cap = ceph_get_cap(mdsc, NULL); + goto retry; } - spin_unlock(&mdsc->cap_dirty_lock); + cap = new_cap; } else { - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (new_cap) { + ceph_put_cap(mdsc, new_cap); + new_cap = NULL; + } } - down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, - false); - downgrade_write(&mdsc->snap_rwsem); - ceph_add_cap(inode, session, cap_id, -1, - issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, - NULL /* no caps context */); - kick_flushing_inode_caps(mdsc, session, inode); - up_read(&mdsc->snap_rwsem); + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + + ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, + realmino, CEPH_CAP_FLAG_AUTH, &new_cap); + + ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; + if (ocap && ocap->cap_id == p_cap_id) { + dout(" remove export cap %p mds%d flags %d\n", + ocap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (ocap->seq != le32_to_cpu(ph->seq) || + ocap->mseq != le32_to_cpu(ph->mseq))) { + pr_err("handle_cap_import: mismatched seq/mseq: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "importer mds%d has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, ocap->seq, + ocap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); + } + __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); + } /* make sure we re-request max_size, if necessary */ - spin_lock(&ci->i_ceph_lock); + ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; - spin_unlock(&ci->i_ceph_lock); + + *old_issued = issued; + *target_cap = cap; } /* @@ -2764,8 +2988,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; + struct ceph_mds_cap_peer *peer = NULL; int mds = session->s_mds; - int op; + int op, issued; u32 seq, mseq; struct ceph_vino vino; u64 cap_id; @@ -2774,12 +2999,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *snaptrace; size_t snaptrace_len; void *flock; + void *end; u32 flock_len; - int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); /* decode */ + end = msg->front.iov_base + msg->front.iov_len; tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -2797,22 +3023,36 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace_len = le32_to_cpu(h->snap_trace_len); if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p, *end; - - p = snaptrace + snaptrace_len; - end = msg->front.iov_base + msg->front.iov_len; + void *p = snaptrace + snaptrace_len; ceph_decode_32_safe(&p, end, flock_len, bad); + if (p + flock_len > end) + goto bad; flock = p; } else { flock = NULL; flock_len = 0; } + if (le16_to_cpu(msg->hdr.version) >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + void *p = flock + flock_len; + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + } else if (op == CEPH_CAP_OP_EXPORT) { + /* recorded in unused fields */ + peer = (void *)&h->size; + } + } + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); + if (op == CEPH_CAP_OP_IMPORT) + ceph_add_cap_releases(mdsc, session); + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); @@ -2821,9 +3061,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, if (!inode) { dout(" i don't have ino %llx\n", vino.ino); - if (op == CEPH_CAP_OP_IMPORT) + if (op == CEPH_CAP_OP_IMPORT) { + spin_lock(&session->s_cap_lock); __queue_cap_release(session, vino.ino, cap_id, mseq, seq); + spin_unlock(&session->s_cap_lock); + } goto flush_cap_releases; } @@ -2834,13 +3077,14 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_EXPORT: - handle_cap_export(inode, h, session, &open_target_sessions); - goto done; + handle_cap_export(inode, h, peer, session); + goto done_unlocked; case CEPH_CAP_OP_IMPORT: - handle_cap_import(mdsc, inode, h, session, - snaptrace, snaptrace_len); - ceph_check_caps(ceph_inode(inode), 0, session); + handle_cap_import(mdsc, inode, h, peer, session, + &cap, &issued); + handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, + msg->middle, session, cap, issued); goto done_unlocked; } @@ -2858,7 +3102,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: - handle_cap_grant(inode, h, session, cap, msg->middle); + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, + session, cap, issued); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: @@ -2891,8 +3138,6 @@ done: done_unlocked: if (inode) iput(inode); - if (open_target_sessions) - ceph_mdsc_open_export_target_sessions(mdsc, session); return; bad: @@ -3010,21 +3255,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, (cap->issued & unless) == 0)) { if ((cap->issued & drop) && (cap->issued & unless) == 0) { - dout("encode_inode_release %p cap %p %s -> " - "%s\n", inode, cap, + int wanted = __ceph_caps_wanted(ci); + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) + wanted |= cap->mds_wanted; + dout("encode_inode_release %p cap %p " + "%s -> %s, wanted %s -> %s\n", inode, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & ~drop)); + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); + cap->issued &= ~drop; cap->implemented &= ~drop; - if (ci->i_ceph_flags & CEPH_I_NODELAY) { - int wanted = __ceph_caps_wanted(ci); - dout(" wanted %s -> %s (act %s)\n", - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(cap->mds_wanted & - ~wanted), - ceph_cap_string(wanted)); - cap->mds_wanted &= wanted; - } + cap->mds_wanted = wanted; } else { dout("encode_inode_release %p cap %p %s" " (force)\n", inode, cap, @@ -3036,7 +3279,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq), rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fb962efdace..5a743ac141a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -71,9 +71,9 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); if (req->r_got_unsafe) - seq_printf(s, "\t(unsafe)"); + seq_puts(s, "\t(unsafe)"); else - seq_printf(s, "\t"); + seq_puts(s, "\t"); if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); @@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p) } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); + } else { + seq_printf(s, " #%llx", req->r_ino1.ino); } if (req->r_old_dentry) { @@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_old_dentry_dir), + req->r_old_dentry_dir ? + ceph_ino(req->r_old_dentry_dir) : 0, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, path ? path : ""); @@ -116,7 +119,7 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, " %s", req->r_path2); } - seq_printf(s, "\n"); + seq_puts(s, "\n"); } mutex_unlock(&mdsc->mutex); @@ -201,6 +204,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) int err = -ENOMEM; dout("ceph_fs_debugfs_init\n"); + BUG_ON(!fsc->client->debugfs_dir); fsc->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 0600, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3e8094be460..c29d6ae6887 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -51,8 +51,7 @@ int ceph_init_dentry(struct dentry *dentry) goto out_unlock; } - if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ - ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) d_set_d_op(dentry, &ceph_dentry_ops); else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) d_set_d_op(dentry, &ceph_snapdir_dentry_ops); @@ -79,7 +78,7 @@ struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) return NULL; spin_lock(&dentry->d_lock); - if (dentry->d_parent) { + if (!IS_ROOT(dentry)) { inode = dentry->d_parent->d_inode; ihold(inode); } @@ -101,6 +100,14 @@ static unsigned fpos_off(loff_t p) return p & 0xffffffff; } +static int fpos_cmp(loff_t l, loff_t r) +{ + int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r)); + if (v) + return v; + return (int)(fpos_off(l) - fpos_off(r)); +} + /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on @@ -108,15 +115,15 @@ static unsigned fpos_off(loff_t p) * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * - * D_COMPLETE tells indicates we have all dentries in the dir. It is + * Complete dir indicates that we have all dentries in the dir. It is * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ -static int __dcache_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int __dcache_readdir(struct file *file, struct dir_context *ctx, + u32 shared_gen) { - struct ceph_file_info *fi = filp->private_data; - struct dentry *parent = filp->f_dentry; + struct ceph_file_info *fi = file->private_data; + struct dentry *parent = file->f_dentry; struct inode *dir = parent->d_inode; struct list_head *p; struct dentry *dentry, *last; @@ -127,14 +134,14 @@ static int __dcache_readdir(struct file *filp, last = fi->dentry; fi->dentry = NULL; - dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, - last); + dout("__dcache_readdir %p v%u at %llu (last %p)\n", + dir, shared_gen, ctx->pos, last); spin_lock(&parent->d_lock); /* start at beginning? */ - if (filp->f_pos == 2 || last == NULL || - filp->f_pos < ceph_dentry(last)->offset) { + if (ctx->pos == 2 || last == NULL || + fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; @@ -155,14 +162,15 @@ more: goto out_unlock; } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (!d_unhashed(dentry) && dentry->d_inode && + if (di->lease_shared_gen == shared_gen && + !d_unhashed(dentry) && dentry->d_inode && ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && - filp->f_pos <= di->offset) + fpos_cmp(ctx->pos, di->offset) <= 0) break; dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, dentry->d_name.len, dentry->d_name.name, di->offset, - filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", + ctx->pos, d_unhashed(dentry) ? " unhashed" : "", !dentry->d_inode ? " null" : ""); spin_unlock(&dentry->d_lock); p = p->prev; @@ -174,36 +182,34 @@ more: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); + dput(dentry); + err = -EAGAIN; + goto out; + } + + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - filp->f_pos = di->offset; - err = filldir(dirent, dentry->d_name.name, - dentry->d_name.len, di->offset, + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), - dentry->d_inode->i_mode >> 12); - - if (last) { - if (err < 0) { + dentry->d_inode->i_mode >> 12)) { + if (last) { /* remember our position */ fi->dentry = last; - fi->next_offset = di->offset; - } else { - dput(last); + fi->next_offset = fpos_off(di->offset); } + dput(dentry); + return 0; } - last = dentry; - - if (err < 0) - goto out; - filp->f_pos++; + ctx->pos = di->offset + 1; - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_test_complete(dir)) { - dout(" lost D_COMPLETE on %p; falling back to mds\n", dir); - err = -EAGAIN; - goto out; - } + if (last) + dput(last); + last = dentry; spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ @@ -236,61 +242,62 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, return 0; } -static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int ceph_readdir(struct file *file, struct dir_context *ctx) { - struct ceph_file_info *fi = filp->private_data; - struct inode *inode = filp->f_dentry->d_inode; + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(filp->f_pos); - int off = fpos_off(filp->f_pos); + unsigned frag = fpos_frag(ctx->pos); + int off = fpos_off(ctx->pos); int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - const int max_entries = fsc->mount_options->max_readdir; - const int max_bytes = fsc->mount_options->max_readdir_bytes; - dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); + dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); if (fi->flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ - if (filp->f_pos == 0) { + if (ctx->pos == 0) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - fi->dir_release_count = ci->i_release_count; + fi->dir_release_count = atomic_read(&ci->i_release_count); dout("readdir off 0 -> '.'\n"); - if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), + if (!dir_emit(ctx, ".", 1, ceph_translate_ino(inode->i_sb, inode->i_ino), - inode->i_mode >> 12) < 0) + inode->i_mode >> 12)) return 0; - filp->f_pos = 1; + ctx->pos = 1; off = 1; } - if (filp->f_pos == 1) { - ino_t ino = parent_ino(filp->f_dentry); + if (ctx->pos == 1) { + ino_t ino = parent_ino(file->f_dentry); dout("readdir off 1 -> '..'\n"); - if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), + if (!dir_emit(ctx, "..", 2, ceph_translate_ino(inode->i_sb, ino), - inode->i_mode >> 12) < 0) + inode->i_mode >> 12)) return 0; - filp->f_pos = 2; + ctx->pos = 2; off = 2; } /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); - if ((filp->f_pos == 2 || fi->dentry) && + if ((ctx->pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - ceph_dir_test_complete(inode) && + __ceph_dir_is_complete(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { + u32 shared_gen = ci->i_shared_gen; spin_unlock(&ci->i_ceph_lock); - err = __dcache_readdir(filp, dirent, filldir); + err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; + frag = fpos_frag(ctx->pos); + off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -318,17 +325,19 @@ more: fi->last_readdir = NULL; } - /* requery frag tree, as the frag topology may have changed */ - frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); - dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) { + ceph_mdsc_put_request(req); + return err; + } req->r_inode = inode; ihold(inode); - req->r_dentry = dget(filp->f_dentry); + req->r_dentry = dget(file->f_dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); @@ -336,9 +345,6 @@ more: req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); - req->r_args.readdir.max_entries = cpu_to_le32(max_entries); - req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); - req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); @@ -351,10 +357,21 @@ more: if (!req->r_did_prepopulate) { dout("readdir !did_prepopulate"); - fi->dir_release_count--; /* preclude D_COMPLETE */ + /* preclude from marking dir complete */ + fi->dir_release_count--; } /* note next offset and last dentry name */ + rinfo = &req->r_reply_info; + if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; + off = fi->next_offset; + } + fi->frag = frag; fi->offset = fi->next_offset; fi->last_readdir = req; @@ -366,7 +383,6 @@ more: else fi->next_offset = 0; } else { - rinfo = &req->r_reply_info; err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], rinfo->dir_dname_len[rinfo->dir_nr-1]); @@ -379,15 +395,16 @@ more: rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d off %d chunkoff %d\n", frag, rinfo->dir_nr, off, fi->offset); + + ctx->pos = ceph_make_fpos(frag, off); while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; struct ceph_vino vino; ino_t ino; dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, pos, + off, off - fi->offset, rinfo->dir_nr, ctx->pos, rinfo->dir_dname_len[off - fi->offset], rinfo->dir_dname[off - fi->offset], in); BUG_ON(!in); @@ -395,16 +412,15 @@ more: vino.ino = le64_to_cpu(in->ino); vino.snap = le64_to_cpu(in->snapid); ino = ceph_vino_to_ino(vino); - if (filldir(dirent, + if (!dir_emit(ctx, rinfo->dir_dname[off - fi->offset], rinfo->dir_dname_len[off - fi->offset], - pos, - ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { + ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } off++; - filp->f_pos = pos + 1; + ctx->pos++; } if (fi->last_name) { @@ -417,7 +433,7 @@ more: if (!ceph_frag_is_rightmost(frag)) { frag = ceph_frag_next(frag); off = 0; - filp->f_pos = ceph_make_fpos(frag, off); + ctx->pos = ceph_make_fpos(frag, off); dout("readdir next frag is %x\n", frag); goto more; } @@ -429,17 +445,17 @@ more: * the complete dir contents in our cache. */ spin_lock(&ci->i_ceph_lock); - if (ci->i_release_count == fi->dir_release_count) { - ceph_dir_set_complete(inode); - ci->i_max_offset = filp->f_pos; + if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { + dout(" marking %p complete\n", inode); + __ceph_dir_set_complete(ci, fi->dir_release_count); } spin_unlock(&ci->i_ceph_lock); - dout("readdir %p filp %p done.\n", inode, filp); + dout("readdir %p file %p done.\n", inode, file); return 0; } -static void reset_readdir(struct ceph_file_info *fi) +static void reset_readdir(struct ceph_file_info *fi, unsigned frag) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -447,7 +463,10 @@ static void reset_readdir(struct ceph_file_info *fi) } kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 2; /* compensate for . and .. */ + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; /* compensate for . and .. */ + else + fi->next_offset = 0; if (fi->dentry) { dput(fi->dentry); fi->dentry = NULL; @@ -455,16 +474,16 @@ static void reset_readdir(struct ceph_file_info *fi) fi->flags &= ~CEPH_F_ATEND; } -static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) +static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = offset; + loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; mutex_lock(&inode->i_mutex); retval = -EINVAL; - switch (origin) { + switch (whence) { case SEEK_END: offset += inode->i_size + 2; /* FIXME */ break; @@ -476,7 +495,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) goto out; } - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset >= 0) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; @@ -489,14 +508,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) * seek to new frag, or seek prior to current chunk. */ if (offset == 0 || - fpos_frag(offset) != fpos_frag(old_offset) || + fpos_frag(offset) != fi->frag || fpos_off(offset) < fi->offset) { dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); + reset_readdir(fi, fpos_frag(offset)); } /* bump dir_release_count if we did a forward seek */ - if (offset > old_offset) + if (fpos_cmp(offset, old_offset) > 0) fi->dir_release_count--; } out: @@ -576,7 +595,7 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) * the MDS so that it gets our 'caps wanted' value in a single op. */ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -594,14 +613,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, if (err < 0) return ERR_PTR(err); - /* open (but not create!) intent? */ - if (nd && - (nd->flags & LOOKUP_OPEN) && - !(nd->intent.open.flags & O_CREAT)) { - int mode = nd->intent.open.create_mode & ~current->fs->umask; - return ceph_lookup_open(dir, dentry, nd, mode, 1); - } - /* can we conclude ENOENT locally? */ if (dentry->d_inode == NULL) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -613,7 +624,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && - ceph_dir_test_complete(dir) && + __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); @@ -648,7 +659,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, */ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) { - struct dentry *result = ceph_lookup(dir, dentry, NULL); + struct dentry *result = ceph_lookup(dir, dentry, 0); if (result && !IS_ERR(result)) { /* @@ -694,31 +705,18 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); - if (err) + + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, - struct nameidata *nd) + bool excl) { - dout("create in dir %p dentry %p name '%.*s'\n", - dir, dentry, dentry->d_name.len, dentry->d_name.name); - - if (ceph_snap(dir) != CEPH_NOSNAP) - return -EROFS; - - if (nd) { - BUG_ON((nd->flags & LOOKUP_OPEN) == 0); - dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); - /* hrm, what should i do here if we get aliased? */ - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - return 0; - } - - /* fall back to mknod */ - return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); + return ceph_mknod(dir, dentry, mode, 0); } static int ceph_symlink(struct inode *dir, struct dentry *dentry, @@ -748,7 +746,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); - if (err) + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } @@ -789,7 +789,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); out: - if (err < 0) + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } @@ -814,11 +816,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); + req->r_old_dentry = dget(old_dentry); req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_SHARED on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; err = ceph_mdsc_do_request(mdsc, dir, req); if (err) { d_drop(dentry); @@ -911,10 +914,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + ihold(old_dir); req->r_dentry = dget(new_dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); - req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry); + req->r_old_dentry_dir = old_dir; req->r_locked_dir = new_dir; req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -932,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ - /* d_move screws up d_subdirs order */ - ceph_dir_clear_complete(new_dir); - d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + } ceph_mdsc_put_request(req); return err; @@ -1028,12 +1034,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) /* * Check if cached dentry can be trusted. */ -static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) +static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) { int valid = 0; struct inode *dir; - if (nd && nd->flags & LOOKUP_RCU) + if (flags & LOOKUP_RCU) return -ECHILD; dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, @@ -1052,14 +1058,19 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) valid = 1; } else if (dentry_lease_is_valid(dentry) || dir_lease_is_valid(dir, dentry)) { - valid = 1; + if (dentry->d_inode) + valid = ceph_is_any_caps(dentry->d_inode); + else + valid = 1; } dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); - if (valid) + if (valid) { ceph_dentry_lru_touch(dentry); - else + } else { + ceph_dir_clear_complete(dir); d_drop(dentry); + } iput(dir); return valid; } @@ -1080,7 +1091,7 @@ static void ceph_d_release(struct dentry *dentry) } static int ceph_snapdir_d_revalidate(struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { /* * Eventually, we'll want to revalidate snapped metadata @@ -1090,44 +1101,6 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, } /* - * Set/clear/test dir complete flag on the dir's dentry. - */ -void ceph_dir_set_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry) && - ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) { - dout(" marking %p (%p) complete\n", inode, dentry); - set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); -} - -void ceph_dir_clear_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) { - dout(" marking %p (%p) complete\n", inode, dentry); - set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); -} - -bool ceph_dir_test_complete(struct inode *inode) -{ - struct dentry *dentry = d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) { - dout(" marking %p (%p) NOT complete\n", inode, dentry); - clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } - dput(dentry); - return false; -} - -/* * When the VFS prunes a dentry from the cache, we need to clear the * complete flag on the parent directory. * @@ -1135,15 +1108,13 @@ bool ceph_dir_test_complete(struct inode *inode) */ static void ceph_d_prune(struct dentry *dentry) { - struct ceph_dentry_info *di; - dout("ceph_d_prune %p\n", dentry); /* do we have a valid parent? */ - if (!dentry->d_parent || IS_ROOT(dentry)) + if (IS_ROOT(dentry)) return; - /* if we are not hashed, we don't affect D_COMPLETE */ + /* if we are not hashed, we don't affect dir's completeness */ if (d_unhashed(dentry)) return; @@ -1151,8 +1122,7 @@ static void ceph_d_prune(struct dentry *dentry) * we hold d_lock, so d_parent is stable, and d_fsdata is never * cleared until d_release */ - di = ceph_dentry(dentry->d_parent); - clear_bit(CEPH_D_COMPLETE, &di->flags); + ceph_dir_clear_complete(dentry->d_parent->d_inode); } /* @@ -1163,7 +1133,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct ceph_file_info *cf = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); int left; const int bufsize = 1024; @@ -1213,7 +1183,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct list_head *head = &ci->i_unsafe_dirops; struct ceph_mds_request *req; @@ -1332,7 +1302,7 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, - .readdir = ceph_readdir, + .iterate = ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, @@ -1349,6 +1319,8 @@ const struct inode_operations ceph_dir_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, .mkdir = ceph_mkdir, @@ -1357,6 +1329,7 @@ const struct inode_operations ceph_dir_iops = { .rmdir = ceph_unlink, .rename = ceph_rename, .create = ceph_create, + .atomic_open = ceph_atomic_open, }; const struct dentry_operations ceph_dentry_ops = { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index fbb2a643ef1..8d7d782f438 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -8,23 +8,6 @@ #include "mds_client.h" /* - * NFS export support - * - * NFS re-export of a ceph mount is, at present, only semireliable. - * The basic issue is that the Ceph architectures doesn't lend itself - * well to generating filehandles that will remain valid forever. - * - * So, we do our best. If you're lucky, your inode will be in the - * client's cache. If it's not, and you have a connectable fh, then - * the MDS server may be able to find it for you. Otherwise, you get - * ESTALE. - * - * There are ways to this more reliable, but in the non-connectable fh - * case, we won't every work perfectly, and in the connectable case, - * some changes are needed on the MDS side to work better. - */ - -/* * Basic fh */ struct ceph_nfs_fh { @@ -32,22 +15,18 @@ struct ceph_nfs_fh { } __attribute__ ((packed)); /* - * Larger 'connectable' fh that includes parent ino and name hash. - * Use this whenever possible, as it works more reliably. + * Larger fh that includes parent ino. */ struct ceph_nfs_confh { u64 ino, parent_ino; - u32 parent_name_hash; } __attribute__ ((packed)); -static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, - int connectable) +static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, + struct inode *parent_inode) { int type; struct ceph_nfs_fh *fh = (void *)rawfh; struct ceph_nfs_confh *cfh = (void *)rawfh; - struct dentry *parent; - struct inode *inode = dentry->d_inode; int connected_handle_length = sizeof(*cfh)/4; int handle_length = sizeof(*fh)/4; @@ -55,41 +34,31 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; - if (*max_len >= connected_handle_length) { - dout("encode_fh %p connectable\n", dentry); - cfh->ino = ceph_ino(dentry->d_inode); - cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode, - dentry); + if (parent_inode && (*max_len < connected_handle_length)) { + *max_len = connected_handle_length; + return FILEID_INVALID; + } else if (*max_len < handle_length) { + *max_len = handle_length; + return FILEID_INVALID; + } + + if (parent_inode) { + dout("encode_fh %llx with parent %llx\n", + ceph_ino(inode), ceph_ino(parent_inode)); + cfh->ino = ceph_ino(inode); + cfh->parent_ino = ceph_ino(parent_inode); *max_len = connected_handle_length; - type = 2; - } else if (*max_len >= handle_length) { - if (connectable) { - *max_len = connected_handle_length; - type = 255; - } else { - dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(dentry->d_inode); - *max_len = handle_length; - type = 1; - } + type = FILEID_INO32_GEN_PARENT; } else { + dout("encode_fh %llx\n", ceph_ino(inode)); + fh->ino = ceph_ino(inode); *max_len = handle_length; - type = 255; + type = FILEID_INO32_GEN; } - spin_unlock(&dentry->d_lock); return type; } -/* - * convert regular fh to dentry - * - * FIXME: we should try harder by querying the mds for the ino. - */ -static struct dentry *__fh_to_dentry(struct super_block *sb, - struct ceph_nfs_fh *fh) +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; @@ -97,8 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, struct ceph_vino vino; int err; - dout("__fh_to_dentry %llx\n", fh->ino); - vino.ino = fh->ino; + vino.ino = ino; vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { @@ -122,132 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { - pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", - fh->ino, inode); iput(inode); return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { - iput(inode); + dput(dentry); return ERR_PTR(err); } - dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); + dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry); return dentry; } /* - * convert connectable fh to dentry + * convert regular fh to dentry */ -static struct dentry *__cfh_to_dentry(struct super_block *sb, - struct ceph_nfs_confh *cfh) +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) +{ + struct ceph_nfs_fh *fh = (void *)fid->raw; + + if (fh_type != FILEID_INO32_GEN && + fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*fh) / 4) + return NULL; + + dout("fh_to_dentry %llx\n", fh->ino); + return __fh_to_dentry(sb, fh->ino); +} + +static struct dentry *__get_parent(struct super_block *sb, + struct dentry *child, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_request *req; struct inode *inode; struct dentry *dentry; - struct ceph_vino vino; int err; - dout("__cfh_to_dentry %llx (%llx/%x)\n", - cfh->ino, cfh->parent_ino, cfh->parent_name_hash); - - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) { - struct ceph_mds_request *req; - - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, - USE_ANY_MDS); - if (IS_ERR(req)) - return ERR_CAST(req); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); - req->r_ino1 = vino; - req->r_ino2.ino = cfh->parent_ino; - req->r_ino2.snap = CEPH_NOSNAP; - req->r_path2 = kmalloc(16, GFP_NOFS); - snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); - req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, NULL, req); - inode = req->r_target_inode; - if (inode) - ihold(inode); - ceph_mdsc_put_request(req); - if (!inode) - return ERR_PTR(err ? err : -ESTALE); + if (child) { + req->r_inode = child->d_inode; + ihold(child->d_inode); + } else { + req->r_ino1 = (struct ceph_vino) { + .ino = ino, + .snap = CEPH_NOSNAP, + }; } + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ENOENT); dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { - pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); iput(inode); return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { - iput(inode); + dput(dentry); return ERR_PTR(err); } - dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); + dout("__get_parent ino %llx parent %p ino %llx.%llx\n", + child ? ceph_ino(child->d_inode) : ino, + dentry, ceph_vinop(inode)); return dentry; } -static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +static struct dentry *ceph_get_parent(struct dentry *child) { - if (fh_type == 1) - return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw); - else - return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw); + /* don't re-export snaps */ + if (ceph_snap(child->d_inode) != CEPH_NOSNAP) + return ERR_PTR(-EINVAL); + + dout("get_parent %p ino %llx.%llx\n", + child, ceph_vinop(child->d_inode)); + return __get_parent(child->d_sb, child, 0); } /* - * get parent, if possible. - * - * FIXME: we could do better by querying the mds to discover the - * parent. + * convert regular fh to parent */ static struct dentry *ceph_fh_to_parent(struct super_block *sb, - struct fid *fid, + struct fid *fid, int fh_len, int fh_type) { struct ceph_nfs_confh *cfh = (void *)fid->raw; - struct ceph_vino vino; - struct inode *inode; struct dentry *dentry; - int err; - if (fh_type == 1) - return ERR_PTR(-ESTALE); + if (fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*cfh) / 4) + return NULL; - pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, - cfh->parent_name_hash); + dout("fh_to_parent %llx\n", cfh->parent_ino); + dentry = __get_parent(sb, NULL, cfh->ino); + if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT) + dentry = __fh_to_dentry(sb, cfh->parent_ino); + return dentry; +} - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(-ESTALE); +static int ceph_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct ceph_mds_client *mdsc; + struct ceph_mds_request *req; + int err; - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); - iput(inode); - return dentry; - } - err = ceph_init_dentry(dentry); - if (err < 0) { - iput(inode); - return ERR_PTR(err); + mdsc = ceph_inode_to_client(child->d_inode)->mdsc; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, + USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + mutex_lock(&parent->d_inode->i_mutex); + + req->r_inode = child->d_inode; + ihold(child->d_inode); + req->r_ino2 = ceph_vino(parent->d_inode); + req->r_locked_dir = parent->d_inode; + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + + mutex_unlock(&parent->d_inode->i_mutex); + + if (!err) { + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + memcpy(name, rinfo->dname, rinfo->dname_len); + name[rinfo->dname_len] = 0; + dout("get_name %p ino %llx.%llx name %s\n", + child, ceph_vinop(child->d_inode), name); + } else { + dout("get_name %p ino %llx.%llx err %d\n", + child, ceph_vinop(child->d_inode), err); } - dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); - return dentry; + + ceph_mdsc_put_request(req); + return err; } const struct export_operations ceph_export_ops = { .encode_fh = ceph_encode_fh, .fh_to_dentry = ceph_fh_to_dentry, .fh_to_parent = ceph_fh_to_parent, + .get_parent = ceph_get_parent, + .get_name = ceph_get_name, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ed72428d9c7..302085100c2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -4,11 +4,15 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> +#include <linux/aio.h> +#include <linux/falloc.h> #include "super.h" #include "mds_client.h" +#include "cache.h" /* * Ceph file operations @@ -54,7 +58,6 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) req->r_fmode = ceph_flags_to_mode(flags); req->r_args.open.flags = cpu_to_le32(flags); req->r_args.open.mode = cpu_to_le32(create_mode); - req->r_args.open.preferred = cpu_to_le32(-1); out: return req; } @@ -67,9 +70,23 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; switch (inode->i_mode & S_IFMT) { case S_IFREG: + /* First file open request creates the cookie, we want to keep + * this cookie around for the filetime of the inode as not to + * have to worry about fscache register / revoke / operation + * races. + * + * Also, if we know the operation is going to invalidate data + * (non readonly) just nuke the cache right away. + */ + ceph_fscache_register_inode_cookie(mdsc->fsc, ci); + if ((fmode & CEPH_FILE_MODE_WR)) + ceph_fscache_invalidate(inode); case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); @@ -107,9 +124,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) } /* - * If the filp already has private_data, that means the file was - * already opened by intent during lookup, and we do nothing. - * * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the * MDS). We do, however, need to inform the MDS (asynchronously) @@ -183,6 +197,7 @@ int ceph_open(struct inode *inode, struct file *file) spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } + spin_unlock(&ci->i_ceph_lock); dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); @@ -193,8 +208,9 @@ int ceph_open(struct inode *inode, struct file *file) } req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; - if (flags & (O_CREAT|O_TRUNC)) + if (flags & O_CREAT) parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); iput(parent_inode); @@ -208,36 +224,34 @@ out: /* - * Do a lookup + open with a single request. - * - * If this succeeds, but some subsequent check in the vfs - * may_open() fails, the struct *file gets cleaned up (i.e. - * ceph_release gets called). So fear not! - */ -/* - * flags - * path_lookup_open -> LOOKUP_OPEN - * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE + * Do a lookup + open with a single request. If we get a non-existent + * file or symlink, return 1 so the VFS can retry. */ -struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, - struct nameidata *nd, int mode, - int locked_dir) +int ceph_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct file *file; struct ceph_mds_request *req; - struct dentry *ret; + struct dentry *dn; int err; - int flags = nd->intent.open.flags; - dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n", - dentry, dentry->d_name.len, dentry->d_name.name, flags, mode); + dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", + dir, dentry, dentry->d_name.len, dentry->d_name.name, + d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); + + if (dentry->d_name.len > NAME_MAX) + return -ENAMETOOLONG; + + err = ceph_init_dentry(dentry); + if (err < 0) + return err; /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) - return ERR_CAST(req); + return PTR_ERR(req); req->r_dentry = dget(dentry); req->r_num_caps = 2; if (flags & O_CREAT) { @@ -248,21 +262,41 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); - err = ceph_handle_snapdir(req, dentry, err); if (err) - goto out; - if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + goto out_err; + + err = ceph_handle_snapdir(req, dentry, err); + if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); + + if (d_unhashed(dentry)) { + dn = ceph_finish_lookup(req, dentry, err); + if (IS_ERR(dn)) + err = PTR_ERR(dn); + } else { + /* we were given a hashed negative dentry */ + dn = NULL; + } if (err) - goto out; - file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open); - if (IS_ERR(file)) - err = PTR_ERR(file); -out: - ret = ceph_finish_lookup(req, dentry, err); + goto out_err; + if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { + /* make vfs retry on splice, ENOENT, or symlink */ + dout("atomic_open finish_no_open on dn %p\n", dn); + err = finish_no_open(file, dn); + } else { + dout("atomic_open finish_open on dn %p\n", dn); + if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { + ceph_init_acl(dentry, dentry->d_inode, dir); + *opened |= FILE_CREATED; + } + err = finish_open(file, dentry, ceph_open, opened); + } +out_err: + if (!req->r_err && req->r_target_inode) + ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); ceph_mdsc_put_request(req); - dout("ceph_lookup_open result=%p\n", ret); - return ret; + dout("atomic_open result=%d\n", err); + return err; } int ceph_release(struct inode *inode, struct file *file) @@ -299,9 +333,9 @@ static int striped_read(struct inode *inode, { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - u64 pos, this_len; + u64 pos, this_len, left; int io_align, page_align; - int left, pages_left; + int pages_left; int read; struct page **page_pos; int ret; @@ -332,47 +366,40 @@ more: ret = 0; hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; - dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, + dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); - if (ret > 0) { - int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; - - if (read < pos - off) { - dout(" zero gap %llu to %llu\n", off + read, pos); - ceph_zero_page_vector_range(page_align + read, - pos - off - read, pages); + if (ret >= 0) { + int didpages; + if (was_short && (pos + ret < inode->i_size)) { + u64 tmp = min(this_len - ret, + inode->i_size - pos - ret); + dout(" zero gap %llu to %llu\n", + pos + ret, pos + ret + tmp); + ceph_zero_page_vector_range(page_align + read + ret, + tmp, pages); + ret += tmp; } + + didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; pos += ret; read = pos - off; left -= ret; page_pos += didpages; pages_left -= didpages; - /* hit stripe? */ - if (left && hit_stripe) + /* hit stripe and need continue*/ + if (left && hit_stripe && pos < inode->i_size) goto more; } - if (was_short) { + if (read > 0) { + ret = read; /* did we bounce off eof? */ if (pos + left > inode->i_size) *checkeof = 1; - - /* zero trailing bytes (inside i_size) */ - if (left > 0 && pos < inode->i_size) { - if (pos + left > inode->i_size) - left = inode->i_size - pos; - - dout("zero tail %d\n", left); - ceph_zero_page_vector_range(page_align + read, left, - pages); - read += left; - } } - if (ret >= 0) - ret = read; dout("striped_read returns %d\n", ret); return ret; } @@ -383,235 +410,361 @@ more: * * If the read spans object boundary, just do multiple reads. */ -static ssize_t ceph_sync_read(struct file *file, char __user *data, - unsigned len, loff_t *poff, int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, + int *checkeof) { - struct inode *inode = file->f_dentry->d_inode; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct page **pages; - u64 off = *poff; + u64 off = iocb->ki_pos; int num_pages, ret; + size_t len = iov_iter_count(i); - dout("sync_read on file %p %llu~%u %s\n", file, off, len, + dout("sync_read on file %p %llu~%u %s\n", file, off, + (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - - if (file->f_flags & O_DIRECT) { - num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, num_pages, true); - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } - if (IS_ERR(pages)) - return PTR_ERR(pages); - /* * flush any page cache pages in this range. this * will make concurrent normal and sync io slow, * but it will at least behave sensibly when they are * in sequence. */ - ret = filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait_range(inode->i_mapping, off, + off + len); if (ret < 0) - goto done; + return ret; - ret = striped_read(inode, off, len, pages, num_pages, checkeof, - file->f_flags & O_DIRECT, - (unsigned long)data & ~PAGE_MASK); + if (file->f_flags & O_DIRECT) { + while (iov_iter_count(i)) { + size_t start; + ssize_t n; - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); - if (ret >= 0) - *poff = off + ret; + n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start); + if (n < 0) + return n; -done: - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, true); - else + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; + + ret = striped_read(inode, off, n, + pages, num_pages, checkeof, + 1, start); + + ceph_put_page_vector(pages, num_pages, true); + + if (ret <= 0) + break; + off += ret; + iov_iter_advance(i, ret); + if (ret < n) + break; + } + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof, 0, 0); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + int copy = min_t(size_t, PAGE_SIZE, left); + l = copy_page_to_iter(pages[k++], 0, copy, i); + off += l; + left -= l; + if (l < copy) + break; + } + } ceph_release_page_vector(pages, num_pages); + } + + if (off > iocb->ki_pos) { + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + } + dout("sync_read result %d\n", ret); return ret; } /* - * Write commit callback, called if we requested both an ACK and - * ONDISK commit reply from the OSD. + * Write commit request unsafe callback, called to tell us when a + * request is unsafe (that is, in flight--has been handed to the + * messenger to send to its target osd). It is called again when + * we've received a response message indicating the request is + * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request + * is completed early (and unsuccessfully) due to a timeout or + * interrupt. + * + * This is used if we requested both an ACK and ONDISK commit reply + * from the OSD. */ -static void sync_write_commit(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_inode); - dout("sync_write_commit %p tid %llu\n", req, req->r_tid); - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, + unsafe ? "un" : ""); + if (unsafe) { + ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_item, + &ci->i_unsafe_writes); + spin_unlock(&ci->i_unsafe_lock); + } else { + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + } } + /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t *offset) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode = file->f_dentry->d_inode; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; int num_pages; - long long unsigned pos; - u64 len; int written = 0; int flags; - int do_sync = 0; int check_caps = 0; - int page_align, io_align; - unsigned long buf_align; int ret; struct timespec mtime = CURRENT_TIME; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); - if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u %s\n", file, *offset, - (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - - if (file->f_flags & O_APPEND) - pos = i_size_read(inode); - else - pos = *offset; + dout("sync_direct_write on file %p %lld~%u\n", file, pos, + (unsigned)count); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + left) >> PAGE_CACHE_SHIFT); + (pos + count) >> PAGE_CACHE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) - flags |= CEPH_OSD_FLAG_ACK; - else - do_sync = 1; - /* - * we may need to do multiple writes here if we span an object - * boundary. this isn't atomic, unfortunately. :( - */ -more: - io_align = pos & ~PAGE_MASK; - buf_align = (unsigned long)data & ~PAGE_MASK; - len = left; - if (file->f_flags & O_DIRECT) { - /* write from beginning of first page, regardless of - io alignment */ - page_align = (pos - io_align + buf_align) & ~PAGE_MASK; - num_pages = calc_pages_for((unsigned long)data, len); - } else { - page_align = pos & ~PAGE_MASK; - num_pages = calc_pages_for(pos, len); - } - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - ceph_vino(inode), pos, &len, - CEPH_OSD_OP_WRITE, flags, - ci->i_snap_realm->cached_context, - do_sync, - ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, 2, page_align); - if (!req) - return -ENOMEM; + while (iov_iter_count(from) > 0) { + u64 len = iov_iter_single_seg_count(from); + size_t start; + ssize_t n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, + 2,/*include a 'startsync' command*/ + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } - if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, false); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - goto out; + n = iov_iter_get_pages_alloc(from, &pages, len, &start); + if (unlikely(n < 0)) { + ret = n; + ceph_osdc_put_request(req); + break; } + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; /* * throw out any page cache pages in this range. this * may block. */ truncate_inode_pages_range(inode->i_mapping, pos, - (pos+len) | (PAGE_CACHE_SIZE-1)); - } else { + (pos+n) | (PAGE_CACHE_SIZE-1)); + osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, + false, false); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_put_page_vector(pages, num_pages, false); + + ceph_osdc_put_request(req); + if (ret) + break; + pos += n; + written += n; + iov_iter_advance(from, n); + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } + + if (ret != -EOLDSNAPC && written > 0) { + iocb->ki_pos = pos; + ret = written; + } + return ret; +} + + +/* + * Synchronous write, straight from __user pointer or user pages. + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; + struct ceph_osd_request *req; + struct page **pages; + u64 len; + int num_pages; + int written = 0; + int flags; + int check_caps = 0; + int ret; + struct timespec mtime = CURRENT_TIME; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); + + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK; + + while ((len = iov_iter_count(from)) > 0) { + size_t left; + int n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, 1, + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + /* + * write from beginning of first page, + * regardless of io alignment + */ + num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; } - ret = ceph_copy_user_to_page_vector(pages, data, pos, len); + + left = len; + for (n = 0; n < num_pages; n++) { + size_t plen = min_t(size_t, left, PAGE_SIZE); + ret = copy_page_from_iter(pages[n], 0, plen, from); + if (ret != plen) { + ret = -EFAULT; + break; + } + left -= ret; + } + if (ret < 0) { ceph_release_page_vector(pages, num_pages); goto out; } - if ((file->f_flags & O_SYNC) == 0) { - /* get a second commit callback */ - req->r_safe_callback = sync_write_commit; - req->r_own_pages = 1; - } - } - req->r_pages = pages; - req->r_num_pages = num_pages; - req->r_inode = inode; + /* get a second commit callback */ + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); - if (!ret) { - if (req->r_safe_callback) { - /* - * Add to inode unsafe list only after we - * start_request so that a tid has been assigned. - */ - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_item, - &ci->i_unsafe_writes); - spin_unlock(&ci->i_unsafe_lock); - ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); - } - - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - if (ret < 0 && req->r_safe_callback) { - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); - } - } + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + false, true); - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, false); - else if (file->f_flags & O_SYNC) - ceph_release_page_vector(pages, num_pages); + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); out: - ceph_osdc_put_request(req); - if (ret == 0) { - pos += len; - written += len; - left -= len; - data += written; - if (left) - goto more; + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } else + break; + } + if (ret != -EOLDSNAPC && written > 0) { ret = written; - *offset = pos; - if (pos > i_size_read(inode)) - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, - NULL); + iocb->ki_pos = pos; } return ret; } @@ -623,62 +776,69 @@ out: * * Hmm, the sync read case isn't actually async... should it be? */ -static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - loff_t *ppos = &iocb->ki_pos; - size_t len = iov->iov_len; - struct inode *inode = filp->f_dentry->d_inode; + size_t len = iocb->ki_nbytes; + struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - void __user *base = iov->iov_base; ssize_t ret; int want, got = 0; int checkeof = 0, read = 0; - dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", - inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: - __ceph_do_pending_vmtruncate(inode); + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); if (ret < 0) - goto out; - dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)len, - ceph_cap_string(got)); + return ret; if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || - (fi->flags & CEPH_F_SYNC)) + (fi->flags & CEPH_F_SYNC)) { + + dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + /* hmm, this isn't really async... */ - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); - else - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + ret = ceph_sync_read(iocb, to, &checkeof); + } else { + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); -out: + ret = generic_file_read_iter(iocb, to); + } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + int statret = ceph_do_getattr(inode, + CEPH_STAT_CAP_SIZE); /* hit EOF or hole? */ - if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); + if (statret == 0 && iocb->ki_pos < inode->i_size && + ret < len) { + dout("sync_read hit hole, ppos %lld < size %lld" + ", reading more\n", iocb->ki_pos, + inode->i_size); + + iov_iter_advance(to, ret); read += ret; - base += ret; len -= ret; checkeof = 0; goto again; } } + if (ret >= 0) ret += read; @@ -695,74 +855,100 @@ out: * * If we are near ENOSPC, write synchronously. */ -static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct ceph_file_info *fi = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; - loff_t endoff = pos + iov->iov_len; - int want, got = 0; - int ret, err; + ssize_t count = iov_iter_count(from), written = 0; + int err, want, got; + loff_t pos = iocb->ki_pos; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; + mutex_lock(&inode->i_mutex); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = file->f_mapping->backing_dev_info; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + iov_iter_truncate(from, count); + + err = file_remove_suid(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + retry_snap: - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) - return -ENOSPC; - __ceph_do_pending_vmtruncate(inode); - dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - inode->i_size); + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + err = -ENOSPC; + goto out; + } + + dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos, count, inode->i_size); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); - if (ret < 0) - goto out_put; + got = 0; + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); + if (err < 0) + goto out; - dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - ceph_cap_string(got)); + dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", + inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS) || - (fi->flags & CEPH_F_SYNC)) { - ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, - &iocb->ki_pos); + (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + struct iov_iter data; + mutex_unlock(&inode->i_mutex); + /* we might need to revert back to that point */ + data = *from; + if (file->f_flags & O_DIRECT) + written = ceph_sync_direct_write(iocb, &data); + else + written = ceph_sync_write(iocb, &data); + if (written == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u" + "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), + pos, (unsigned)count); + mutex_lock(&inode->i_mutex); + goto retry_snap; + } + if (written > 0) + iov_iter_advance(from, written); } else { + loff_t old_size = inode->i_size; /* - * buffered write; drop Fw early to avoid slow - * revocation if we get stuck on balance_dirty_pages + * No need to acquire the i_truncate_mutex. Because + * the MDS revokes Fwb caps before sending truncate + * message to us. We can't get Fwb cap while there + * are pending vmtruncate. So write and vmtruncate + * can not run at the same time */ - int dirty; - - spin_lock(&ci->i_ceph_lock); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&ci->i_ceph_lock); - ceph_put_cap_refs(ci, got); - - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - if ((ret >= 0 || ret == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); - if (err < 0) - ret = err; - } - - if (dirty) - __mark_inode_dirty(inode, dirty); - goto out; + written = generic_perform_write(file, from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; + if (inode->i_size > old_size) + ceph_fscache_update_objectsize(inode); + mutex_unlock(&inode->i_mutex); } - if (ret >= 0) { + if (written >= 0) { int dirty; spin_lock(&ci->i_ceph_lock); dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); @@ -771,34 +957,39 @@ retry_snap: __mark_inode_dirty(inode, dirty); } -out_put: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + inode, ceph_vinop(inode), pos, (unsigned)count, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); -out: - if (ret == -EOLDSNAPC) { - dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); - goto retry_snap; + if (written >= 0 && + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { + err = vfs_fsync_range(file, pos, pos + written - 1, 1); + if (err < 0) + written = err; } - return ret; + goto out_unlocked; + +out: + mutex_unlock(&inode->i_mutex); +out_unlocked: + current->backing_dev_info = NULL; + return written ? written : err; } /* * llseek. be sure to verify file size on SEEK_END. */ -static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) +static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int ret; mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode); - if (origin == SEEK_END || origin == SEEK_DATA || origin == SEEK_HOLE) { + if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); if (ret < 0) { offset = ret; @@ -806,7 +997,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) } } - switch (origin) { + switch (whence) { case SEEK_END: offset += inode->i_size; break; @@ -838,37 +1029,224 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) break; } - if (offset < 0 || offset > inode->i_sb->s_maxbytes) { - offset = -EINVAL; + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +static inline void ceph_zero_partial_page( + struct inode *inode, loff_t offset, unsigned size) +{ + struct page *page; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, + loff_t length) +{ + loff_t nearly = round_up(offset, PAGE_CACHE_SIZE); + if (offset < nearly) { + loff_t size = nearly - offset; + if (length < size) + size = length; + ceph_zero_partial_page(inode, offset, size); + offset += size; + length -= size; + } + if (length >= PAGE_CACHE_SIZE) { + loff_t size = round_down(length, PAGE_CACHE_SIZE); + truncate_pagecache_range(inode, offset, offset + size - 1); + offset += size; + length -= size; + } + if (length) + ceph_zero_partial_page(inode, offset, length); +} + +static int ceph_zero_partial_object(struct inode *inode, + loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + loff_t zero = 0; + int op; + + if (!length) { + op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; + length = &zero; + } else { + op = CEPH_OSD_OP_ZERO; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), + offset, length, + 1, op, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + NULL, 0, 0, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); goto out; } - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; + ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, + &inode->i_mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; } + ceph_osdc_put_request(req); out: + return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + s32 stripe_unit = ceph_file_layout_su(ci->i_layout); + s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + s32 object_size = ceph_file_layout_object_size(ci->i_layout); + u64 object_set_size = object_size * stripe_count; + u64 nearly, t; + + /* round offset up to next period boundary */ + nearly = offset + object_set_size - 1; + t = nearly; + nearly -= do_div(t, object_set_size); + + while (length && offset < nearly) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + while (length >= object_set_size) { + int i; + loff_t pos = offset; + for (i = 0; i < stripe_count; ++i) { + ret = ceph_zero_partial_object(inode, pos, NULL); + if (ret < 0) + return ret; + pos += stripe_unit; + } + offset += object_set_size; + length -= object_set_size; + } + while (length) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + return ret; +} + +static long ceph_fallocate(struct file *file, int mode, + loff_t offset, loff_t length) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + int want, got = 0; + int dirty; + int ret = 0; + loff_t endoff = 0; + loff_t size; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto unlock; + } + + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { + ret = -ENOSPC; + goto unlock; + } + + size = i_size_read(inode); + if (!(mode & FALLOC_FL_KEEP_SIZE)) + endoff = offset + length; + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); + if (ret < 0) + goto unlock; + + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (offset < size) + ceph_zero_pagecache_range(inode, offset, length); + ret = ceph_zero_objects(inode, offset, length); + } else if (endoff > size) { + truncate_pagecache_range(inode, size, -1); + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, NULL); + } + + if (!ret) { + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + ceph_put_cap_refs(ci, got); +unlock: mutex_unlock(&inode->i_mutex); - return offset; + return ret; } const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, .llseek = ceph_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = ceph_aio_read, - .aio_write = ceph_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = ceph_read_iter, + .write_iter = ceph_write_iter, .mmap = ceph_mmap, .fsync = ceph_fsync, .lock = ceph_lock, .flock = ceph_flock, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, + .fallocate = ceph_fallocate, }; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2c489378b4c..04c89c266ce 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -9,9 +9,12 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> +#include <linux/posix_acl.h> +#include <linux/random.h> #include "super.h" #include "mds_client.h" +#include "cache.h" #include <linux/ceph/decode.h> /* @@ -94,6 +97,8 @@ const struct inode_operations ceph_file_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; @@ -175,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) * specified, copy the frag delegation info to the caller if * it is present. */ -u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, - struct ceph_inode_frag *pfrag, - int *found) +static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) { u32 t = ceph_frag_make(0, 0); struct ceph_inode_frag *frag; @@ -187,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, if (found) *found = 0; - mutex_lock(&ci->i_fragtree_mutex); while (1) { WARN_ON(!ceph_frag_contains_value(t, v)); frag = __ceph_find_frag(ci, t); @@ -216,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, } dout("choose_frag(%x) = %x\n", v, t); - mutex_unlock(&ci->i_fragtree_mutex); return t; } +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) +{ + u32 ret; + mutex_lock(&ci->i_fragtree_mutex); + ret = __ceph_choose_frag(ci, v, pfrag, found); + mutex_unlock(&ci->i_fragtree_mutex); + return ret; +} + /* * Process dirfrag (delegation) info from the mds. Include leaf * fragment in tree ONLY if ndist > 0. Otherwise, only @@ -233,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode, u32 id = le32_to_cpu(dirinfo->frag); int mds = le32_to_cpu(dirinfo->auth); int ndist = le32_to_cpu(dirinfo->ndist); + int diri_auth = -1; int i; int err = 0; + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + diri_auth = ci->i_auth_cap->mds; + spin_unlock(&ci->i_ceph_lock); + mutex_lock(&ci->i_fragtree_mutex); - if (ndist == 0) { + if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. */ frag = __ceph_find_frag(ci, id); if (!frag) @@ -282,6 +300,75 @@ out: return err; } +static int ceph_fill_fragtree(struct inode *inode, + struct ceph_frag_tree_head *fragtree, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + struct rb_node *rb_node; + int i; + u32 id, nsplits; + bool update = false; + + mutex_lock(&ci->i_fragtree_mutex); + nsplits = le32_to_cpu(fragtree->nsplits); + if (nsplits) { + i = prandom_u32() % nsplits; + id = le32_to_cpu(fragtree->splits[i].frag); + if (!__ceph_find_frag(ci, id)) + update = true; + } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { + rb_node = rb_first(&ci->i_fragtree); + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) + update = true; + } + if (!update && dirinfo) { + id = le32_to_cpu(dirinfo->frag); + if (id != __ceph_choose_frag(ci, id, NULL, NULL)) + update = true; + } + if (!update) + goto out_unlock; + + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); + rb_node = rb_first(&ci->i_fragtree); + for (i = 0; i < nsplits; i++) { + id = le32_to_cpu(fragtree->splits[i].frag); + frag = NULL; + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (ceph_frag_compare(frag->frag, id) >= 0) { + if (frag->frag != id) + frag = NULL; + else + rb_node = rb_next(rb_node); + break; + } + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + frag = NULL; + } + if (!frag) { + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) + continue; + } + frag->split_by = le32_to_cpu(fragtree->splits[i].by); + dout(" frag %x split by %d\n", frag->frag, frag->split_by); + } + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } +out_unlock: + mutex_unlock(&ci->i_fragtree_mutex); + return 0; +} /* * initialize a newly allocated inode. @@ -302,7 +389,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; - ci->i_release_count = 0; + atomic_set(&ci->i_release_count, 1); + atomic_set(&ci->i_complete_count, 0); ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); @@ -333,9 +421,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; INIT_LIST_HEAD(&ci->i_cap_delay_list); - ci->i_cap_exporting_mds = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_issued = 0; INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; @@ -343,6 +428,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) for (i = 0; i < CEPH_FILE_MODE_NUM; i++) ci->i_nr_by_mode[i] = 0; + mutex_init(&ci->i_truncate_mutex); ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; @@ -376,6 +462,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); + ceph_fscache_inode_init(ci); + return &ci->vfs_inode; } @@ -395,11 +483,13 @@ void ceph_destroy_inode(struct inode *inode) dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + ceph_fscache_unregister_inode_cookie(ci); + ceph_queue_caps_release(inode); /* * we may still have a snap_realm reference if there are stray - * caps in i_cap_exporting_issued or i_snap_caps. + * caps in i_snap_caps. */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = @@ -429,6 +519,15 @@ void ceph_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, ceph_i_callback); } +int ceph_drop_inode(struct inode *inode) +{ + /* + * Positve dentry and corresponding inode are always accompanied + * in MDS reply. So no need to keep inode in the cache after + * dropping all its aliases. + */ + return 1; +} /* * Helpers to fill in size, ctime, mtime, and atime. We have to be @@ -454,16 +553,20 @@ int ceph_fill_file_size(struct inode *inode, int issued, dout("truncate_seq %u -> %u\n", ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; + + /* the MDS should have revoked these caps */ + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR | + CEPH_CAP_FILE_LAZYIO)); /* * If we hold relevant caps, or in the case where we're * not the only client referencing this file and we * don't hold those caps, then we need to check whether * the file is either opened or mmaped */ - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| - CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| - CEPH_CAP_FILE_EXCL| - CEPH_CAP_FILE_LAZYIO)) || + if ((issued & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_BUFFER)) || mapping_mapped(inode->i_mapping) || __ceph_caps_file_wanted(ci)) { ci->i_truncate_pending++; @@ -477,6 +580,10 @@ int ceph_fill_file_size(struct inode *inode, int issued, truncate_size); ci->i_truncate_size = truncate_size; } + + if (queue_trunc) + ceph_fscache_invalidate(inode); + return queue_trunc; } @@ -557,21 +664,26 @@ static int fill_inode(struct inode *inode, unsigned long ttl_from, int cap_fmode, struct ceph_cap_reservation *caps_reservation) { + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); - int i; - int issued = 0, implemented; - int updating_inode = 0; + int issued = 0, implemented, new_issued; struct timespec mtime, atime, ctime; - u32 nsplits; struct ceph_buffer *xattr_blob = NULL; + struct ceph_cap *new_cap = NULL; int err = 0; - int queue_trunc = 0; + bool wake = false; + bool queue_trunc = false; + bool new_version = false; dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); + /* prealloc new cap struct */ + if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP) + new_cap = ceph_get_cap(mdsc, caps_reservation); + /* * prealloc xattr data, if it looks like we'll need it. only * if len > 4 (meaning there are actually xattrs; the first 4 @@ -597,52 +709,60 @@ static int fill_inode(struct inode *inode, * 3 2 skip * 3 3 update */ - if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) >= le64_to_cpu(info->version)) - goto no_change; - - updating_inode = 1; + if (ci->i_version == 0 || + ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + le64_to_cpu(info->version) > (ci->i_version & ~1))) + new_version = true; + issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); + new_issued = ~issued & le32_to_cpu(info->cap.caps); /* update inode */ ci->i_version = le64_to_cpu(info->version); inode->i_version++; inode->i_rdev = le32_to_cpu(info->rdev); + inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(info->mode); - inode->i_uid = le32_to_cpu(info->uid); - inode->i_gid = le32_to_cpu(info->gid); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - inode->i_uid, inode->i_gid); + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && + (issued & CEPH_CAP_LINK_EXCL) == 0) set_nlink(inode, le32_to_cpu(info->nlink)); - /* be careful with mtime, atime, size */ - ceph_decode_timespec(&atime, &info->atime); - ceph_decode_timespec(&mtime, &info->mtime); - ceph_decode_timespec(&ctime, &info->ctime); - queue_trunc = ceph_fill_file_size(inode, issued, - le32_to_cpu(info->truncate_seq), - le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); - ceph_fill_file_time(inode, issued, - le32_to_cpu(info->time_warp_seq), - &ctime, &mtime, &atime); - - /* only update max_size on auth cap */ - if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && - ci->i_max_size != le64_to_cpu(info->max_size)) { - dout("max_size %lld -> %llu\n", ci->i_max_size, - le64_to_cpu(info->max_size)); - ci->i_max_size = le64_to_cpu(info->max_size); + if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { + /* be careful with mtime, atime, size */ + ceph_decode_timespec(&atime, &info->atime); + ceph_decode_timespec(&mtime, &info->mtime); + ceph_decode_timespec(&ctime, &info->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); } - ci->i_layout = info->layout; - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + if (new_version || + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + ci->i_layout = info->layout; + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + le64_to_cpu(info->size)); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + } /* xattrs */ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ @@ -655,6 +775,7 @@ static int fill_inode(struct inode *inode, memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + ceph_forget_all_cached_acls(inode); xattr_blob = NULL; } @@ -677,18 +798,19 @@ static int fill_inode(struct inode *inode, case S_IFLNK: inode->i_op = &ceph_symlink_iops; if (!ci->i_symlink) { - int symlen = iinfo->symlink_len; + u32 symlen = iinfo->symlink_len; char *sym; - BUG_ON(symlen != inode->i_size); spin_unlock(&ci->i_ceph_lock); + err = -EINVAL; + if (WARN_ON(symlen != inode->i_size)) + goto out; + err = -ENOMEM; - sym = kmalloc(symlen+1, GFP_NOFS); + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); if (!sym) goto out; - memcpy(sym, iinfo->symlink, symlen); - sym[symlen] = 0; spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) @@ -715,27 +837,16 @@ static int fill_inode(struct inode *inode, ceph_vinop(inode), inode->i_mode); } -no_change: - spin_unlock(&ci->i_ceph_lock); - - /* queue truncate if we saw i_size decrease */ - if (queue_trunc) - ceph_queue_vmtruncate(inode); - - /* populate frag tree */ - /* FIXME: move me up, if/when version reflects fragtree changes */ - nsplits = le32_to_cpu(info->fragtree.nsplits); - mutex_lock(&ci->i_fragtree_mutex); - for (i = 0; i < nsplits; i++) { - u32 id = le32_to_cpu(info->fragtree.splits[i].frag); - struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); - - if (IS_ERR(frag)) - continue; - frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); - dout(" frag %x split by %d\n", frag->frag, frag->split_by); + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + dout(" marking %p complete (empty)\n", inode); + __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); } - mutex_unlock(&ci->i_fragtree_mutex); /* were we issued a capability? */ if (info->cap.caps) { @@ -748,43 +859,41 @@ no_change: le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), le64_to_cpu(info->cap.realm), - info->cap.flags, - caps_reservation); + info->cap.flags, &new_cap); + wake = true; } else { - spin_lock(&ci->i_ceph_lock); dout(" %p got snap_caps %s\n", inode, ceph_cap_string(le32_to_cpu(info->cap.caps))); ci->i_snap_caps |= le32_to_cpu(info->cap.caps); if (cap_fmode >= 0) __ceph_get_fmode(ci, cap_fmode); - spin_unlock(&ci->i_ceph_lock); } } else if (cap_fmode >= 0) { - pr_warning("mds issued no caps on %llx.%llx\n", + pr_warn("mds issued no caps on %llx.%llx\n", ceph_vinop(inode)); __ceph_get_fmode(ci, cap_fmode); } + spin_unlock(&ci->i_ceph_lock); - /* set dir completion flag? */ - if (S_ISDIR(inode->i_mode) && - updating_inode && /* didn't jump to no_change */ - ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - !ceph_dir_test_complete(inode)) { - dout(" marking %p complete (empty)\n", inode); - ceph_dir_set_complete(inode); - ci->i_max_offset = 2; - } + if (wake) + wake_up_all(&ci->i_cap_wq); + + /* queue truncate if we saw i_size decrease */ + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + /* populate frag tree */ + if (S_ISDIR(inode->i_mode)) + ceph_fill_fragtree(inode, &info->fragtree, dirinfo); /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); err = 0; - out: + if (new_cap) + ceph_put_cap(mdsc, new_cap); if (xattr_blob) ceph_buffer_put(xattr_blob); return err; @@ -841,41 +950,6 @@ out_unlock: } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - * - * Always called under directory's i_mutex. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - ci = ceph_inode(inode); - di = ceph_dentry(dn); - - spin_lock(&ci->i_ceph_lock); - if (!ceph_dir_test_complete(inode)) { - spin_unlock(&ci->i_ceph_lock); - return; - } - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&ci->i_ceph_lock); - - spin_lock(&dir->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &dir->d_subdirs); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dir->d_lock); -} - -/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -884,7 +958,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) * the caller) if we fail. */ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash, bool set_offset) + bool *prehash) { struct dentry *realdn; @@ -904,8 +978,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } else if (realdn) { dout("dn %p (%d) spliced with %p (%d) " "inode %p ino %llx.%llx\n", - dn, dn->d_count, - realdn, realdn->d_count, + dn, d_count(dn), + realdn, d_count(realdn), realdn->d_inode, ceph_vinop(realdn->d_inode)); dput(dn); dn = realdn; @@ -916,8 +990,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - if (set_offset) - ceph_set_dentry_offset(dn); out: return dn; } @@ -938,10 +1010,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); - int i = 0; int err = 0; dout("fill_trace %p is_dentry %d is_target %d\n", req, @@ -991,11 +1061,87 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, if (rinfo->head->is_dentry) { struct inode *dir = req->r_locked_dir; - err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, - session, req->r_request_started, -1, - &req->r_caps_reservation); - if (err < 0) - return err; + if (dir) { + err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, + session, req->r_request_started, -1, + &req->r_caps_reservation); + if (err < 0) + goto done; + } else { + WARN_ON_ONCE(1); + } + + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { + struct qstr dname; + struct dentry *dn, *parent; + + BUG_ON(!rinfo->head->is_target); + BUG_ON(req->r_dentry); + + parent = d_find_any_alias(dir); + BUG_ON(!parent); + + dname.name = rinfo->dname; + dname.len = rinfo->dname_len; + dname.hash = full_name_hash(dname.name, dname.len); + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dput(parent); + err = -ENOMEM; + goto done; + } + err = ceph_init_dentry(dn); + if (err < 0) { + dput(dn); + dput(parent); + goto done; + } + } else if (dn->d_inode && + (ceph_ino(dn->d_inode) != vino.ino || + ceph_snap(dn->d_inode) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, dn->d_inode); + d_delete(dn); + dput(dn); + goto retry_lookup; + } + + req->r_dentry = dn; + dput(parent); + } + } + + if (rinfo->head->is_target) { + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + req->r_target_inode = in; + + err = fill_inode(in, &rinfo->targeti, NULL, + session, req->r_request_started, + (!req->r_aborted && rinfo->head->result == 0) ? + req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + goto done; + } } /* @@ -1003,6 +1149,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, * will have trouble splicing in the virtual snapdir later */ if (rinfo->head->is_dentry && !req->r_aborted && + req->r_locked_dir && (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, fsc->mount_options->snapdir_name, req->r_dentry->d_name.len))) { @@ -1036,6 +1183,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, @@ -1055,18 +1205,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* - * d_move() puts the renamed dentry at the end of - * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when holding - * D_COMPLETE. - */ - ceph_set_dentry_offset(req->r_old_dentry); - dout("dn %p gets new offset %lld\n", req->r_old_dentry, + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(dir); + ceph_dir_clear_complete(olddir); + + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); dn = req->r_old_dentry; /* use old_dentry */ - in = dn->d_inode; } /* null dentry? */ @@ -1088,107 +1234,87 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, } /* attach proper inode */ - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = dn->d_inode; - if (!in) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_trace bad get_inode " - "%llx.%llx\n", vino.ino, vino.snap); - err = PTR_ERR(in); - d_delete(dn); - goto done; - } - dn = splice_dentry(dn, in, &have_lease, true); + if (!dn->d_inode) { + ceph_dir_clear_complete(dir); + ihold(in); + dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - ihold(in); - } else if (ceph_ino(in) == vino.ino && - ceph_snap(in) == vino.snap) { - ihold(in); - } else { + } else if (dn->d_inode && dn->d_inode != in) { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, in, ceph_ino(in), ceph_snap(in), - vino.ino, vino.snap); + dn, dn->d_inode, ceph_vinop(dn->d_inode), + ceph_vinop(in)); have_lease = false; - in = NULL; } if (have_lease) update_dentry_lease(dn, rinfo->dlease, session, req->r_request_started); dout(" final dn %p\n", dn); - i++; - } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP) { + } else if (!req->r_aborted && + (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_locked_dir; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); - BUG_ON(!req->r_locked_dir); - BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_inode get_inode badness %llx.%llx\n", - vino.ino, vino.snap); - err = PTR_ERR(in); - d_delete(dn); - goto done; - } + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); - dn = splice_dentry(dn, in, NULL, true); + ceph_dir_clear_complete(dir); + ihold(in); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - ihold(in); - rinfo->head->is_dentry = 1; /* fool notrace handlers */ } +done: + dout("fill_trace done err=%d\n", err); + return err; +} - if (rinfo->head->is_target) { - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +/* + * Prepopulate our cache with readdir results, leases, etc. + */ +static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + int i, err = 0; - if (in == NULL || ceph_ino(in) != vino.ino || - ceph_snap(in) != vino.snap) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - err = PTR_ERR(in); - goto done; - } - } - req->r_target_inode = in; + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_vino vino; + struct inode *in; + int rc; - err = fill_inode(in, - &rinfo->targeti, NULL, - session, req->r_request_started, - (le32_to_cpu(rinfo->head->result) == 0) ? - req->r_fmode : -1, - &req->r_caps_reservation); - if (err < 0) { - pr_err("fill_inode badness %p %llx.%llx\n", - in, ceph_vinop(in)); - goto done; + vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); + vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + + in = ceph_get_inode(req->r_dentry->d_sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + dout("new_inode badness got %d\n", err); + continue; + } + rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation); + if (rc < 0) { + pr_err("fill_inode badness on %p got %d\n", in, rc); + err = rc; + continue; } } -done: - dout("fill_trace done err=%d\n", err); return err; } -/* - * Prepopulate our cache with readdir results, leases, etc. - */ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { @@ -1197,11 +1323,26 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct qstr dname; struct dentry *dn; struct inode *in; - int err = 0, i; + int err = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - u64 frag = le32_to_cpu(rhead->args.readdir.frag); struct ceph_dentry_info *di; + u64 r_readdir_offset = req->r_readdir_offset; + u32 frag = le32_to_cpu(rhead->args.readdir.frag); + + if (rinfo->dir_dir && + le32_to_cpu(rinfo->dir_dir->frag) != frag) { + dout("readdir_prepopulate got new frag %x -> %x\n", + frag, le32_to_cpu(rinfo->dir_dir->frag)); + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + r_readdir_offset = 2; + else + r_readdir_offset = 0; + } + + if (req->r_aborted) + return readdir_prepopulate_inodes_only(req, session); if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { snapdir = ceph_get_snapdir(parent->d_inode); @@ -1215,6 +1356,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); } + /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_vino vino; @@ -1239,9 +1381,10 @@ retry_lookup: err = -ENOMEM; goto out; } - err = ceph_init_dentry(dn); - if (err < 0) { + ret = ceph_init_dentry(dn); + if (ret < 0) { dput(dn); + err = ret; goto out; } } else if (dn->d_inode && @@ -1261,9 +1404,6 @@ retry_lookup: spin_unlock(&parent->d_lock); } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); - /* inode */ if (dn->d_inode) { in = dn->d_inode; @@ -1271,31 +1411,44 @@ retry_lookup: in = ceph_get_inode(parent->d_sb, vino); if (IS_ERR(in)) { dout("new_inode badness\n"); - d_delete(dn); + d_drop(dn); dput(dn); err = PTR_ERR(in); goto out; } - dn = splice_dentry(dn, in, NULL, false); - if (IS_ERR(dn)) - dn = NULL; } if (fill_inode(in, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); + if (!dn->d_inode) + iput(in); + d_drop(dn); goto next_item; } - if (dn) - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, - req->r_request_started); + + if (!dn->d_inode) { + dn = splice_dentry(dn, in, NULL); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + dn = NULL; + goto next_item; + } + } + + di = dn->d_fsdata; + di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); next_item: if (dn) dput(dn); } - req->r_did_prepopulate = true; + if (err == 0) + req->r_did_prepopulate = true; out: if (snapdir) { @@ -1379,18 +1532,21 @@ static void ceph_invalidate_work(struct work_struct *work) u32 orig_gen; int check = 0; + mutex_lock(&ci->i_truncate_mutex); spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { - /* nevermind! */ + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); goto out; } orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(&inode->i_data, 0); + truncate_pagecache(inode, 0); spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && @@ -1403,18 +1559,20 @@ static void ceph_invalidate_work(struct work_struct *work) dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", inode, orig_gen, ci->i_rdcache_gen, ci->i_rdcache_revoking); + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; } spin_unlock(&ci->i_ceph_lock); - + mutex_unlock(&ci->i_truncate_mutex); +out: if (check) ceph_check_caps(ci, 0, NULL); -out: iput(inode); } /* - * called by trunc_wq; take i_mutex ourselves + * called by trunc_wq; * * We also truncate in a separate thread as well. */ @@ -1425,9 +1583,7 @@ static void ceph_vmtruncate_work(struct work_struct *work) struct inode *inode = &ci->vfs_inode; dout("vmtruncate_work %p\n", inode); - mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - mutex_unlock(&inode->i_mutex); iput(inode); } @@ -1440,6 +1596,7 @@ void ceph_queue_vmtruncate(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); ihold(inode); + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); @@ -1451,8 +1608,6 @@ void ceph_queue_vmtruncate(struct inode *inode) } /* - * called with i_mutex held. - * * Make sure any pending truncation is applied before doing anything * that may depend on it. */ @@ -1460,13 +1615,15 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); u64 to; - int wrbuffer_refs, wake = 0; + int wrbuffer_refs, finish = 0; + mutex_lock(&ci->i_truncate_mutex); retry: spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { dout("__do_pending_vmtruncate %p none pending\n", inode); spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); return; } @@ -1483,26 +1640,33 @@ retry: goto retry; } + /* there should be no reader or writer */ + WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); + to = ci->i_truncate_size; wrbuffer_refs = ci->i_wrbuffer_ref; dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(inode->i_mapping, to); + truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); - ci->i_truncate_pending--; - if (ci->i_truncate_pending == 0) - wake = 1; + if (to == ci->i_truncate_size) { + ci->i_truncate_pending = 0; + finish = 1; + } spin_unlock(&ci->i_ceph_lock); + if (!finish) + goto retry; + + mutex_unlock(&ci->i_truncate_mutex); if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); - if (wake) - wake_up_all(&ci->i_cap_wq); -} + wake_up_all(&ci->i_cap_wq); +} /* * symlinks @@ -1517,6 +1681,12 @@ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) static const struct inode_operations ceph_symlink_iops = { .readlink = generic_readlink, .follow_link = ceph_sym_follow_link, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, }; /* @@ -1526,7 +1696,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; @@ -1539,8 +1708,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - __ceph_do_pending_vmtruncate(inode); - err = inode_change_ok(inode, attr); if (err != 0) return err; @@ -1556,26 +1723,30 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_UID) { dout("setattr %p uid %d -> %d\n", inode, - inode->i_uid, attr->ia_uid); + from_kuid(&init_user_ns, inode->i_uid), + from_kuid(&init_user_ns, attr->ia_uid)); if (issued & CEPH_CAP_AUTH_EXCL) { inode->i_uid = attr->ia_uid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - attr->ia_uid != inode->i_uid) { - req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid); + !uid_eq(attr->ia_uid, inode->i_uid)) { + req->r_args.setattr.uid = cpu_to_le32( + from_kuid(&init_user_ns, attr->ia_uid)); mask |= CEPH_SETATTR_UID; release |= CEPH_CAP_AUTH_SHARED; } } if (ia_valid & ATTR_GID) { dout("setattr %p gid %d -> %d\n", inode, - inode->i_gid, attr->ia_gid); + from_kgid(&init_user_ns, inode->i_gid), + from_kgid(&init_user_ns, attr->ia_gid)); if (issued & CEPH_CAP_AUTH_EXCL) { inode->i_gid = attr->ia_gid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - attr->ia_gid != inode->i_gid) { - req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid); + !gid_eq(attr->ia_gid, inode->i_gid)) { + req->r_args.setattr.gid = cpu_to_le32( + from_kgid(&init_user_ns, attr->ia_gid)); mask |= CEPH_SETATTR_GID; release |= CEPH_CAP_AUTH_SHARED; } @@ -1588,6 +1759,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || attr->ia_mode != inode->i_mode) { + inode->i_mode = attr->ia_mode; req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); mask |= CEPH_SETATTR_MODE; release |= CEPH_CAP_AUTH_SHARED; @@ -1703,24 +1875,30 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); + if (ia_valid & ATTR_MODE) { + err = posix_acl_chmod(inode, attr->ia_mode); + if (err) + goto out_put; + } + if (mask) { req->r_inode = inode; ihold(inode); req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - __ceph_do_pending_vmtruncate(inode); + if (mask & CEPH_SETATTR_SIZE) + __ceph_do_pending_vmtruncate(inode); return err; out: spin_unlock(&ci->i_ceph_lock); +out_put: ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 790914a598d..a822a6e5829 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,9 +1,8 @@ +#include <linux/ceph/ceph_debug.h> #include <linux/in.h> #include "super.h" #include "mds_client.h" -#include <linux/ceph/ceph_debug.h> - #include "ioctl.h" @@ -16,18 +15,17 @@ */ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) { - struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_inode_info *ci = ceph_inode(file_inode(file)); struct ceph_ioctl_layout l; int err; - err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); if (!err) { l.stripe_unit = ceph_file_layout_su(ci->i_layout); l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); l.object_size = ceph_file_layout_object_size(ci->i_layout); l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); - l.preferred_osd = - (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); + l.preferred_osd = (s32)-1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; } @@ -35,61 +33,75 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) return err; } +static long __validate_layout(struct ceph_mds_client *mdsc, + struct ceph_ioctl_layout *l) +{ + int i, err; + + /* validate striping parameters */ + if ((l->object_size & ~PAGE_MASK) || + (l->stripe_unit & ~PAGE_MASK) || + (l->stripe_unit != 0 && + ((unsigned)l->object_size % (unsigned)l->stripe_unit))) + return -EINVAL; + + /* make sure it's a valid data pool */ + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + + return 0; +} + static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { - struct inode *inode = file->f_dentry->d_inode; - struct inode *parent_inode; + struct inode *inode = file_inode(file); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; - struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_inode_info *ci = ceph_inode(file_inode(file)); struct ceph_ioctl_layout nl; - int err, i; + int err; if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; /* validate changed params against current layout */ - err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); - if (!err) { - nl.stripe_unit = ceph_file_layout_su(ci->i_layout); - nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); - nl.object_size = ceph_file_layout_object_size(ci->i_layout); - nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); - nl.preferred_osd = - (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); - } else + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); + if (err) return err; + memset(&nl, 0, sizeof(nl)); if (l.stripe_count) nl.stripe_count = l.stripe_count; + else + nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); if (l.stripe_unit) nl.stripe_unit = l.stripe_unit; + else + nl.stripe_unit = ceph_file_layout_su(ci->i_layout); if (l.object_size) nl.object_size = l.object_size; + else + nl.object_size = ceph_file_layout_object_size(ci->i_layout); if (l.data_pool) nl.data_pool = l.data_pool; - if (l.preferred_osd) - nl.preferred_osd = l.preferred_osd; + else + nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout); - if ((nl.object_size & ~PAGE_MASK) || - (nl.stripe_unit & ~PAGE_MASK) || - ((unsigned)nl.object_size % (unsigned)nl.stripe_unit)) - return -EINVAL; + /* this is obsolete, and always -1 */ + nl.preferred_osd = le64_to_cpu(-1); - /* make sure it's a valid data pool */ - if (l.data_pool > 0) { - mutex_lock(&mdsc->mutex); - err = -EINVAL; - for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) - if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { - err = 0; - break; - } - mutex_unlock(&mdsc->mutex); - if (err) - return err; - } + err = __validate_layout(mdsc, &nl); + if (err) + return err; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT, USE_AUTH_MDS); @@ -97,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -106,12 +120,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) req->r_args.setlayout.layout.fl_object_size = cpu_to_le32(l.object_size); req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); - req->r_args.setlayout.layout.fl_pg_preferred = - cpu_to_le32(l.preferred_osd); - parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; } @@ -124,36 +134,19 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) */ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_mds_request *req; struct ceph_ioctl_layout l; - int err, i; + int err; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; /* copy and validate */ if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; - if ((l.object_size & ~PAGE_MASK) || - (l.stripe_unit & ~PAGE_MASK) || - !l.stripe_unit || - (l.object_size && - (unsigned)l.object_size % (unsigned)l.stripe_unit)) - return -EINVAL; - - /* make sure it's a valid data pool */ - if (l.data_pool > 0) { - mutex_lock(&mdsc->mutex); - err = -EINVAL; - for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) - if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { - err = 0; - break; - } - mutex_unlock(&mdsc->mutex); - if (err) - return err; - } + err = __validate_layout(mdsc, &l); + if (err) + return err; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT, USE_AUTH_MDS); @@ -162,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); @@ -171,8 +165,6 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) cpu_to_le32(l.object_size); req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); - req->r_args.setlayout.layout.fl_pg_preferred = - cpu_to_le32(l.preferred_osd); err = ceph_mdsc_do_request(mdsc, inode, req); ceph_mdsc_put_request(req); @@ -186,22 +178,29 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) { struct ceph_ioctl_dataloc dl; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + struct ceph_object_id oid; u64 len = 1, olen; u64 tmp; - struct ceph_object_layout ol; struct ceph_pg pgid; + int r; /* copy and validate */ if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; down_read(&osdc->map_sem); - ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, - &dl.object_no, &dl.object_offset, &olen); + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, + &dl.object_no, &dl.object_offset, + &olen); + if (r < 0) { + up_read(&osdc->map_sem); + return -EIO; + } dl.file_offset -= dl.object_offset; dl.object_size = ceph_file_layout_object_size(ci->i_layout); dl.block_size = ceph_file_layout_su(ci->i_layout); @@ -212,10 +211,16 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, - osdc->osdmap); - pgid = ol.ol_pgid; + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + ceph_oid_set_name(&oid, dl.object_name); + + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + if (r < 0) { + up_read(&osdc->map_sem); + return r; + } + dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = @@ -237,7 +242,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) static long ceph_ioctl_lazyio(struct file *file) { struct ceph_file_info *fi = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index be4a6048733..c77028afb1e 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -34,6 +34,8 @@ struct ceph_ioctl_layout { __u64 stripe_unit, stripe_count, object_size; __u64 data_pool; + + /* obsolete. new values ignored, always return -1 */ __s64 preferred_osd; }; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 80576d05d68..fbc39c47bac 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -2,29 +2,50 @@ #include <linux/file.h> #include <linux/namei.h> +#include <linux/random.h> #include "super.h" #include "mds_client.h" #include <linux/ceph/pagelist.h> +static u64 lock_secret; + +static inline u64 secure_addr(void *addr) +{ + u64 v = lock_secret ^ (u64)(unsigned long)addr; + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. (old code uses + * both 'owner' and 'pid') + */ + v |= (1ULL << 63); + return v; +} + +void __init ceph_flock_init(void) +{ + get_random_bytes(&lock_secret, sizeof(lock_secret)); +} + /** * Implement fcntl and flock locking functions. */ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, int cmd, u8 wait, struct file_lock *fl) { - struct inode *inode = file->f_dentry->d_inode; - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct inode *inode = file_inode(file); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; u64 length = 0; + u64 owner; req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) @@ -32,25 +53,24 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, else length = fl->fl_end - fl->fl_start + 1; - dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type: %d", (int)lock_type, - (int)operation, (u64)fl->fl_pid, fl->fl_start, - length, wait, fl->fl_type); + owner = secure_addr(fl->fl_owner); + + dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " + "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, + (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, + wait, fl->fl_type); req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.owner = cpu_to_le64(owner); req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); - /* This should be adjusted, but I'm not sure if - namespaces actually get id numbers*/ - req->r_args.filelock_change.pid_namespace = - cpu_to_le64((u64)(unsigned long)fl->fl_nspid); req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; err = ceph_mdsc_do_request(mdsc, inode, req); - if ( operation == CEPH_MDS_OP_GETFILELOCK){ + if (operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; @@ -87,14 +107,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) u8 wait = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_lock, fl_pid:%d", fl->fl_pid); + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_lock, fl_owner: %p", fl->fl_owner); /* set wait bit as appropriate, then make command as Ceph expects it*/ - if (F_SETLKW == cmd) - wait = 1; - if (F_GETLK == cmd) + if (IS_GETLK(cmd)) op = CEPH_MDS_OP_GETFILELOCK; + else if (IS_SETLKW(cmd)) + wait = 1; if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; @@ -105,7 +130,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); if (!err) { - if ( op != CEPH_MDS_OP_GETFILELOCK ){ + if (op != CEPH_MDS_OP_GETFILELOCK) { dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { @@ -131,20 +156,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { u8 lock_cmd; int err; - u8 wait = 1; - - fl->fl_nspid = get_pid(task_tgid(current)); - dout("ceph_flock, fl_pid:%d", fl->fl_pid); - - /* set wait bit, then clear it out of cmd*/ - if (cmd & LOCK_NB) - wait = 0; - cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); - /* set command sequence that Ceph wants to see: - shared lock, exclusive lock, or unlock */ - if (LOCK_SH == cmd) + u8 wait = 0; + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_flock, fl_file: %p", fl->fl_file); + + if (IS_SETLKW(cmd)) + wait = 1; + + if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; - else if (LOCK_EX == cmd) + else if (F_WRLCK == fl->fl_type) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; @@ -169,7 +196,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) } /** - * Must be called with BKL already held. Fills in the passed + * Must be called with lock_flocks() already held. Fills in the passed * counter variables, so you can prepare pagelist metadata before calling * ceph_encode_locks. */ @@ -191,27 +218,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) } /** - * Encode the flock and fcntl locks for the given inode into the pagelist. - * Format is: #fcntl locks, sequential fcntl locks, #flock locks, - * sequential flock locks. - * Must be called with lock_flocks() already held. - * If we encounter more of a specific lock type than expected, - * we return the value 1. + * Encode the flock and fcntl locks for the given inode into the ceph_filelock + * array. Must be called with inode->i_lock already held. + * If we encounter more of a specific lock type than expected, return -ENOSPC. */ -int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, - int num_fcntl_locks, int num_flock_locks) +int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct ceph_filelock cephlock; int err = 0; int seen_fcntl = 0; int seen_flock = 0; + int l = 0; dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); - err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32)); - if (err) - goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_POSIX) { ++seen_fcntl; @@ -219,19 +242,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &cephlock); + err = lock_to_ceph_filelock(lock, &flocks[l]); if (err) goto fail; - err = ceph_pagelist_append(pagelist, &cephlock, - sizeof(struct ceph_filelock)); + ++l; } - if (err) - goto fail; } - - err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32)); - if (err) - goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_FLOCK) { ++seen_flock; @@ -239,19 +255,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &cephlock); + err = lock_to_ceph_filelock(lock, &flocks[l]); if (err) goto fail; - err = ceph_pagelist_append(pagelist, &cephlock, - sizeof(struct ceph_filelock)); + ++l; } - if (err) - goto fail; } fail: return err; } +/** + * Copy the encoded flock and fcntl locks into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Returns zero on success. + */ +int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + int err = 0; + __le32 nlocks; + + nlocks = cpu_to_le32(num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + + nlocks = cpu_to_le32(num_flock_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, + &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); +out_fail: + return err; +} + /* * Given a pointer to a lock, convert it to a ceph filelock */ @@ -259,13 +307,11 @@ int lock_to_ceph_filelock(struct file_lock *lock, struct ceph_filelock *cephlock) { int err = 0; - cephlock->start = cpu_to_le64(lock->fl_start); cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64(lock->fl_pid); - cephlock->pid_namespace = - cpu_to_le64((u64)(unsigned long)lock->fl_nspid); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); switch (lock->fl_type) { case F_RDLCK: diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 866e8d7ca37..92a2548278f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3,6 +3,7 @@ #include <linux/fs.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -10,6 +11,7 @@ #include "super.h" #include "mds_client.h" +#include <linux/ceph/ceph_features.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/pagelist.h> @@ -42,6 +44,7 @@ */ struct ceph_reconnect_state { + int nr_caps; struct ceph_pagelist *pagelist; bool flock; }; @@ -61,7 +64,7 @@ static const struct ceph_connection_operations mds_con_ops; */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, - int features) + u64 features) { int err = -EIO; @@ -96,7 +99,7 @@ bad: */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { int err; @@ -143,7 +146,7 @@ out_bad: */ static int parse_reply_info_dir(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { u32 num, i = 0; int err; @@ -163,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end, if (num == 0) goto done; - /* alloc large array */ - info->dir_nr = num; - info->dir_in = kcalloc(num, sizeof(*info->dir_in) + - sizeof(*info->dir_dname) + - sizeof(*info->dir_dname_len) + - sizeof(*info->dir_dlease), - GFP_NOFS); - if (info->dir_in == NULL) { - err = -ENOMEM; - goto out_bad; - } + BUG_ON(!info->dir_in); info->dir_dname = (void *)(info->dir_in + num); info->dir_dname_len = (void *)(info->dir_dname + num); info->dir_dlease = (void *)(info->dir_dname_len + num); + if ((unsigned long)(info->dir_dlease + num) > + (unsigned long)info->dir_in + info->dir_buf_size) { + pr_err("dir contents are larger than expected\n"); + WARN_ON(1); + goto bad; + } + info->dir_nr = num; while (num) { /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); @@ -215,7 +215,7 @@ out_bad: */ static int parse_reply_info_filelock(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; @@ -232,16 +232,45 @@ bad: } /* + * parse create results + */ +static int parse_reply_info_create(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { + if (*p == end) { + info->has_create_ino = false; + } else { + info->has_create_ino = true; + info->ino = ceph_decode_64(p); + } + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* * parse extra results */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else + else if (info->head->op == CEPH_MDS_OP_READDIR || + info->head->op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_dir(p, end, info, features); + else if (info->head->op == CEPH_MDS_OP_CREATE) + return parse_reply_info_create(p, end, info, features); + else + return -EIO; } /* @@ -249,7 +278,7 @@ static int parse_reply_info_extra(void **p, void *end, */ static int parse_reply_info(struct ceph_msg *msg, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { void *p, *end; u32 len; @@ -296,7 +325,9 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - kfree(info->dir_in); + if (!info->dir_in) + return; + free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); } @@ -334,10 +365,10 @@ void ceph_put_mds_session(struct ceph_mds_session *s) dout("mdsc put_session %p %d -> %d\n", s, atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { - if (s->s_authorizer) - s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer( - s->s_mdsc->fsc->client->monc.auth, - s->s_authorizer); + if (s->s_auth.authorizer) + ceph_auth_destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_auth.authorizer); kfree(s); } } @@ -384,6 +415,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; + if (mds >= mdsc->mdsmap->m_max_mds) + return ERR_PTR(-EINVAL); + s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) return ERR_PTR(-ENOMEM); @@ -394,15 +428,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_seq = 0; mutex_init(&s->s_mutex); - ceph_con_init(mdsc->fsc->client->msgr, &s->s_con); - s->s_con.private = s; - s->s_con.ops = &mds_con_ops; - s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; - s->s_con.peer_name.num = cpu_to_le64(mds); + ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); spin_lock_init(&s->s_gen_ttl_lock); s->s_cap_gen = 0; - s->s_cap_ttl = 0; + s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); s->s_renew_requested = 0; @@ -414,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); s->s_num_cap_releases = 0; + s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); INIT_LIST_HEAD(&s->s_cap_releases_done); @@ -440,7 +471,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, mdsc->sessions[mds] = s; atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ - ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); return s; @@ -480,12 +512,11 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); + destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_reply) { + if (req->r_reply) ceph_msg_put(req->r_reply); - destroy_reply_info(&req->r_reply_info); - } if (req->r_inode) { ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); @@ -496,7 +527,9 @@ void ceph_mdsc_release_request(struct kref *kref) iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); - if (req->r_old_dentry) { + if (req->r_old_dentry) + dput(req->r_old_dentry); + if (req->r_old_dentry_dir) { /* * track (and drop pins for) r_old_dentry_dir * separately, since r_old_dentry's d_parent may have @@ -505,7 +538,6 @@ void ceph_mdsc_release_request(struct kref *kref) */ ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); - dput(req->r_old_dentry); iput(req->r_old_dentry_dir); } kfree(req->r_path1); @@ -612,6 +644,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc, req->r_unsafe_dir = NULL; } + complete_all(&req->r_safe_completion); + ceph_mdsc_put_request(req); } @@ -679,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc, struct dentry *dn = get_nonsnap_parent(parent); inode = dn->d_inode; dout("__choose_mds using nonsnap parent %p\n", inode); - } else if (req->r_dentry->d_inode) { + } else { /* dentry target */ inode = req->r_dentry->d_inode; - } else { - /* dir + name */ - inode = dir; - hash = ceph_dentry_hash(dir, req->r_dentry); - is_hash = true; + if (!inode || mode == USE_AUTH_MDS) { + /* dir + name */ + inode = dir; + hash = ceph_dentry_hash(dir, req->r_dentry); + is_hash = true; + } } } @@ -812,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc, * * called under mdsc->mutex */ +static struct ceph_mds_session * +__open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, target); + if (!session) { + session = register_session(mdsc, target); + if (IS_ERR(session)) + return session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + + return session; +} + +struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + dout("open_export_target_session to mds%d\n", target); + + mutex_lock(&mdsc->mutex); + session = __open_export_target_session(mdsc, target); + mutex_unlock(&mdsc->mutex); + + return session; +} + static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_info *mi; struct ceph_mds_session *ts; int i, mds = session->s_mds; - int target; if (mds >= mdsc->mdsmap->m_max_mds) return; + mi = &mdsc->mdsmap->m_info[mds]; dout("open_export_target_sessions for mds%d (%d targets)\n", session->s_mds, mi->num_export_targets); for (i = 0; i < mi->num_export_targets; i++) { - target = mi->export_targets[i]; - ts = __ceph_lookup_mds_session(mdsc, target); - if (!ts) { - ts = register_session(mdsc, target); - if (IS_ERR(ts)) - return; - } - if (session->s_state == CEPH_MDS_SESSION_NEW || - session->s_state == CEPH_MDS_SESSION_CLOSING) - __open_session(mdsc, session); - else - dout(" mds%d target mds%d %p is %s\n", session->s_mds, - i, ts, session_state_name(ts->s_state)); - ceph_put_mds_session(ts); + ts = __open_export_target_session(mdsc, mi->export_targets[i]); + if (!IS_ERR(ts)) + ceph_put_mds_session(ts); } } @@ -956,7 +1012,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, false); if (!__ceph_is_any_real_caps(ci)) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; @@ -1001,6 +1057,37 @@ static void remove_session_caps(struct ceph_mds_session *session) { dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, NULL); + + spin_lock(&session->s_cap_lock); + if (session->s_nr_caps > 0) { + struct super_block *sb = session->s_mdsc->fsc->sb; + struct inode *inode; + struct ceph_cap *cap, *prev = NULL; + struct ceph_vino vino; + /* + * iterate_session_caps() skips inodes that are being + * deleted, we need to wait until deletions are complete. + * __wait_on_freeing_inode() is designed for the job, + * but it is not exported, so use lookup inode function + * to access it. + */ + while (!list_empty(&session->s_caps)) { + cap = list_entry(session->s_caps.next, + struct ceph_cap, session_caps); + if (cap == prev) + break; + prev = cap; + vino = cap->ci->i_vino; + spin_unlock(&session->s_cap_lock); + + inode = ceph_find_inode(sb, vino); + iput(inode); + + spin_lock(&session->s_cap_lock); + } + } + spin_unlock(&session->s_cap_lock); + BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); cleanup_cap_releases(session); @@ -1071,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, return 0; } +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, u64 seq) +{ + struct ceph_msg *msg; + + dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", + session->s_mds, session_state_name(session->s_state), seq); + msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + + /* * Note new cap ttl, and any transition from stale -> not stale (fresh?). * @@ -1083,8 +1185,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc, int wake = 0; spin_lock(&session->s_cap_lock); - was_stale = is_renew && (session->s_cap_ttl == 0 || - time_after_eq(jiffies, session->s_cap_ttl)); + was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl); session->s_cap_ttl = session->s_renew_requested + mdsc->mdsmap->m_session_timeout*HZ; @@ -1150,7 +1251,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_mds_session *session = arg; struct ceph_inode_info *ci = ceph_inode(inode); - int used, oissued, mine; + int used, wanted, oissued, mine; if (session->s_trim_caps <= 0) return -1; @@ -1158,20 +1259,25 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) spin_lock(&ci->i_ceph_lock); mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); + wanted = __ceph_caps_file_wanted(ci); oissued = __ceph_caps_issued_other(ci, cap); - dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), - ceph_cap_string(used)); - if (ci->i_dirty_caps) - goto out; /* dirty caps */ - if ((used & ~oissued) & mine) + ceph_cap_string(used), ceph_cap_string(wanted)); + if (cap == ci->i_auth_cap) { + if (ci->i_dirty_caps | ci->i_flushing_caps) + goto out; + if ((used | wanted) & CEPH_CAP_ANY_WR) + goto out; + } + if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, true); } else { /* try to drop referring dentries */ spin_unlock(&ci->i_ceph_lock); @@ -1205,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc, trim_caps - session->s_trim_caps); session->s_trim_caps = 0; } + + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); return 0; } @@ -1354,16 +1463,19 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, unsigned num; dout("discard_cap_releases mds%d\n", session->s_mds); - spin_lock(&session->s_cap_lock); - /* zero out the in-progress message */ - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); - head->num = cpu_to_le32(0); - session->s_num_cap_releases += num; + if (!list_empty(&session->s_cap_releases)) { + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", + session->s_mds, msg, num); + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + session->s_num_cap_releases += num; + } /* requeue completed messages */ while (!list_empty(&session->s_cap_releases_done)) { @@ -1380,14 +1492,49 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, msg->front.iov_len = sizeof(*head); list_add(&msg->list_head, &session->s_cap_releases); } - - spin_unlock(&session->s_cap_lock); } /* * requests */ +int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; + size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + + sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + int order, num_entries; + + spin_lock(&ci->i_ceph_lock); + num_entries = ci->i_files + ci->i_subdirs; + spin_unlock(&ci->i_ceph_lock); + num_entries = max(num_entries, 1); + num_entries = min(num_entries, opt->max_readdir); + + order = get_order(size * num_entries); + while (order >= 0) { + rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, + order); + if (rinfo->dir_in) + break; + order--; + } + if (!rinfo->dir_in) + return -ENOMEM; + + num_entries = (PAGE_SIZE << order) / size; + num_entries = min(num_entries, opt->max_readdir); + + rinfo->dir_buf_size = PAGE_SIZE << order; + req->r_num_caps = num_entries + 1; + req->r_args.readdir.max_entries = cpu_to_le32(num_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); + return 0; +} + /* * Create an mds request. */ @@ -1411,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); + req->r_stamp = CURRENT_TIME; + req->r_op = op; req->r_direct_mode = mode; return req; @@ -1473,11 +1622,6 @@ retry: else len += 1 + temp->d_name.len; temp = temp->d_parent; - if (temp == NULL) { - rcu_read_unlock(); - pr_err("build_path corrupt dentry %p\n", dentry); - return ERR_PTR(-EINVAL); - } } rcu_read_unlock(); if (len) @@ -1514,12 +1658,6 @@ retry: if (pos) path[--pos] = '/'; temp = temp->d_parent; - if (temp == NULL) { - rcu_read_unlock(); - pr_err("build_path corrupt dentry\n"); - kfree(path); - return ERR_PTR(-EINVAL); - } } rcu_read_unlock(); if (pos != 0 || read_seqretry(&rename_lock, seq)) { @@ -1536,7 +1674,7 @@ retry: *base = ceph_ino(temp->d_inode); *plen = len; dout("build_path on %p %d built %llx '%.*s'\n", - dentry, dentry->d_count, *base, len, path); + dentry, d_count(dentry), *base, len, path); return path; } @@ -1604,7 +1742,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, } else if (rpath || rino) { *ino = rino; *ppath = rpath; - *pathlen = strlen(rpath); + *pathlen = rpath ? strlen(rpath) : 0; dout(" path %.*s\n", *pathlen, rpath); } @@ -1647,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, } len = sizeof(*head) + - pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + + sizeof(struct timespec); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -1664,6 +1803,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } + msg->hdr.version = 2; msg->hdr.tid = cpu_to_le64(req->r_tid); head = msg->front.iov_base; @@ -1672,8 +1812,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(req->r_uid); - head->caller_gid = cpu_to_le32(req->r_gid); + head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); + head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); @@ -1700,12 +1840,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); head->num_releases = cpu_to_le16(releases); + /* time stamp */ + ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - msg->pages = req->r_pages; - msg->nr_pages = req->r_num_pages; + if (req->r_data_len) { + /* outbound data set only by ceph_sync_setxattr() */ + BUG_ON(!req->r_pages); + ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0); + } + msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); @@ -1819,8 +1966,11 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = -EAGAIN; - if (req->r_err || req->r_got_result) + if (req->r_err || req->r_got_result) { + if (req->r_aborted) + __unregister_request(mdsc, req); goto out; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { @@ -1890,10 +2040,16 @@ finish: static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head) { - struct ceph_mds_request *req, *nreq; + struct ceph_mds_request *req; + LIST_HEAD(tmp_list); + + list_splice_init(head, &tmp_list); - list_for_each_entry_safe(req, nreq, head, r_wait) { + while (!list_empty(&tmp_list)) { + req = list_entry(tmp_list.next, + struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); + dout(" wake request %p tid %llu\n", req, req->r_tid); __do_request(mdsc, req); } } @@ -1947,7 +2103,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); if (req->r_locked_dir) ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); - if (req->r_old_dentry) + if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); @@ -2007,20 +2163,16 @@ out: } /* - * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir's completeness, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { struct inode *inode = req->r_locked_dir; - struct ceph_inode_info *ci = ceph_inode(inode); - dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode); - spin_lock(&ci->i_ceph_lock); - ceph_dir_clear_complete(inode); - ci->i_release_count++; - spin_unlock(&ci->i_ceph_lock); + dout("invalidate_dir_request %p (complete, lease(s))\n", inode); + ceph_dir_clear_complete(inode); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); if (req->r_old_dentry) @@ -2073,13 +2225,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* dup? */ if ((req->r_got_unsafe && !head->safe) || (req->r_got_safe && head->safe)) { - pr_warning("got a dup %s reply on %llu from mds%d\n", + pr_warn("got a dup %s reply on %llu from mds%d\n", head->safe ? "safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } if (req->r_got_safe && !head->safe) { - pr_warning("got unsafe after safe on %llu from mds%d\n", + pr_warn("got unsafe after safe on %llu from mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); goto out; @@ -2096,26 +2248,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) */ if (result == -ESTALE) { dout("got ESTALE on request %llu", req->r_tid); - if (!req->r_inode) { - /* do nothing; not an authority problem */ - } else if (req->r_direct_mode != USE_AUTH_MDS) { + if (req->r_direct_mode != USE_AUTH_MDS) { dout("not using auth, setting for that now"); req->r_direct_mode = USE_AUTH_MDS; __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; } else { - struct ceph_inode_info *ci = ceph_inode(req->r_inode); - struct ceph_cap *cap = NULL; - - if (req->r_session) - cap = ceph_get_cap_for_mds(ci, - req->r_session->s_mds); - - dout("already using auth"); - if ((!cap || cap != ci->i_auth_cap) || - (cap->mseq != req->r_sent_on_mseq)) { - dout("but cap changed, so resending"); + int mds = __choose_mds(mdsc, req); + if (mds >= 0 && mds != req->r_session->s_mds) { + dout("but auth changed, so resending"); __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; @@ -2128,7 +2270,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); - complete_all(&req->r_safe_completion); if (req->r_got_unsafe) { /* @@ -2179,8 +2320,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { - if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && - rinfo->dir_nr) + if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || + req->r_op == CEPH_MDS_OP_LSSNAP)) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } @@ -2332,7 +2473,7 @@ static void handle_session(struct ceph_mds_session *session, session->s_mds); spin_lock(&session->s_gen_ttl_lock); session->s_cap_gen++; - session->s_cap_ttl = 0; + session->s_cap_ttl = jiffies - 1; spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; @@ -2341,6 +2482,10 @@ static void handle_session(struct ceph_mds_session *session, trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); break; + case CEPH_SESSION_FLUSHMSG: + send_flushmsg_ack(mdsc, session, seq); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); @@ -2430,6 +2575,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ + cap->mseq = 0; /* and migrate_seq */ + cap->cap_gen = cap->session->s_cap_gen; if (recon_state->flock) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); @@ -2454,39 +2601,46 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->flock) { int num_fcntl_locks, num_flock_locks; - struct ceph_pagelist_cursor trunc_point; - - ceph_pagelist_set_cursor(pagelist, &trunc_point); - do { - lock_flocks(); - ceph_count_locks(inode, &num_fcntl_locks, - &num_flock_locks); - rec.v2.flock_len = (2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); - unlock_flocks(); - - /* pre-alloc pagelist */ - ceph_pagelist_truncate(pagelist, &trunc_point); - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_pagelist_reserve(pagelist, - rec.v2.flock_len); - - /* encode locks */ - if (!err) { - lock_flocks(); - err = ceph_encode_locks(inode, - pagelist, - num_fcntl_locks, - num_flock_locks); - unlock_flocks(); - } - } while (err == -ENOSPC); + struct ceph_filelock *flocks; + +encode_again: + spin_lock(&inode->i_lock); + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + spin_unlock(&inode->i_lock); + flocks = kmalloc((num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + spin_lock(&inode->i_lock); + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + spin_unlock(&inode->i_lock); + if (err) { + kfree(flocks); + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + /* + * number of encoded locks is stable, so copy to pagelist + */ + rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, + num_flock_locks); + kfree(flocks); } else { err = ceph_pagelist_append(pagelist, &rec, reclen); } + recon_state->nr_caps++; out_free: kfree(path); out_dput: @@ -2514,6 +2668,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct rb_node *p; int mds = session->s_mds; int err = -ENOMEM; + int s_nr_caps; struct ceph_pagelist *pagelist; struct ceph_reconnect_state recon_state; @@ -2532,7 +2687,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; + ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, + CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); /* replay unsafe requests */ @@ -2543,20 +2700,38 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, dout("session %p state %s\n", session, session_state_name(session->s_state)); + spin_lock(&session->s_gen_ttl_lock); + session->s_cap_gen++; + spin_unlock(&session->s_gen_ttl_lock); + + spin_lock(&session->s_cap_lock); + /* + * notify __ceph_remove_cap() that we are composing cap reconnect. + * If a cap get released before being added to the cap reconnect, + * __ceph_remove_cap() should skip queuing cap release. + */ + session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ discard_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); /* traverse this session's caps */ - err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); + s_nr_caps = session->s_nr_caps; + err = ceph_pagelist_encode_32(pagelist, s_nr_caps); if (err) goto fail; + recon_state.nr_caps = 0; recon_state.pagelist = pagelist; recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; err = iterate_session_caps(session, encode_caps_cb, &recon_state); if (err < 0) goto fail; + spin_lock(&session->s_cap_lock); + session->s_cap_reconnect = 0; + spin_unlock(&session->s_cap_lock); + /* * snaprealms. we provide mds with the ino, seq (version), and * parent for all of our realms. If the mds has any newer info, @@ -2577,11 +2752,20 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail; } - reply->pagelist = pagelist; if (recon_state.flock) reply->hdr.version = cpu_to_le16(2); + + /* raced with cap release? */ + if (s_nr_caps != recon_state.nr_caps) { + struct page *page = list_first_entry(&pagelist->head, + struct page, lru); + __le32 *addr = kmap_atomic(page); + *addr = cpu_to_le32(recon_state.nr_caps); + kunmap_atomic(addr); + } + reply->hdr.data_len = cpu_to_le32(pagelist->length); - reply->nr_pages = calc_pages_for(0, pagelist->length); + ceph_msg_data_add_pagelist(reply, pagelist); ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -2637,7 +2821,8 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", session_state_name(s->s_state)); - if (memcmp(ceph_mdsmap_get_addr(oldmap, i), + if (i >= newmap->m_max_mds || + memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { if (s->s_state == CEPH_MDS_SESSION_OPENING) { @@ -3006,8 +3191,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) + if (mdsc->mdsmap == NULL) { + kfree(mdsc); return -ENOMEM; + } init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); @@ -3345,7 +3532,7 @@ static void peer_reset(struct ceph_connection *con) struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - pr_warning("mds%d closed our session\n", s->s_mds); + pr_warn("mds%d closed our session\n", s->s_mds); send_mds_reconnect(mdsc, s); } @@ -3396,39 +3583,37 @@ out: /* * authentication */ -static int get_authorizer(struct ceph_connection *con, - void **buf, int *len, int *proto, - void **reply_buf, int *reply_len, int force_new) + +/* + * Note: returned pointer is the address of a structure that's + * managed separately. Caller must *not* attempt to free it. + */ +static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, + int *proto, int force_new) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - int ret = 0; - - if (force_new && s->s_authorizer) { - ac->ops->destroy_authorizer(ac, s->s_authorizer); - s->s_authorizer = NULL; - } - if (s->s_authorizer == NULL) { - if (ac->ops->create_authorizer) { - ret = ac->ops->create_authorizer( - ac, CEPH_ENTITY_TYPE_MDS, - &s->s_authorizer, - &s->s_authorizer_buf, - &s->s_authorizer_buf_len, - &s->s_authorizer_reply_buf, - &s->s_authorizer_reply_buf_len); - if (ret) - return ret; - } - } + struct ceph_auth_handshake *auth = &s->s_auth; + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(ac, auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); + } *proto = ac->protocol; - *buf = s->s_authorizer_buf; - *len = s->s_authorizer_buf_len; - *reply_buf = s->s_authorizer_reply_buf; - *reply_len = s->s_authorizer_reply_buf_len; - return 0; + + return auth; } @@ -3438,7 +3623,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) @@ -3447,12 +3632,32 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - if (ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } +static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) +{ + struct ceph_msg *msg; + int type = (int) le16_to_cpu(hdr->type); + int front_len = (int) le32_to_cpu(hdr->front_len); + + if (con->in_msg) + return con->in_msg; + + *skip = 0; + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return NULL; + } + + return msg; +} + static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, @@ -3461,6 +3666,7 @@ static const struct ceph_connection_operations mds_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, + .alloc_msg = mds_alloc_msg, }; /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 8c7c04ebb59..e00737cf523 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -11,6 +11,7 @@ #include <linux/ceph/types.h> #include <linux/ceph/messenger.h> #include <linux/ceph/mdsmap.h> +#include <linux/ceph/auth.h> /* * Some lock dependencies: @@ -66,6 +67,7 @@ struct ceph_mds_reply_info_parsed { /* for readdir results */ struct { struct ceph_mds_reply_dirfrag *dir_dir; + size_t dir_buf_size; int dir_nr; char **dir_dname; u32 *dir_dname_len; @@ -73,6 +75,12 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_info_in *dir_in; u8 dir_complete, dir_end; }; + + /* for create results */ + struct { + bool has_create_ino; + u64 ino; + }; }; /* encoded blob describing snapshot contexts for certain @@ -113,9 +121,7 @@ struct ceph_mds_session { struct ceph_connection s_con; - struct ceph_authorizer *s_authorizer; - void *s_authorizer_buf, *s_authorizer_reply_buf; - size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; + struct ceph_auth_handshake s_auth; /* protected by s_gen_ttl_lock */ spinlock_t s_gen_ttl_lock; @@ -127,6 +133,7 @@ struct ceph_mds_session { struct list_head s_caps; /* all caps issued by this session */ int s_nr_caps, s_trim_caps; int s_num_cap_releases; + int s_cap_reconnect; struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; @@ -185,8 +192,9 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ - uid_t r_uid; - gid_t r_gid; + kuid_t r_uid; + kgid_t r_gid; + struct timespec r_stamp; /* for choosing which mds to send this request to */ int r_direct_mode; @@ -340,7 +348,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct dentry *dn); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); - +extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir); extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, @@ -377,6 +386,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg); +extern struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 73b7d44e8a3..261531e55e9 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -20,7 +20,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) { int n = 0; int i; - char r; + + /* special case for one mds */ + if (1 == m->m_max_mds && m->m_info[0].state > 0) + return 0; /* count */ for (i = 0; i < m->m_max_mds; i++) @@ -30,8 +33,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) return -1; /* pick */ - get_random_bytes(&r, 1); - n = r % n; + n = prandom_u32() % n; i = 0; for (i = 0; n > 0; i++, n--) while (m->m_info[i].state <= 0) @@ -59,6 +61,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) return ERR_PTR(-ENOMEM); ceph_decode_16_safe(p, end, version, bad); + if (version > 3) { + pr_warn("got mdsmap version %d > 3, failing", version); + goto bad; + } ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); m->m_epoch = ceph_decode_32(p); @@ -86,6 +92,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; + struct ceph_mds_info *info; ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); global_id = ceph_decode_64(p); @@ -120,37 +127,40 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds >= 0 && mds < m->m_max_mds && state > 0) { - m->m_info[mds].global_id = global_id; - m->m_info[mds].state = state; - m->m_info[mds].addr = addr; - m->m_info[mds].laggy = - (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); - m->m_info[mds].num_export_targets = num_export_targets; - if (num_export_targets) { - m->m_info[mds].export_targets = - kcalloc(num_export_targets, sizeof(u32), - GFP_NOFS); - for (j = 0; j < num_export_targets; j++) - m->m_info[mds].export_targets[j] = - ceph_decode_32(&pexport_targets); - } else { - m->m_info[mds].export_targets = NULL; - } + + if (mds < 0 || mds >= m->m_max_mds || state <= 0) + continue; + + info = &m->m_info[mds]; + info->global_id = global_id; + info->state = state; + info->addr = addr; + info->laggy = (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); + info->num_export_targets = num_export_targets; + if (num_export_targets) { + info->export_targets = kcalloc(num_export_targets, + sizeof(u32), GFP_NOFS); + if (info->export_targets == NULL) + goto badmem; + for (j = 0; j < num_export_targets; j++) + info->export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + info->export_targets = NULL; } } /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); m->m_num_data_pg_pools = n; - m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); + m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); if (!m->m_data_pg_pools) goto badmem; - ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); + ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); for (i = 0; i < n; i++) - m->m_data_pg_pools[i] = ceph_decode_32(p); - m->m_cas_pg_pool = ceph_decode_32(p); + m->m_data_pg_pools[i] = ceph_decode_64(p); + m->m_cas_pg_pool = ceph_decode_64(p); /* ok, we don't care about the rest. */ dout("mdsmap_decode success epoch %u\n", m->m_epoch); @@ -164,7 +174,7 @@ bad: DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); ceph_mdsmap_destroy(m); - return ERR_PTR(-EINVAL); + return ERR_PTR(err); } void ceph_mdsmap_destroy(struct ceph_mdsmap *m) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index a559c80f127..f01645a2775 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -296,8 +296,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; int err = 0; - int i; - int num = realm->num_prior_parent_snaps + realm->num_snaps; + u32 num = realm->num_prior_parent_snaps + realm->num_snaps; /* * build parent context, if it hasn't been built. @@ -321,27 +320,28 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { - dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" " (unchanged)\n", realm->ino, realm, realm->cached_context, realm->cached_context->seq, - realm->cached_context->num_snaps); + (unsigned int) realm->cached_context->num_snaps); return 0; } /* alloc new snap context */ err = -ENOMEM; - if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc)) + if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; - snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); + snapc = ceph_create_snap_context(num, GFP_NOFS); if (!snapc) goto fail; - atomic_set(&snapc->nref, 1); /* build (reverse sorted) snap vector */ num = 0; snapc->seq = realm->seq; if (parent) { + u32 i; + /* include any of parent's snaps occurring _after_ my parent became my parent */ for (i = 0; i < parent->cached_context->num_snaps; i++) @@ -361,8 +361,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; - dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n", - realm->ino, realm, snapc, snapc->seq, snapc->num_snaps); + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", + realm->ino, realm, snapc, snapc->seq, + (unsigned int) snapc->num_snaps); if (realm->cached_context) ceph_put_snap_context(realm->cached_context); @@ -402,9 +403,9 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm) * helper to allocate and decode an array of snapids. free prior * instance, if any. */ -static int dup_array(u64 **dst, __le64 *src, int num) +static int dup_array(u64 **dst, __le64 *src, u32 num) { - int i; + u32 i; kfree(*dst); if (num) { diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index cd5097d7c80..51cc23e4811 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s) case CEPH_MDS_STATE_BOOT: return "up:boot"; case CEPH_MDS_STATE_STANDBY: return "up:standby"; case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; + case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay"; case CEPH_MDS_STATE_CREATING: return "up:creating"; case CEPH_MDS_STATE_STARTING: return "up:starting"; /* up and in */ @@ -40,6 +41,8 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_RENEWCAPS: return "renewcaps"; case CEPH_SESSION_STALE: return "stale"; case CEPH_SESSION_RECALL_STATE: return "recall_state"; + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; } return "???"; } @@ -50,10 +53,14 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUP: return "lookup"; case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; + case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; + case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; case CEPH_MDS_OP_RMXATTR: return "rmxattr"; + case CEPH_MDS_OP_SETLAYOUT: return "setlayou"; + case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout"; case CEPH_MDS_OP_READDIR: return "readdir"; case CEPH_MDS_OP_MKNOD: return "mknod"; case CEPH_MDS_OP_LINK: return "link"; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 00de2c9568c..06150fd745a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -17,7 +17,9 @@ #include "super.h" #include "mds_client.h" +#include "cache.h" +#include <linux/ceph/ceph_features.h> #include <linux/ceph/decode.h> #include <linux/ceph/mon_client.h> #include <linux/ceph/auth.h> @@ -70,8 +72,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) /* * express utilization in terms of large blocks to avoid * overflow on 32-bit machines. + * + * NOTE: for the time being, we make bsize == frsize to humor + * not-yet-ancient versions of glibc that are broken. + * Someday, we will probably want to report a real block + * size... whatever that may mean for a network file system! */ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); @@ -79,7 +87,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; buf->f_namelen = NAME_MAX; - buf->f_frsize = PAGE_CACHE_SIZE; /* leave fsid little-endian, regardless of host endianness */ fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); @@ -130,10 +137,18 @@ enum { Opt_nodirstat, Opt_rbytes, Opt_norbytes, + Opt_asyncreaddir, Opt_noasyncreaddir, Opt_dcache, Opt_nodcache, Opt_ino32, + Opt_noino32, + Opt_fscache, + Opt_nofscache, +#ifdef CONFIG_CEPH_FS_POSIX_ACL + Opt_acl, +#endif + Opt_noacl }; static match_table_t fsopt_tokens = { @@ -153,10 +168,18 @@ static match_table_t fsopt_tokens = { {Opt_nodirstat, "nodirstat"}, {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, + {Opt_asyncreaddir, "asyncreaddir"}, {Opt_noasyncreaddir, "noasyncreaddir"}, {Opt_dcache, "dcache"}, {Opt_nodcache, "nodcache"}, {Opt_ino32, "ino32"}, + {Opt_noino32, "noino32"}, + {Opt_fscache, "fsc"}, + {Opt_nofscache, "nofsc"}, +#ifdef CONFIG_CEPH_FS_POSIX_ACL + {Opt_acl, "acl"}, +#endif + {Opt_noacl, "noacl"}, {-1, NULL} }; @@ -232,6 +255,9 @@ static int parse_fsopt_token(char *c, void *private) case Opt_norbytes: fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; break; + case Opt_asyncreaddir: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; case Opt_noasyncreaddir: fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; @@ -244,6 +270,23 @@ static int parse_fsopt_token(char *c, void *private) case Opt_ino32: fsopt->flags |= CEPH_MOUNT_OPT_INO32; break; + case Opt_noino32: + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + break; + case Opt_fscache: + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + break; + case Opt_nofscache: + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + break; +#ifdef CONFIG_CEPH_FS_POSIX_ACL + case Opt_acl: + fsopt->sb_flags |= MS_POSIXACL; + break; +#endif + case Opt_noacl: + fsopt->sb_flags &= ~MS_POSIXACL; + break; default: BUG_ON(token); } @@ -296,7 +339,10 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, { struct ceph_mount_options *fsopt; const char *dev_name_end; - int err = -ENOMEM; + int err; + + if (!dev_name || !*dev_name) + return -EINVAL; fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); if (!fsopt) @@ -317,27 +363,41 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ + /* + * Distinguish the server list from the path in "dev_name". + * Internally we do not include the leading '/' in the path. + * + * "dev_name" will look like: + * <server_spec>[,<server_spec>...]:[<path>] + * where + * <server_spec> is <ip>[:<port>] + * <path> is optional, but if present must begin with '/' + */ + dev_name_end = strchr(dev_name, '/'); + if (dev_name_end) { + /* skip over leading '/' for path */ + *path = dev_name_end + 1; + } else { + /* path is empty */ + dev_name_end = dev_name + strlen(dev_name); + *path = dev_name_end; + } err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", + dev_name_end--; /* back up to ':' separator */ + if (dev_name_end < dev_name || *dev_name_end != ':') { + pr_err("device name is missing path (no : separator in %s)\n", dev_name); goto out; } - dev_name_end = *path; dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - - /* path on server */ - *path += 2; dout("server path '%s'\n", *path); - err = ceph_parse_options(popt, options, dev_name, dev_name_end, + *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); - if (err) + if (IS_ERR(*popt)) { + err = PTR_ERR(*popt); goto out; + } /* success */ *pfsopt = fsopt; @@ -375,8 +435,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); - if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) - seq_printf(m, ",osdtimeout=%d", opt->osd_timeout); if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) seq_printf(m, ",osdkeepalivetimeout=%d", opt->osd_keepalive_timeout); @@ -391,6 +449,17 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",dcache"); else seq_puts(m, ",nodcache"); + if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) + seq_puts(m, ",fsc"); + else + seq_puts(m, ",nofsc"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + if (fsopt->sb_flags & MS_POSIXACL) + seq_puts(m, ",acl"); + else + seq_puts(m, ",noacl"); +#endif if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); @@ -444,10 +513,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const unsigned supported_features = + const u64 supported_features = CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH; - const unsigned required_features = 0; + const u64 required_features = 0; + int page_count; + size_t size; int err = -ENOMEM; fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); @@ -491,16 +562,24 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, /* set up mempools */ err = -ENOMEM; - fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, - fsc->mount_options->wsize >> PAGE_CACHE_SHIFT); + page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT; + size = sizeof (struct page *) * (page_count ? page_count : 1); + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); if (!fsc->wb_pagevec_pool) goto fail_trunc_wq; + /* setup fscache */ + if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) && + (ceph_fscache_register_fs(fsc) != 0)) + goto fail_fscache; + /* caps */ fsc->min_caps = fsopt->max_readdir; return fsc; +fail_fscache: + ceph_fscache_unregister_fs(fsc); fail_trunc_wq: destroy_workqueue(fsc->trunc_wq); fail_pg_inv_wq: @@ -520,6 +599,8 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); + ceph_fscache_unregister_fs(fsc); + destroy_workqueue(fsc->wb_wq); destroy_workqueue(fsc->pg_inv_wq); destroy_workqueue(fsc->trunc_wq); @@ -554,6 +635,8 @@ static void ceph_inode_init_once(void *foo) static int __init init_caches(void) { + int error = -ENOMEM; + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), @@ -577,23 +660,33 @@ static int __init init_caches(void) if (ceph_file_cachep == NULL) goto bad_file; - return 0; + if ((error = ceph_fscache_register())) + goto bad_file; + return 0; bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); - return -ENOMEM; + return error; } static void destroy_caches(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); + + ceph_fscache_unregister(); } @@ -616,6 +709,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, .write_inode = ceph_write_inode, + .drop_inode = ceph_drop_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, .show_options = ceph_show_options, @@ -655,9 +749,8 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, dout("open_root_inode success\n"); if (ceph_ino(inode) == CEPH_INO_ROOT && fsc->sb->s_root == NULL) { - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { - iput(inode); root = ERR_PTR(-ENOMEM); goto out; } @@ -750,6 +843,7 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ + s->s_xattr = ceph_xattr_handlers; s->s_fs_info = fsc; fsc->sb = s; @@ -817,7 +911,7 @@ static int ceph_register_bdi(struct super_block *sb, fsc->backing_dev_info.ra_pages = default_backing_dev_info.ra_pages; - err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); if (!err) sb->s_bdi = &fsc->backing_dev_info; @@ -837,6 +931,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, struct ceph_options *opt = NULL; dout("ceph_mount\n"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + flags |= MS_POSIXACL; +#endif err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); if (err < 0) { res = ERR_PTR(err); @@ -860,7 +958,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, fsc); + sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc); if (IS_ERR(sb)) { res = ERR_CAST(sb); goto out; @@ -917,6 +1015,7 @@ static struct file_system_type ceph_fs_type = { .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE, }; +MODULE_ALIAS_FS("ceph"); #define _STRINGIFY(x) #x #define STRINGIFY(x) _STRINGIFY(x) @@ -927,6 +1026,8 @@ static int __init init_ceph(void) if (ret) goto out; + ceph_flock_init(); + ceph_xattr_init(); ret = register_filesystem(&ceph_fs_type); if (ret) goto out_icache; @@ -936,6 +1037,7 @@ static int __init init_ceph(void) return 0; out_icache: + ceph_xattr_exit(); destroy_caches(); out: return ret; @@ -945,6 +1047,7 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); + ceph_xattr_exit(); destroy_caches(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1421f3d875a..12b20744e38 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -13,15 +13,20 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> +#include <linux/posix_acl.h> #include <linux/ceph/libceph.h> +#ifdef CONFIG_CEPH_FSCACHE +#include <linux/fscache.h> +#endif + /* f_type in struct statfs */ #define CEPH_SUPER_MAGIC 0x00c36400 /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ -#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ +#define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ @@ -29,6 +34,7 @@ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ +#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) @@ -90,6 +96,11 @@ struct ceph_fs_client { struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; #endif + +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + struct workqueue_struct *revalidate_wq; +#endif }; @@ -138,8 +149,8 @@ struct ceph_cap_snap { struct ceph_snap_context *context; umode_t mode; - uid_t uid; - gid_t gid; + kuid_t uid; + kgid_t gid; struct ceph_buffer *xattr_blob; u64 xattr_version; @@ -204,7 +215,6 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { - unsigned long flags; struct ceph_mds_session *lease_session; u32 lease_gen, lease_shared_gen; u32 lease_seq; @@ -215,18 +225,6 @@ struct ceph_dentry_info { u64 offset; }; -/* - * dentry flags - * - * The locking for D_COMPLETE is a bit odd: - * - we can clear it at almost any time (see ceph_d_prune) - * - it is only meaningful if: - * - we hold dir inode i_ceph_lock - * - we hold dir FILE_SHARED caps - * - the dentry D_COMPLETE is set - */ -#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */ - struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -257,7 +255,8 @@ struct ceph_inode_info { u32 i_time_warp_seq; unsigned i_ceph_flags; - unsigned long i_release_count; + atomic_t i_release_count; + atomic_t i_complete_count; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; @@ -267,7 +266,6 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; @@ -289,9 +287,6 @@ struct ceph_inode_info { unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ - int i_cap_exporting_mds; /* to handle cap migration between */ - unsigned i_cap_exporting_mseq; /* mds's. */ - unsigned i_cap_exporting_issued; struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or @@ -300,6 +295,7 @@ struct ceph_inode_info { int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + struct mutex i_truncate_mutex; u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ @@ -331,6 +327,11 @@ struct ceph_inode_info { struct work_struct i_vmtruncate_work; +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + u32 i_fscache_gen; /* sequence, for delayed fscache validate */ + struct work_struct i_revalidate_work; +#endif struct inode vfs_inode; /* at end */ }; @@ -367,7 +368,7 @@ static inline u32 ceph_ino_to_ino32(__u64 vino) u32 ino = vino & 0xffffffff; ino ^= vino >> 32; if (!ino) - ino = 1; + ino = 2; return ino; } @@ -436,33 +437,31 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ -static inline void ceph_i_clear(struct inode *inode, unsigned mask) +static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, + int release_count) { - struct ceph_inode_info *ci = ceph_inode(inode); - - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags &= ~mask; - spin_unlock(&ci->i_ceph_lock); + atomic_set(&ci->i_complete_count, release_count); } -static inline void ceph_i_set(struct inode *inode, unsigned mask) +static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); + atomic_inc(&ci->i_release_count); +} - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags |= mask; - spin_unlock(&ci->i_ceph_lock); +static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) +{ + return atomic_read(&ci->i_complete_count) == + atomic_read(&ci->i_release_count); } -static inline bool ceph_i_test(struct inode *inode, unsigned mask) +static inline void ceph_dir_clear_complete(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - bool r; + __ceph_dir_clear_complete(ceph_inode(inode)); +} - spin_lock(&ci->i_ceph_lock); - r = (ci->i_ceph_flags & mask) == mask; - spin_unlock(&ci->i_ceph_lock); - return r; +static inline bool ceph_dir_is_complete(struct inode *inode) +{ + return __ceph_dir_is_complete(ceph_inode(inode)); } @@ -489,13 +488,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) } /* - * set/clear directory D_COMPLETE flag - */ -void ceph_dir_set_complete(struct inode *inode); -void ceph_dir_clear_complete(struct inode *inode); -bool ceph_dir_test_complete(struct inode *inode); - -/* * caps helpers */ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) @@ -533,6 +525,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) } extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask); extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); @@ -555,7 +549,7 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); -extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, +extern void ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); @@ -581,10 +575,10 @@ struct ceph_file_info { /* readdir: position within a frag */ unsigned offset; /* offset of last chunk, adjusted for . and .. */ - u64 next_offset; /* offset of next chunk (last_name's + 1) */ + unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ - unsigned long dir_release_count; + int dir_release_count; /* used for -o dirstat read() on directory thing */ char *dir_info; @@ -612,9 +606,9 @@ struct ceph_snap_realm { u64 parent_since; /* snapid when our current parent became so */ u64 *prior_parent_snaps; /* snaps inherited from any parents we */ - int num_prior_parent_snaps; /* had prior to parent_since */ + u32 num_prior_parent_snaps; /* had prior to parent_since */ u64 *snaps; /* snaps specific to this realm */ - int num_snaps; + u32 num_snaps; struct ceph_snap_realm *parent; struct list_head children; /* list of child realms */ @@ -695,6 +689,7 @@ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); +extern int ceph_drop_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -728,31 +723,71 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, /* xattr.c */ extern int ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); +int __ceph_removexattr(struct dentry *, const char *); extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern int ceph_removexattr(struct dentry *, const char *); extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); +extern void __init ceph_xattr_init(void); +extern void ceph_xattr_exit(void); + +/* acl.c */ +extern const struct xattr_handler *ceph_xattr_handlers[]; + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + +struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ceph_init_acl(struct dentry *, struct inode *, struct inode *); + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ + forget_all_cached_acls(inode); +} + +#else + +#define ceph_get_acl NULL +#define ceph_set_acl NULL + +static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + return 0; +} + +static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) +{ + return 0; +} + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +} + +#endif /* caps.c */ extern const char *ceph_cap_string(int c); extern void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg); -extern int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned cap, unsigned seq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation); -extern void __ceph_remove_cap(struct ceph_cap *cap); -static inline void ceph_remove_cap(struct ceph_cap *cap) -{ - spin_lock(&cap->ci->i_ceph_lock); - __ceph_remove_cap(cap); - spin_unlock(&cap->ci->i_ceph_lock); -} +extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned cap, unsigned seq, u64 realmino, int flags, + struct ceph_cap **new_cap); +extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); +extern int ceph_is_any_caps(struct inode *inode); +extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, + u64 cap_id, u32 migrate_seq, u32 issue_seq); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, @@ -796,17 +831,11 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; extern const struct address_space_operations ceph_aops; -extern int ceph_copy_to_page_vector(struct page **pages, - const char *data, - loff_t off, size_t len); -extern int ceph_copy_from_page_vector(struct page **pages, - char *data, - loff_t off, size_t len); -extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); + extern int ceph_open(struct inode *inode, struct file *file); -extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, - struct nameidata *nd, int mode, - int locked_dir); +extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened); extern int ceph_release(struct inode *inode, struct file *filp); /* dir.c */ @@ -842,11 +871,17 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); extern const struct export_operations ceph_export_ops; /* locks.c */ +extern __init void ceph_flock_init(void); extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); -extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, - int p_locks, int f_locks); +extern int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, + int num_flock_locks); +extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks); extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); /* debugfs.c */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a76f697303d..c9c2b887381 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -6,13 +6,33 @@ #include <linux/ceph/decode.h> #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> #include <linux/slab.h> +#define XATTR_CEPH_PREFIX "ceph." +#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) + +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr); + +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. + */ +const struct xattr_handler *ceph_xattr_handlers[] = { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, "ceph.", 5) || + return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -21,103 +41,235 @@ static bool ceph_is_valid_xattr(const char *name) * These define virtual xattrs exposing the recursive directory * statistics and layout metadata. */ -struct ceph_vxattr_cb { - bool readonly; +struct ceph_vxattr { char *name; + size_t name_size; /* strlen(name) + 1 (for '\0') */ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t size); + bool readonly, hidden; + bool (*exists_cb)(struct ceph_inode_info *ci); }; +/* layouts */ + +static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) +{ + size_t s; + char *p = (char *)&ci->i_layout; + + for (s = 0; s < sizeof(ci->i_layout); s++, p++) + if (*p) + return true; + return false; +} + +static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + const char *pool_name; + char buf[128]; + + dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); + down_read(&osdc->map_sem); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) { + size_t len = strlen(pool_name); + ret = snprintf(buf, sizeof(buf), + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + if (!size) { + ret += len; + } else if (ret + len > size) { + ret = -ERANGE; + } else { + memcpy(val, buf, ret); + memcpy(val + ret, pool_name, len); + ret += len; + } + } else { + ret = snprintf(buf, sizeof(buf), + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout), + (unsigned long long)pool); + if (size) { + if (ret <= size) + memcpy(val, buf, ret); + else + ret = -ERANGE; + } + } + up_read(&osdc->map_sem); + return ret; +} + +static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_su(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, + char *val, size_t size) +{ + int ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + const char *pool_name; + + down_read(&osdc->map_sem); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) + ret = snprintf(val, size, "%s", pool_name); + else + ret = snprintf(val, size, "%lld", (unsigned long long)pool); + up_read(&osdc->map_sem); + return ret; +} + /* directories */ -static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); } -static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_files); } -static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_subdirs); } -static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); } -static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rfiles); } -static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rsubdirs); } -static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rbytes); } -static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, + return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec, (long)ci->i_rctime.tv_nsec); } -static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { - { true, "ceph.dir.entries", ceph_vxattrcb_entries}, - { true, "ceph.dir.files", ceph_vxattrcb_files}, - { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, - { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, - { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, - { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, - { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, - { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, - { true, NULL, NULL } -}; -/* files */ +#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name +#define CEPH_XATTR_NAME2(_type, _name, _name2) \ + XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 -static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, - size_t size) -{ - int ret; +#define XATTR_NAME_CEPH(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .readonly = true, \ + .hidden = false, \ + .exists_cb = NULL, \ + } +#define XATTR_LAYOUT_FIELD(_type, _name, _field) \ + { \ + .name = CEPH_XATTR_NAME2(_type, _name, _field), \ + .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ + .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ + .readonly = false, \ + .hidden = true, \ + .exists_cb = ceph_vxattrcb_layout_exists, \ + } - ret = snprintf(val, size, - "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); - if (ceph_file_layout_pg_preferred(ci->i_layout)) - ret += snprintf(val + ret, size, "preferred_osd=%lld\n", - (unsigned long long)ceph_file_layout_pg_preferred( - ci->i_layout)); - return ret; -} +static struct ceph_vxattr ceph_dir_vxattrs[] = { + { + .name = "ceph.dir.layout", + .name_size = sizeof("ceph.dir.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = false, + .hidden = true, + .exists_cb = ceph_vxattrcb_layout_exists, + }, + XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), + XATTR_LAYOUT_FIELD(dir, layout, stripe_count), + XATTR_LAYOUT_FIELD(dir, layout, object_size), + XATTR_LAYOUT_FIELD(dir, layout, pool), + XATTR_NAME_CEPH(dir, entries), + XATTR_NAME_CEPH(dir, files), + XATTR_NAME_CEPH(dir, subdirs), + XATTR_NAME_CEPH(dir, rentries), + XATTR_NAME_CEPH(dir, rfiles), + XATTR_NAME_CEPH(dir, rsubdirs), + XATTR_NAME_CEPH(dir, rbytes), + XATTR_NAME_CEPH(dir, rctime), + { .name = NULL, 0 } /* Required table terminator */ +}; +static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ + +/* files */ -static struct ceph_vxattr_cb ceph_file_vxattrs[] = { - { true, "ceph.file.layout", ceph_vxattrcb_layout}, - /* The following extended attribute name is deprecated */ - { true, "ceph.layout", ceph_vxattrcb_layout}, - { true, NULL, NULL } +static struct ceph_vxattr ceph_file_vxattrs[] = { + { + .name = "ceph.file.layout", + .name_size = sizeof("ceph.file.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = false, + .hidden = true, + .exists_cb = ceph_vxattrcb_layout_exists, + }, + XATTR_LAYOUT_FIELD(file, layout, stripe_unit), + XATTR_LAYOUT_FIELD(file, layout, stripe_count), + XATTR_LAYOUT_FIELD(file, layout, object_size), + XATTR_LAYOUT_FIELD(file, layout, pool), + { .name = NULL, 0 } /* Required table terminator */ }; +static size_t ceph_file_vxattrs_name_size; /* total size of all names */ -static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) +static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) { if (S_ISDIR(inode->i_mode)) return ceph_dir_vxattrs; @@ -126,22 +278,67 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) return NULL; } -static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, +static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + if (vxattrs == ceph_dir_vxattrs) + return ceph_dir_vxattrs_name_size; + if (vxattrs == ceph_file_vxattrs) + return ceph_file_vxattrs_name_size; + BUG(); + + return 0; +} + +/* + * Compute the aggregate size (including terminating '\0') of all + * virtual extended attribute names in the given vxattr table. + */ +static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + struct ceph_vxattr *vxattr; + size_t size = 0; + + for (vxattr = vxattrs; vxattr->name; vxattr++) + if (!vxattr->hidden) + size += vxattr->name_size; + + return size; +} + +/* Routines called at initialization and exit time */ + +void __init ceph_xattr_init(void) +{ + ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs); + ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs); +} + +void ceph_xattr_exit(void) +{ + ceph_dir_vxattrs_name_size = 0; + ceph_file_vxattrs_name_size = 0; +} + +static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, const char *name) { - do { - if (strcmp(vxattr->name, name) == 0) - return vxattr; - vxattr++; - } while (vxattr->name); + struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode); + + if (vxattr) { + while (vxattr->name) { + if (!strcmp(vxattr->name, name)) + return vxattr; + vxattr++; + } + } + return NULL; } static int __set_xattr(struct ceph_inode_info *ci, const char *name, int name_len, const char *val, int val_len, - int dirty, - int should_free_name, int should_free_val, + int flags, int update_xattr, struct ceph_inode_xattr **newxattr) { struct rb_node **p; @@ -170,12 +367,31 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr = NULL; } + if (update_xattr) { + int err = 0; + if (xattr && (flags & XATTR_CREATE)) + err = -EEXIST; + else if (!xattr && (flags & XATTR_REPLACE)) + err = -ENODATA; + if (err) { + kfree(name); + kfree(val); + return err; + } + if (update_xattr < 0) { + if (xattr) + __remove_xattr(ci, xattr); + kfree(name); + return 0; + } + } + if (!xattr) { new = 1; xattr = *newxattr; xattr->name = name; xattr->name_len = name_len; - xattr->should_free_name = should_free_name; + xattr->should_free_name = update_xattr; ci->i_xattrs.count++; dout("__set_xattr count=%d\n", ci->i_xattrs.count); @@ -185,7 +401,7 @@ static int __set_xattr(struct ceph_inode_info *ci, if (xattr->should_free_val) kfree((void *)xattr->val); - if (should_free_name) { + if (update_xattr) { kfree((void *)name); name = xattr->name; } @@ -200,8 +416,8 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr->val = ""; xattr->val_len = val_len; - xattr->dirty = dirty; - xattr->should_free_val = (val && should_free_val); + xattr->dirty = update_xattr; + xattr->should_free_val = (val && update_xattr); if (new) { rb_link_node(&xattr->node, parent, p); @@ -263,7 +479,7 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr) { if (!xattr) - return -EOPNOTSUPP; + return -ENODATA; rb_erase(&xattr->node, &ci->i_xattrs.index); @@ -395,6 +611,7 @@ start: for (i = 0; i < numattr; i++) kfree(xattrs[i]); kfree(xattrs); + xattrs = NULL; goto start; } err = -EIO; @@ -408,7 +625,7 @@ start: p += len; err = __set_xattr(ci, name, namelen, val, len, - 0, 0, 0, &xattrs[numattr]); + 0, 0, &xattrs[numattr]); if (err < 0) goto bad; @@ -497,22 +714,23 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) } } -ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, +ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int err; struct ceph_inode_xattr *xattr; - struct ceph_vxattr_cb *vxattr = NULL; + struct ceph_vxattr *vxattr = NULL; if (!ceph_is_valid_xattr(name)) return -ENODATA; /* let's see if a virtual xattr was requested */ - if (vxattrs) - vxattr = ceph_match_vxattr(vxattrs, name); + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { + err = vxattr->getxattr_cb(ci, value, size); + return err; + } spin_lock(&ci->i_ceph_lock); dout("getxattr %p ver=%lld index_ver=%lld\n", inode, @@ -531,11 +749,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, spin_lock(&ci->i_ceph_lock); - if (vxattr && vxattr->readonly) { - err = vxattr->getxattr_cb(ci, value, size); - goto out; - } - err = __build_xattrs(inode); if (err < 0) goto out; @@ -543,11 +756,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, get_xattr: err = -ENODATA; /* == ENOATTR */ xattr = __get_xattr(ci, name); - if (!xattr) { - if (vxattr) - err = vxattr->getxattr_cb(ci, value, size); + if (!xattr) goto out; - } err = -ERANGE; if (size && size < xattr->val_len) @@ -564,11 +774,20 @@ out: return err; } +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, value, size); + + return __ceph_getxattr(dentry->d_inode, name, value, size); +} + ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); u32 vir_namelen = 0; u32 namelen; int err; @@ -596,29 +815,37 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) goto out; list_xattr: - vir_namelen = 0; - /* include virtual dir xattrs */ - if (vxattrs) - for (i = 0; vxattrs[i].name; i++) - vir_namelen += strlen(vxattrs[i].name) + 1; + /* + * Start with virtual dir xattr names (if any) (including + * terminating '\0' characters for each). + */ + vir_namelen = ceph_vxattrs_name_size(vxattrs); + /* adding 1 byte per each variable due to the null termination */ - namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; + namelen = ci->i_xattrs.names_size + ci->i_xattrs.count; err = -ERANGE; - if (size && namelen > size) + if (size && vir_namelen + namelen > size) goto out; - err = namelen; + err = namelen + vir_namelen; if (size == 0) goto out; names = __copy_xattr_names(ci, names); /* virtual xattr names, too */ - if (vxattrs) + err = namelen; + if (vxattrs) { for (i = 0; vxattrs[i].name; i++) { - len = sprintf(names, "%s", vxattrs[i].name); - names += len + 1; + if (!vxattrs[i].hidden && + !(vxattrs[i].exists_cb && + !vxattrs[i].exists_cb(ci))) { + len = sprintf(names, "%s", vxattrs[i].name); + names += len + 1; + err += len + 1; + } } + } out: spin_unlock(&ci->i_ceph_lock); @@ -631,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; int err; @@ -660,6 +886,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, dout("setxattr value=%.*s\n", (int)size, value); + if (!value) + flags |= CEPH_XATTR_REMOVE; + /* do request */ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, USE_AUTH_MDS); @@ -679,9 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_data_len = size; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -694,34 +921,32 @@ out: return err; } -int ceph_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int __ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; + struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int issued; int err; + int dirty = 0; int name_len = strlen(name); int val_len = size; char *newname = NULL; char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; - int issued; int required_blob_size; - int dirty; - - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - if (vxattrs) { - struct ceph_vxattr_cb *vxattr = - ceph_match_vxattr(vxattrs, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - } + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; /* preallocate memory for xattr name, value, index node */ err = -ENOMEM; @@ -730,11 +955,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, goto out; if (val_len) { - newval = kmalloc(val_len + 1, GFP_NOFS); + newval = kmemdup(value, val_len, GFP_NOFS); if (!newval) goto out; - memcpy(newval, value, val_len); - newval[val_len] = '\0'; } xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); @@ -744,6 +967,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; __build_xattrs(inode); @@ -752,7 +976,7 @@ retry: if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { - struct ceph_buffer *blob = NULL; + struct ceph_buffer *blob; spin_unlock(&ci->i_ceph_lock); dout(" preaallocating new blob size=%d\n", required_blob_size); @@ -766,12 +990,15 @@ retry: goto retry; } - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); - err = __set_xattr(ci, newname, name_len, newval, - val_len, 1, 1, 1, &xattr); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); - ci->i_xattrs.dirty = true; - inode->i_ctime = CURRENT_TIME; + err = __set_xattr(ci, newname, name_len, newval, val_len, + flags, value ? 1 : -1, &xattr); + + if (!err) { + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + } + spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -779,6 +1006,7 @@ retry: do_sync: spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: err = ceph_sync_setxattr(dentry, name, value, size, flags); out: kfree(newname); @@ -787,12 +1015,23 @@ out: return err; } +int ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + return __ceph_setxattr(dentry, name, value, size, flags); +} + static int ceph_send_removexattr(struct dentry *dentry, const char *name) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; - struct inode *parent_inode; struct ceph_mds_request *req; int err; @@ -806,45 +1045,41 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); - parent_inode = ceph_get_dentry_parent_inode(dentry); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - iput(parent_inode); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; } -int ceph_removexattr(struct dentry *dentry, const char *name) +int __ceph_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; + struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int issued; int err; int required_blob_size; int dirty; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - if (vxattrs) { - struct ceph_vxattr_cb *vxattr = - ceph_match_vxattr(vxattrs, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - } + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; err = -ENOMEM; spin_lock(&ci->i_ceph_lock); - __build_xattrs(inode); retry: issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, 0, 0); @@ -865,18 +1100,29 @@ retry: } err = __remove_xattr_by_name(ceph_inode(inode), name); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; - spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); return err; do_sync: spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: err = ceph_send_removexattr(dentry, name); out: return err; } +int ceph_removexattr(struct dentry *dentry, const char *name) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + return __ceph_removexattr(dentry, name); +} |
