diff options
Diffstat (limited to 'fs/ceph')
66 files changed, 7199 insertions, 15915 deletions
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 04b8280582a..264e9bf83ff 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -1,8 +1,11 @@ config CEPH_FS - tristate "Ceph distributed file system (EXPERIMENTAL)" - depends on INET && EXPERIMENTAL + tristate "Ceph distributed file system" + depends on INET + select CEPH_LIB select LIBCRC32C - select CONFIG_CRYPTO_AES + select CRYPTO_AES + select CRYPTO + default n help Choose Y or M here to include support for mounting the experimental Ceph distributed file system. Ceph is an extremely @@ -13,15 +16,25 @@ config CEPH_FS If unsure, say N. -config CEPH_FS_PRETTYDEBUG - bool "Include file:line in ceph debug output" +if CEPH_FS +config CEPH_FSCACHE + bool "Enable Ceph client caching support" + depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y + help + Choose Y here to enable persistent, read-only local + caching support for Ceph clients using FS-Cache + +endif + +config CEPH_FS_POSIX_ACL + bool "Ceph POSIX Access Control Lists" depends on CEPH_FS - default n + select FS_POSIX_ACL help - If you say Y here, debug output will include a filename and - line to aid debugging. This icnreases kernel size and slows - execution slightly when debug call sites are enabled (e.g., - via CONFIG_DYNAMIC_DEBUG). + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. - If unsure, say N. + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website <http://acl.bestbits.at/>. + If you don't know what Access Control Lists are, say N diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 6a660e610be..85a4230b9bf 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -2,38 +2,12 @@ # Makefile for CEPH filesystem. # -ifneq ($(KERNELRELEASE),) - obj-$(CONFIG_CEPH_FS) += ceph.o -ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ +ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o \ - messenger.o msgpool.o buffer.o pagelist.o \ - mds_client.o mdsmap.o \ - mon_client.o \ - osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ - debugfs.o \ - auth.o auth_none.o \ - crypto.o armor.o \ - auth_x.o \ - ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o - -else -#Otherwise we were called directly from the command -# line; invoke the kernel build system. - -KERNELDIR ?= /lib/modules/$(shell uname -r)/build -PWD := $(shell pwd) - -default: all - -all: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install - -clean: - $(MAKE) -C $(KERNELDIR) M=$(PWD) clean + mds_client.o mdsmap.o strings.o ceph_frag.o \ + debugfs.o -endif +ceph-$(CONFIG_CEPH_FSCACHE) += cache.o +ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o diff --git a/fs/ceph/README b/fs/ceph/README deleted file mode 100644 index 18352fab37c..00000000000 --- a/fs/ceph/README +++ /dev/null @@ -1,20 +0,0 @@ -# -# The following files are shared by (and manually synchronized -# between) the Ceph userland and kernel client. -# -# userland kernel -src/include/ceph_fs.h fs/ceph/ceph_fs.h -src/include/ceph_fs.cc fs/ceph/ceph_fs.c -src/include/msgr.h fs/ceph/msgr.h -src/include/rados.h fs/ceph/rados.h -src/include/ceph_strings.cc fs/ceph/ceph_strings.c -src/include/ceph_frag.h fs/ceph/ceph_frag.h -src/include/ceph_frag.cc fs/ceph/ceph_frag.c -src/include/ceph_hash.h fs/ceph/ceph_hash.h -src/include/ceph_hash.cc fs/ceph/ceph_hash.c -src/crush/crush.c fs/ceph/crush/crush.c -src/crush/crush.h fs/ceph/crush/crush.h -src/crush/mapper.c fs/ceph/crush/mapper.c -src/crush/mapper.h fs/ceph/crush/mapper.h -src/crush/hash.h fs/ceph/crush/hash.h -src/crush/hash.c fs/ceph/crush/hash.c diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c new file mode 100644 index 00000000000..469f2e8657e --- /dev/null +++ b/fs/ceph/acl.c @@ -0,0 +1,194 @@ +/* + * linux/fs/ceph/acl.c + * + * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include "super.h" + +static inline void ceph_set_cached_acl(struct inode *inode, + int type, struct posix_acl *acl) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + set_cached_acl(inode, type, acl); + spin_unlock(&ci->i_ceph_lock); +} + +static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, + int type) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct posix_acl *acl = ACL_NOT_CACHED; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) + acl = get_cached_acl(inode, type); + spin_unlock(&ci->i_ceph_lock); + + return acl; +} + +struct posix_acl *ceph_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + BUG(); + } + + size = __ceph_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __ceph_getxattr(inode, name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ERANGE || size == -ENODATA || size == 0) + acl = NULL; + else + acl = ERR_PTR(-EIO); + + kfree(value); + + if (!IS_ERR(acl)) + ceph_set_cached_acl(inode, type, acl); + + return acl; +} + +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret = 0, size = 0; + const char *name = NULL; + char *value = NULL; + struct iattr newattrs; + umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; + struct dentry *dentry; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &new_mode); + if (ret < 0) + goto out; + if (ret == 0) + acl = NULL; + } + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) { + ret = acl ? -EINVAL : 0; + goto out; + } + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + ret = -EINVAL; + goto out; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (ret < 0) + goto out_free; + } + + dentry = d_find_alias(inode); + if (new_mode != old_mode) { + newattrs.ia_mode = new_mode; + newattrs.ia_valid = ATTR_MODE; + ret = ceph_setattr(dentry, &newattrs); + if (ret) + goto out_dput; + } + + ret = __ceph_setxattr(dentry, name, value, size, 0); + if (ret) { + if (new_mode != old_mode) { + newattrs.ia_mode = old_mode; + newattrs.ia_valid = ATTR_MODE; + ceph_setattr(dentry, &newattrs); + } + goto out_dput; + } + + ceph_set_cached_acl(inode, type, acl); + +out_dput: + dput(dentry); +out_free: + kfree(value); +out: + return ret; +} + +int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int error; + + error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (error) + return error; + + if (!default_acl && !acl) + cache_no_acl(inode); + + if (default_acl) { + error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + posix_acl_release(default_acl); + } + if (acl) { + if (!error) + error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); + posix_acl_release(acl); + } + return error; +} diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d9c60b84949..90b3954d48e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> #include <linux/fs.h> @@ -10,7 +10,9 @@ #include <linux/task_io_accounting_ops.h> #include "super.h" -#include "osd_client.h" +#include "mds_client.h" +#include "cache.h" +#include <linux/ceph/osd_client.h> /* * Ceph address space ops. @@ -23,7 +25,7 @@ * context needs to be associated with the osd write during writeback. * * Similarly, struct ceph_inode_info maintains a set of counters to - * count dirty pages on the inode. In the absense of snapshots, + * count dirty pages on the inode. In the absence of snapshots, * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. * * When a snapshot is taken (that is, when the client receives @@ -53,7 +55,12 @@ (CONGESTION_ON_THRESH(congestion_kb) - \ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) - +static inline struct ceph_snap_context *page_snap_context(struct page *page) +{ + if (PagePrivate(page)) + return (void *)page->private; + return NULL; +} /* * Dirty a page. Optimistically adjust accounting, on the assumption @@ -64,15 +71,16 @@ static int ceph_set_page_dirty(struct page *page) struct address_space *mapping = page->mapping; struct inode *inode; struct ceph_inode_info *ci; - int undo = 0; struct ceph_snap_context *snapc; + int ret; if (unlikely(!mapping)) return !TestSetPageDirty(page); - if (TestSetPageDirty(page)) { + if (PageDirty(page)) { dout("%p set_page_dirty %p idx %lu -- already dirty\n", mapping->host, page, page->index); + BUG_ON(!PagePrivate(page)); return 0; } @@ -86,12 +94,12 @@ static int ceph_set_page_dirty(struct page *page) snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); /* dirty the head */ - spin_lock(&inode->i_lock); - if (ci->i_wrbuffer_ref_head == 0) + spin_lock(&ci->i_ceph_lock); + if (ci->i_head_snapc == NULL) ci->i_head_snapc = ceph_get_snap_context(snapc); ++ci->i_wrbuffer_ref_head; if (ci->i_wrbuffer_ref == 0) - igrab(inode); + ihold(inode); ++ci->i_wrbuffer_ref; dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " "snapc %p seq %lld (%d snaps)\n", @@ -99,43 +107,21 @@ static int ceph_set_page_dirty(struct page *page) ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, snapc, snapc->seq, snapc->num_snaps); - spin_unlock(&inode->i_lock); - - /* now adjust page */ - spin_lock_irq(&mapping->tree_lock); - if (page->mapping) { /* Race with truncate? */ - WARN_ON_ONCE(!PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_io_account_write(PAGE_CACHE_SIZE); - } - radix_tree_tag_set(&mapping->page_tree, - page_index(page), PAGECACHE_TAG_DIRTY); - - /* - * Reference snap context in page->private. Also set - * PagePrivate so that we get invalidatepage callback. - */ - page->private = (unsigned long)snapc; - SetPagePrivate(page); - } else { - dout("ANON set_page_dirty %p (raced truncate?)\n", page); - undo = 1; - } - - spin_unlock_irq(&mapping->tree_lock); + spin_unlock(&ci->i_ceph_lock); - if (undo) - /* whoops, we failed to dirty the page */ - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + /* + * Reference snap context in page->private. Also set + * PagePrivate so that we get invalidatepage callback. + */ + BUG_ON(PagePrivate(page)); + page->private = (unsigned long)snapc; + SetPagePrivate(page); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + ret = __set_page_dirty_nobuffers(page); + WARN_ON(!PageLocked(page)); + WARN_ON(!page->mapping); - BUG_ON(!PageDirty(page)); - return 1; + return ret; } /* @@ -143,18 +129,26 @@ static int ceph_set_page_dirty(struct page *page) * dirty page counters appropriately. Only called if there is private * data on the page. */ -static void ceph_invalidatepage(struct page *page, unsigned long offset) +static void ceph_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode; struct ceph_inode_info *ci; - struct ceph_snap_context *snapc = (void *)page->private; - - BUG_ON(!PageLocked(page)); - BUG_ON(!page->private); - BUG_ON(!PagePrivate(page)); - BUG_ON(!page->mapping); + struct ceph_snap_context *snapc = page_snap_context(page); inode = page->mapping->host; + ci = ceph_inode(inode); + + if (offset != 0 || length != PAGE_CACHE_SIZE) { + dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", + inode, page, page->index, offset, length); + return; + } + + ceph_invalidate_fscache_page(inode, page); + + if (!PagePrivate(page)) + return; /* * We can get non-dirty pages here due to races between @@ -164,32 +158,28 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset) if (!PageDirty(page)) pr_err("%p invalidatepage %p page not dirty\n", inode, page); - if (offset == 0) - ClearPageChecked(page); + ClearPageChecked(page); - ci = ceph_inode(inode); - if (offset == 0) { - dout("%p invalidatepage %p idx %lu full dirty page %lu\n", - inode, page, page->index, offset); - ceph_put_wrbuffer_cap_refs(ci, 1, snapc); - ceph_put_snap_context(snapc); - page->private = 0; - ClearPagePrivate(page); - } else { - dout("%p invalidatepage %p idx %lu partial dirty page\n", - inode, page, page->index); - } + dout("%p invalidatepage %p idx %lu full dirty page\n", + inode, page, page->index); + + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + page->private = 0; + ClearPagePrivate(page); } -/* just a sanity check */ static int ceph_releasepage(struct page *page, gfp_t g) { struct inode *inode = page->mapping ? page->mapping->host : NULL; dout("%p releasepage %p idx %lu\n", inode, page, page->index); WARN_ON(PageDirty(page)); - WARN_ON(page->private); - WARN_ON(PagePrivate(page)); - return 0; + + /* Can we release the page from the cache? */ + if (!ceph_release_fscache_page(page, g)) + return 0; + + return !PagePrivate(page); } /* @@ -197,28 +187,39 @@ static int ceph_releasepage(struct page *page, gfp_t g) */ static int readpage_nounlock(struct file *filp, struct page *page) { - struct inode *inode = filp->f_dentry->d_inode; + struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; int err = 0; u64 len = PAGE_CACHE_SIZE; + err = ceph_readpage_from_fscache(inode, page); + + if (err == 0) + goto out; + dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - page->index << PAGE_CACHE_SHIFT, &len, + (u64) page_offset(page), &len, ci->i_truncate_seq, ci->i_truncate_size, - &page, 1); + &page, 1, 0); if (err == -ENOENT) err = 0; if (err < 0) { SetPageError(page); + ceph_fscache_readpage_cancel(inode, page); goto out; - } else if (err < PAGE_CACHE_SIZE) { + } + if (err < PAGE_CACHE_SIZE) /* zero fill remainder of page */ zero_user_segment(page, err, PAGE_CACHE_SIZE); - } + else + flush_dcache_page(page); + SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); out: return err < 0 ? err : 0; @@ -232,100 +233,180 @@ static int ceph_readpage(struct file *filp, struct page *page) } /* - * Build a vector of contiguous pages from the provided page list. + * Finish an async read(ahead) op. */ -static struct page **page_vector_from_list(struct list_head *page_list, - unsigned *nr_pages) +static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { - struct page **pages; - struct page *page; - int next_index, contig_pages = 0; + struct inode *inode = req->r_inode; + struct ceph_osd_data *osd_data; + int rc = req->r_result; + int bytes = le32_to_cpu(msg->hdr.data_len); + int num_pages; + int i; - /* build page vector */ - pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); - if (!pages) - return ERR_PTR(-ENOMEM); + dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); - BUG_ON(list_empty(page_list)); - next_index = list_entry(page_list->prev, struct page, lru)->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index == next_index) { - dout("readpages page %d %p\n", contig_pages, page); - pages[contig_pages] = page; - contig_pages++; - next_index++; - } else { - break; + /* unlock all pages, zeroing any data we didn't read */ + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + for (i = 0; i < num_pages; i++) { + struct page *page = osd_data->pages[i]; + + if (rc < 0) + goto unlock; + if (bytes < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = bytes < 0 ? 0 : bytes; + zero_user_segment(page, s, PAGE_CACHE_SIZE); } + dout("finish_read %p uptodate %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + ceph_readpage_to_fscache(inode, page); +unlock: + unlock_page(page); + page_cache_release(page); + bytes -= PAGE_CACHE_SIZE; } - *nr_pages = contig_pages; - return pages; + kfree(osd_data->pages); +} + +static void ceph_unlock_page_vector(struct page **pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + unlock_page(pages[i]); } /* - * Read multiple pages. Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. + * start an async read(ahead) operation. return nr_pages we submitted + * a read for on success, or negative error code. */ -static int ceph_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned nr_pages) +static int start_read(struct inode *inode, struct list_head *page_list, int max) { - struct inode *inode = file->f_dentry->d_inode; + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; - int rc = 0; - struct page **pages; - loff_t offset; + struct page *page = list_entry(page_list->prev, struct page, lru); + struct ceph_vino vino; + struct ceph_osd_request *req; + u64 off; u64 len; + int i; + struct page **pages; + pgoff_t next_index; + int nr_pages = 0; + int ret; - dout("readpages %p file %p nr_pages %d\n", - inode, file, nr_pages); - - pages = page_vector_from_list(page_list, &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); + off = (u64) page_offset(page); - /* guess read extent */ - offset = pages[0]->index << PAGE_CACHE_SHIFT; + /* count pages */ + next_index = page->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index != next_index) + break; + nr_pages++; + next_index++; + if (max && nr_pages == max) + break; + } len = nr_pages << PAGE_CACHE_SHIFT; - rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - offset, &len, - ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages); - if (rc == -ENOENT) - rc = 0; - if (rc < 0) - goto out; - - for (; !list_empty(page_list) && len > 0; - rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { - struct page *page = - list_entry(page_list->prev, struct page, lru); + dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, + off, len); + vino = ceph_vino(inode); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, + 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, + ci->i_truncate_seq, ci->i_truncate_size, + false); + if (IS_ERR(req)) + return PTR_ERR(req); + /* build page vector */ + nr_pages = calc_pages_for(0, len); + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + ret = -ENOMEM; + if (!pages) + goto out; + for (i = 0; i < nr_pages; ++i) { + page = list_entry(page_list->prev, struct page, lru); + BUG_ON(PageLocked(page)); list_del(&page->lru); - if (rc < (int)PAGE_CACHE_SIZE) { - /* zero (remainder of) page */ - int s = rc < 0 ? 0 : rc; - zero_user_segment(page, s, PAGE_CACHE_SIZE); - } - - if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { + dout("start_read %p adding %p idx %lu\n", inode, page, + page->index); + if (add_to_page_cache_lru(page, &inode->i_data, page->index, + GFP_NOFS)) { + ceph_fscache_uncache_page(inode, page); page_cache_release(page); - dout("readpages %p add_to_page_cache failed %p\n", + dout("start_read %p add_to_page_cache failed %p\n", inode, page); - continue; + nr_pages = i; + goto out_pages; } - dout("readpages %p adding %p idx %lu\n", inode, page, - page->index); - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); + pages[i] = page; } - rc = 0; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); + req->r_callback = finish_read; + req->r_inode = inode; + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); + + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); + ret = ceph_osdc_start_request(osdc, req, false); + if (ret < 0) + goto out_pages; + ceph_osdc_put_request(req); + return nr_pages; + +out_pages: + ceph_unlock_page_vector(pages, nr_pages); + ceph_release_page_vector(pages, nr_pages); out: - kfree(pages); + ceph_osdc_put_request(req); + return ret; +} + + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. + */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int rc = 0; + int max = 0; + + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, + &nr_pages); + + if (rc == 0) + goto out; + + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) + max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) + >> PAGE_SHIFT; + + dout("readpages %p file %p nr_pages %d max %d\n", inode, + file, nr_pages, + max); + while (!list_empty(page_list)) { + rc = start_read(inode, page_list, max); + if (rc < 0) + goto out; + BUG_ON(rc == 0); + } +out: + ceph_fscache_readpages_cancel(inode, page_list); + + dout("readpages %p file %p ret %d\n", inode, file, rc); return rc; } @@ -340,7 +421,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, struct ceph_snap_context *snapc = NULL; struct ceph_cap_snap *capsnap = NULL; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); @@ -351,12 +432,12 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, break; } } - if (!snapc && ci->i_head_snapc) { + if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return snapc; } @@ -370,15 +451,14 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { struct inode *inode; struct ceph_inode_info *ci; - struct ceph_client *client; + struct ceph_fs_client *fsc; struct ceph_osd_client *osdc; - loff_t page_off = page->index << PAGE_CACHE_SHIFT; - int len = PAGE_CACHE_SIZE; - loff_t i_size; - int err = 0; struct ceph_snap_context *snapc, *oldest; - u64 snap_size = 0; + loff_t page_off = page_offset(page); long writeback_stat; + u64 truncate_size, snap_size = 0; + u32 truncate_seq; + int err = 0, len = PAGE_CACHE_SIZE; dout("writepage %p idx %lu\n", page, page->index); @@ -388,11 +468,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } inode = page->mapping->host; ci = ceph_inode(inode); - client = ceph_inode_to_client(inode); - osdc = &client->osdc; + fsc = ceph_inode_to_client(inode); + osdc = &fsc->client->osdc; /* verify this is a writeable snap context */ - snapc = (void *)page->private; + snapc = page_snap_context(page); if (snapc == NULL) { dout("writepage %p page %p not dirty?\n", inode, page); goto out; @@ -400,7 +480,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) oldest = get_oldest_context(inode, &snap_size); if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", - inode, page, (void *)page->private); + inode, page, snapc); /* we should only noop if called by kswapd */ WARN_ON((current->flags & PF_MEMALLOC) == 0); ceph_put_snap_context(oldest); @@ -408,29 +488,37 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + /* is this a partial page at end of file? */ - if (snap_size) - i_size = snap_size; - else - i_size = i_size_read(inode); - if (i_size < page_off + len) - len = i_size - page_off; + if (page_off >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); + goto out; + } + if (snap_size < page_off + len) + len = snap_size - page_off; - dout("writepage %p page %p index %lu on %llu~%u\n", - inode, page, page->index, page_off, len); + dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", + inode, page, page->index, page_off, len, snapc); - writeback_stat = atomic_long_inc_return(&client->writeback_count); + writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > - CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) - set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) + set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + + ceph_readpage_to_fscache(inode, page); set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, - ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, - &page, 1, 0, 0, true); + truncate_seq, truncate_size, + &inode->i_mtime, &page, 1); if (err < 0) { dout("writepage setting page/mapping error %d %p\n", err, page); SetPageError(page); @@ -455,7 +543,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) int err; struct inode *inode = page->mapping->host; BUG_ON(!inode); - igrab(inode); + ihold(inode); err = writepage_nounlock(page, wbc); unlock_page(page); iput(inode); @@ -480,7 +568,6 @@ static void ceph_release_pages(struct page **pages, int num) pagevec_release(&pvec); } - /* * async writeback completion handler. * @@ -491,27 +578,24 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; - struct ceph_osd_reply_head *replyhead; - struct ceph_osd_op *op; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_data *osd_data; unsigned wrote; struct page *page; + int num_pages; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; - __s32 rc = -EIO; - u64 bytes = 0; - struct ceph_client *client = ceph_inode_to_client(inode); + int rc = req->r_result; + u64 bytes = req->r_ops[0].extent.length; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - op = (void *)(replyhead + 1); - rc = le32_to_cpu(replyhead->result); - bytes = le64_to_cpu(op->extent.length); - + osd_data = osd_req_op_extent_osd_data(req, 0); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -519,7 +603,7 @@ static void writepages_finish(struct ceph_osd_request *req, * raced with a truncation and was adjusted at the osd, * so don't believe the reply. */ - wrote = req->r_num_pages; + wrote = num_pages; } else { wrote = 0; mapping_set_error(mapping, rc); @@ -528,19 +612,19 @@ static void writepages_finish(struct ceph_osd_request *req, inode, rc, bytes, wrote); /* clean all pages */ - for (i = 0; i < req->r_num_pages; i++) { - page = req->r_pages[i]; + for (i = 0; i < num_pages; i++) { + page = osd_data->pages[i]; BUG_ON(!page); WARN_ON(!PageUptodate(page)); writeback_stat = - atomic_long_dec_return(&client->writeback_count); + atomic_long_dec_return(&fsc->writeback_count); if (writeback_stat < - CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) - clear_bdi_congested(&client->backing_dev_info, + CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) + clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); - ceph_put_snap_context((void *)page->private); + ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); dout("unlocking %d %p\n", i, page); @@ -552,50 +636,33 @@ static void writepages_finish(struct ceph_osd_request *req, * page truncation thread, possibly losing some data that * raced its way in */ - if ((issued & CEPH_CAP_FILE_CACHE) == 0) + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) generic_error_remove_page(inode->i_mapping, page); unlock_page(page); } dout("%p wrote+cleaned %d pages\n", inode, wrote); - ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); + ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); - ceph_release_pages(req->r_pages, req->r_num_pages); - if (req->r_pages_from_pool) - mempool_free(req->r_pages, + ceph_release_pages(osd_data->pages, num_pages); + if (osd_data->pages_from_pool) + mempool_free(osd_data->pages, ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else - kfree(req->r_pages); + kfree(osd_data->pages); ceph_osdc_put_request(req); } /* - * allocate a page vec, either directly, or if necessary, via a the - * mempool. we avoid the mempool if we can because req->r_num_pages - * may be less than the maximum write size. - */ -static void alloc_page_vec(struct ceph_client *client, - struct ceph_osd_request *req) -{ - req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, - GFP_NOFS); - if (!req->r_pages) { - req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); - req->r_pages_from_pool = 1; - WARN_ON(!req->r_pages); - } -} - -/* * initiate async writeback */ static int ceph_writepages_start(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct backing_dev_info *bdi = mapping->backing_dev_info; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_vino vino = ceph_vino(inode); pgoff_t index, start, end; int range_whole = 0; int should_loop = 1; @@ -607,41 +674,34 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync; - u64 snap_size = 0; + u64 truncate_size, snap_size; + u32 truncate_seq; /* * Include a 'sync' in the OSD request if this is a data * integrity write (e.g., O_SYNC write or fsync()), or if our * cap is being revoked. */ - do_sync = wbc->sync_mode == WB_SYNC_ALL; - if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) + if ((wbc->sync_mode == WB_SYNC_ALL) || + ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) do_sync = 1; dout("writepages_start %p dosync=%d (mode=%s)\n", inode, do_sync, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - client = ceph_inode_to_client(inode); - if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { - pr_warning("writepage_start %p on forced umount\n", inode); + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { + pr_warn("writepage_start %p on forced umount\n", inode); return -EIO; /* we're in a forced umount, don't write! */ } - if (client->mount_args->wsize && client->mount_args->wsize < wsize) - wsize = client->mount_args->wsize; + if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + wsize = fsc->mount_options->wsize; if (wsize < PAGE_CACHE_SIZE) wsize = PAGE_CACHE_SIZE; max_pages_ever = wsize >> PAGE_CACHE_SHIFT; pagevec_init(&pvec, 0); - /* ?? */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - dout(" writepages congested\n"); - wbc->encountered_congestion = 1; - goto out_final; - } - /* where to start/end? */ if (wbc->range_cyclic) { start = mapping->writeback_index; /* Start from prev offset */ @@ -660,6 +720,7 @@ static int ceph_writepages_start(struct address_space *mapping, retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); + snap_size = 0; snapc = get_oldest_context(inode, &snap_size); if (!snapc) { /* hmm, why does writepages get called when there @@ -667,8 +728,18 @@ retry: dout(" no snap context with dirty data?\n"); goto out; } + if (snap_size == 0) + snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); + + spin_lock(&ci->i_ceph_lock); + truncate_seq = ci->i_truncate_seq; + truncate_size = ci->i_truncate_size; + if (!snap_size) + snap_size = i_size_read(inode); + spin_unlock(&ci->i_ceph_lock); + if (last_snapc && snapc != last_snapc) { /* if we switched to a newer snapc, restart our scan at the * start of the original file range. */ @@ -679,15 +750,16 @@ retry: last_snapc = snapc; while (!done && index <= end) { + int num_ops = do_sync ? 2 : 1; unsigned i; int first; pgoff_t next; int pvec_pages, locked_pages; + struct page **pages = NULL; + mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; int want; u64 offset, len; - struct ceph_osd_request_head *reqhead; - struct ceph_osd_op *op; long writeback_stat; next = 0; @@ -736,11 +808,8 @@ get_more_pages: dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if ((snap_size && page_offset(page) > snap_size) || - (!snap_size && - page_offset(page) > i_size_read(inode))) { - dout("%p page eof %llu\n", page, snap_size ? - snap_size : i_size_read(inode)); + if (page_offset(page) >= snap_size) { + dout("%p page eof %llu\n", page, snap_size); done = 1; unlock_page(page); break; @@ -752,7 +821,7 @@ get_more_pages: } /* only if matching snap context */ - pgsnapc = (void *)page->private; + pgsnapc = page_snap_context(page); if (pgsnapc->seq > snapc->seq) { dout("page snapc %p %lld > oldest %p %lld\n", pgsnapc, pgsnapc->seq, snapc, snapc->seq); @@ -768,27 +837,42 @@ get_more_pages: break; } - /* ok */ + /* + * We have something to write. If this is + * the first locked page this time through, + * allocate an osd request and a page array + * that it will use. + */ if (locked_pages == 0) { + BUG_ON(pages); /* prepare async write request */ - offset = page->index << PAGE_CACHE_SHIFT; + offset = (u64)page_offset(page); len = wsize; - req = ceph_osdc_new_request(&client->osdc, - &ci->i_layout, - ceph_vino(inode), - offset, &len, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ONDISK, - snapc, do_sync, - ci->i_truncate_seq, - ci->i_truncate_size, - &inode->i_mtime, true, 1); - max_pages = req->r_num_pages; - - alloc_page_vec(client, req); + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, num_ops, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + snapc, truncate_seq, + truncate_size, true); + if (IS_ERR(req)) { + rc = PTR_ERR(req); + unlock_page(page); + break; + } + req->r_callback = writepages_finish; req->r_inode = inode; + + max_pages = calc_pages_for(0, (u64)len); + pages = kmalloc(max_pages * sizeof (*pages), + GFP_NOFS); + if (!pages) { + pool = fsc->wb_pagevec_pool; + pages = mempool_alloc(pool, GFP_NOFS); + BUG_ON(!pages); + } } /* note position of first page in pvec */ @@ -797,13 +881,16 @@ get_more_pages: dout("%p will write page %p idx %lu\n", inode, page, page->index); - writeback_stat = atomic_long_inc_return(&client->writeback_count); - if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { - set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + writeback_stat = + atomic_long_inc_return(&fsc->writeback_count); + if (writeback_stat > CONGESTION_ON_THRESH( + fsc->mount_options->congestion_kb)) { + set_bdi_congested(&fsc->backing_dev_info, + BLK_RW_ASYNC); } set_page_writeback(page); - req->r_pages[locked_pages] = page; + pages[locked_pages] = page; locked_pages++; next = page->index + 1; } @@ -832,22 +919,30 @@ get_more_pages: pvec.nr -= i-first; } - /* submit the write */ - offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; - len = min((snap_size ? snap_size : i_size_read(inode)) - offset, + /* Format the osd request message and submit the write */ + + offset = page_offset(pages[0]); + len = min(snap_size - offset, (u64)locked_pages << PAGE_CACHE_SHIFT); dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); - /* revise final length, page count */ - req->r_num_pages = locked_pages; - reqhead = req->r_request->front.iov_base; - op = (void *)(reqhead + 1); - op->extent.length = cpu_to_le64(len); - op->payload_len = cpu_to_le32(len); - req->r_request->hdr.data_len = cpu_to_le32(len); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + !!pool, false); + + pages = NULL; /* request message now owns the pages array */ + pool = NULL; - ceph_osdc_start_request(&client->osdc, req, true); + /* Update the write op length in case we changed it */ + + osd_req_op_extent_update(req, 0, len); + + vino = ceph_vino(inode); + ceph_osdc_build_request(req, offset, snapc, vino.snap, + &inode->i_mtime); + + rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); + BUG_ON(rc); req = NULL; /* continue? */ @@ -879,11 +974,8 @@ release_pvec_pages: out: if (req) ceph_osdc_put_request(req); - if (rc > 0) - rc = 0; /* vfs expects us to return 0 */ ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); -out_final: return rc; } @@ -914,9 +1006,9 @@ static int ceph_update_writeable_page(struct file *file, loff_t pos, unsigned len, struct page *page) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t page_off = pos & PAGE_CACHE_MASK; int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; @@ -932,7 +1024,7 @@ retry_locked: BUG_ON(!ci->i_snap_realm); down_read(&mdsc->snap_rwsem); BUG_ON(!ci->i_snap_realm->cached_context); - snapc = (void *)page->private; + snapc = page_snap_context(page); if (snapc && snapc != ci->i_head_snapc) { /* * this page is already dirty in another (older) snap @@ -1023,7 +1115,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct page *page; pgoff_t index = pos >> PAGE_CACHE_SHIFT; int r; @@ -1036,7 +1128,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, - inode, page, (int)pos, (int)len); + inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); } while (r == -EAGAIN); @@ -1053,9 +1145,9 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = file->f_dentry->d_inode; - struct ceph_client *client = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; @@ -1092,8 +1184,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, * never get called. */ static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, - const struct iovec *iov, - loff_t pos, unsigned long nr_segs) + struct iov_iter *iter, + loff_t pos) { WARN_ON(1); return -EINVAL; @@ -1116,27 +1208,83 @@ const struct address_space_operations ceph_aops = { /* * vm ops */ +static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; + int want, got, ret; + + dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", + inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); + + ret = filemap_fault(vma, vmf); + + dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + + return ret; +} /* * Reuse write_begin here for simplicity. */ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct inode *inode = file_inode(vma->vm_file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_info *fi = vma->vm_file->private_data; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct page *page = vmf->page; - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; - loff_t off = page->index << PAGE_CACHE_SHIFT; - loff_t size, len; - int ret; + loff_t off = page_offset(page); + loff_t size = i_size_read(inode); + size_t len; + int want, got, ret; - size = i_size_read(inode); if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; else len = size & ~PAGE_CACHE_MASK; - dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, - off, len, page, page->index); + dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", + inode, ceph_vinop(inode), off, len, size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + while (1) { + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); + if (ret == 0) + break; + if (ret != -ERESTARTSYS) { + WARN_ON(1); + return VM_FAULT_SIGBUS; + } + } + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", + inode, off, len, ceph_cap_string(got)); + + /* Update time before taking page lock */ + file_update_time(vma->vm_file); lock_page(page); @@ -1158,15 +1306,28 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); - if (ret != VM_FAULT_LOCKED) + if (ret != VM_FAULT_LOCKED) { unlock_page(page); + } else { + int dirty; + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", + inode, off, len, ceph_cap_string(got), ret); + ceph_put_cap_refs(ci, got); + return ret; } static struct vm_operations_struct ceph_vmops = { - .fault = filemap_fault, + .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) @@ -1177,6 +1338,5 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) return -ENOEXEC; file_accessed(file); vma->vm_ops = &ceph_vmops; - vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c deleted file mode 100644 index 67b2c030924..00000000000 --- a/fs/ceph/armor.c +++ /dev/null @@ -1,99 +0,0 @@ - -#include <linux/errno.h> - -/* - * base64 encode/decode. - */ - -const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - -static int encode_bits(int c) -{ - return pem_key[c]; -} - -static int decode_bits(char c) -{ - if (c >= 'A' && c <= 'Z') - return c - 'A'; - if (c >= 'a' && c <= 'z') - return c - 'a' + 26; - if (c >= '0' && c <= '9') - return c - '0' + 52; - if (c == '+') - return 62; - if (c == '/') - return 63; - if (c == '=') - return 0; /* just non-negative, please */ - return -EINVAL; -} - -int ceph_armor(char *dst, const char *src, const char *end) -{ - int olen = 0; - int line = 0; - - while (src < end) { - unsigned char a, b, c; - - a = *src++; - *dst++ = encode_bits(a >> 2); - if (src < end) { - b = *src++; - *dst++ = encode_bits(((a & 3) << 4) | (b >> 4)); - if (src < end) { - c = *src++; - *dst++ = encode_bits(((b & 15) << 2) | - (c >> 6)); - *dst++ = encode_bits(c & 63); - } else { - *dst++ = encode_bits((b & 15) << 2); - *dst++ = '='; - } - } else { - *dst++ = encode_bits(((a & 3) << 4)); - *dst++ = '='; - *dst++ = '='; - } - olen += 4; - line += 4; - if (line == 64) { - line = 0; - *(dst++) = '\n'; - olen++; - } - } - return olen; -} - -int ceph_unarmor(char *dst, const char *src, const char *end) -{ - int olen = 0; - - while (src < end) { - int a, b, c, d; - - if (src < end && src[0] == '\n') - src++; - if (src + 4 > end) - return -EINVAL; - a = decode_bits(src[0]); - b = decode_bits(src[1]); - c = decode_bits(src[2]); - d = decode_bits(src[3]); - if (a < 0 || b < 0 || c < 0 || d < 0) - return -EINVAL; - - *dst++ = (a << 2) | (b >> 4); - if (src[2] == '=') - return olen + 1; - *dst++ = ((b & 15) << 4) | (c >> 2); - if (src[3] == '=') - return olen + 2; - *dst++ = ((c & 3) << 6) | d; - olen += 3; - src += 4; - } - return olen; -} diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c deleted file mode 100644 index 89490beaf53..00000000000 --- a/fs/ceph/auth.c +++ /dev/null @@ -1,259 +0,0 @@ -#include "ceph_debug.h" - -#include <linux/module.h> -#include <linux/err.h> -#include <linux/slab.h> - -#include "types.h" -#include "auth_none.h" -#include "auth_x.h" -#include "decode.h" -#include "super.h" - -#include "messenger.h" - -/* - * get protocol handler - */ -static u32 supported_protocols[] = { - CEPH_AUTH_NONE, - CEPH_AUTH_CEPHX -}; - -int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) -{ - switch (protocol) { - case CEPH_AUTH_NONE: - return ceph_auth_none_init(ac); - case CEPH_AUTH_CEPHX: - return ceph_x_init(ac); - default: - return -ENOENT; - } -} - -/* - * setup, teardown. - */ -struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret) -{ - struct ceph_auth_client *ac; - int ret; - - dout("auth_init name '%s' secret '%s'\n", name, secret); - - ret = -ENOMEM; - ac = kzalloc(sizeof(*ac), GFP_NOFS); - if (!ac) - goto out; - - ac->negotiating = true; - if (name) - ac->name = name; - else - ac->name = CEPH_AUTH_NAME_DEFAULT; - dout("auth_init name %s secret %s\n", ac->name, secret); - ac->secret = secret; - return ac; - -out: - return ERR_PTR(ret); -} - -void ceph_auth_destroy(struct ceph_auth_client *ac) -{ - dout("auth_destroy %p\n", ac); - if (ac->ops) - ac->ops->destroy(ac); - kfree(ac); -} - -/* - * Reset occurs when reconnecting to the monitor. - */ -void ceph_auth_reset(struct ceph_auth_client *ac) -{ - dout("auth_reset %p\n", ac); - if (ac->ops && !ac->negotiating) - ac->ops->reset(ac); - ac->negotiating = true; -} - -int ceph_entity_name_encode(const char *name, void **p, void *end) -{ - int len = strlen(name); - - if (*p + 2*sizeof(u32) + len > end) - return -ERANGE; - ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT); - ceph_encode_32(p, len); - ceph_encode_copy(p, name, len); - return 0; -} - -/* - * Initiate protocol negotiation with monitor. Include entity name - * and list supported protocols. - */ -int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) -{ - struct ceph_mon_request_header *monhdr = buf; - void *p = monhdr + 1, *end = buf + len, *lenp; - int i, num; - int ret; - - dout("auth_build_hello\n"); - monhdr->have_version = 0; - monhdr->session_mon = cpu_to_le16(-1); - monhdr->session_mon_tid = 0; - - ceph_encode_32(&p, 0); /* no protocol, yet */ - - lenp = p; - p += sizeof(u32); - - ceph_decode_need(&p, end, 1 + sizeof(u32), bad); - ceph_encode_8(&p, 1); - num = ARRAY_SIZE(supported_protocols); - ceph_encode_32(&p, num); - ceph_decode_need(&p, end, num * sizeof(u32), bad); - for (i = 0; i < num; i++) - ceph_encode_32(&p, supported_protocols[i]); - - ret = ceph_entity_name_encode(ac->name, &p, end); - if (ret < 0) - return ret; - ceph_decode_need(&p, end, sizeof(u64), bad); - ceph_encode_64(&p, ac->global_id); - - ceph_encode_32(&lenp, p - lenp - sizeof(u32)); - return p - buf; - -bad: - return -ERANGE; -} - -int ceph_build_auth_request(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len) -{ - struct ceph_mon_request_header *monhdr = msg_buf; - void *p = monhdr + 1; - void *end = msg_buf + msg_len; - int ret; - - monhdr->have_version = 0; - monhdr->session_mon = cpu_to_le16(-1); - monhdr->session_mon_tid = 0; - - ceph_encode_32(&p, ac->protocol); - - ret = ac->ops->build_request(ac, p + sizeof(u32), end); - if (ret < 0) { - pr_err("error %d building auth method %s request\n", ret, - ac->ops->name); - return ret; - } - dout(" built request %d bytes\n", ret); - ceph_encode_32(&p, ret); - return p + ret - msg_buf; -} - -/* - * Handle auth message from monitor. - */ -int ceph_handle_auth_reply(struct ceph_auth_client *ac, - void *buf, size_t len, - void *reply_buf, size_t reply_len) -{ - void *p = buf; - void *end = buf + len; - int protocol; - s32 result; - u64 global_id; - void *payload, *payload_end; - int payload_len; - char *result_msg; - int result_msg_len; - int ret = -EINVAL; - - dout("handle_auth_reply %p %p\n", p, end); - ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); - protocol = ceph_decode_32(&p); - result = ceph_decode_32(&p); - global_id = ceph_decode_64(&p); - payload_len = ceph_decode_32(&p); - payload = p; - p += payload_len; - ceph_decode_need(&p, end, sizeof(u32), bad); - result_msg_len = ceph_decode_32(&p); - result_msg = p; - p += result_msg_len; - if (p != end) - goto bad; - - dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len, - result_msg, global_id, payload_len); - - payload_end = payload + payload_len; - - if (global_id && ac->global_id != global_id) { - dout(" set global_id %lld -> %lld\n", ac->global_id, global_id); - ac->global_id = global_id; - } - - if (ac->negotiating) { - /* server does not support our protocols? */ - if (!protocol && result < 0) { - ret = result; - goto out; - } - /* set up (new) protocol handler? */ - if (ac->protocol && ac->protocol != protocol) { - ac->ops->destroy(ac); - ac->protocol = 0; - ac->ops = NULL; - } - if (ac->protocol != protocol) { - ret = ceph_auth_init_protocol(ac, protocol); - if (ret) { - pr_err("error %d on auth protocol %d init\n", - ret, protocol); - goto out; - } - } - - ac->negotiating = false; - } - - ret = ac->ops->handle_reply(ac, result, payload, payload_end); - if (ret == -EAGAIN) { - return ceph_build_auth_request(ac, reply_buf, reply_len); - } else if (ret) { - pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - return ret; - } - return 0; - -bad: - pr_err("failed to decode auth msg\n"); -out: - return ret; -} - -int ceph_build_auth(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len) -{ - if (!ac->protocol) - return ceph_auth_build_hello(ac, msg_buf, msg_len); - BUG_ON(!ac->ops); - if (ac->ops->should_authenticate(ac)) - return ceph_build_auth_request(ac, msg_buf, msg_len); - return 0; -} - -int ceph_auth_is_authenticated(struct ceph_auth_client *ac) -{ - if (!ac->ops) - return 0; - return ac->ops->is_authenticated(ac); -} diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h deleted file mode 100644 index d38a2fb4a13..00000000000 --- a/fs/ceph/auth.h +++ /dev/null @@ -1,92 +0,0 @@ -#ifndef _FS_CEPH_AUTH_H -#define _FS_CEPH_AUTH_H - -#include "types.h" -#include "buffer.h" - -/* - * Abstract interface for communicating with the authenticate module. - * There is some handshake that takes place between us and the monitor - * to acquire the necessary keys. These are used to generate an - * 'authorizer' that we use when connecting to a service (mds, osd). - */ - -struct ceph_auth_client; -struct ceph_authorizer; - -struct ceph_auth_client_ops { - const char *name; - - /* - * true if we are authenticated and can connect to - * services. - */ - int (*is_authenticated)(struct ceph_auth_client *ac); - - /* - * true if we should (re)authenticate, e.g., when our tickets - * are getting old and crusty. - */ - int (*should_authenticate)(struct ceph_auth_client *ac); - - /* - * build requests and process replies during monitor - * handshake. if handle_reply returns -EAGAIN, we build - * another request. - */ - int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); - int (*handle_reply)(struct ceph_auth_client *ac, int result, - void *buf, void *end); - - /* - * Create authorizer for connecting to a service, and verify - * the response to authenticate the service. - */ - int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type, - struct ceph_authorizer **a, - void **buf, size_t *len, - void **reply_buf, size_t *reply_len); - int (*verify_authorizer_reply)(struct ceph_auth_client *ac, - struct ceph_authorizer *a, size_t len); - void (*destroy_authorizer)(struct ceph_auth_client *ac, - struct ceph_authorizer *a); - void (*invalidate_authorizer)(struct ceph_auth_client *ac, - int peer_type); - - /* reset when we (re)connect to a monitor */ - void (*reset)(struct ceph_auth_client *ac); - - void (*destroy)(struct ceph_auth_client *ac); -}; - -struct ceph_auth_client { - u32 protocol; /* CEPH_AUTH_* */ - void *private; /* for use by protocol implementation */ - const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */ - - bool negotiating; /* true if negotiating protocol */ - const char *name; /* entity name */ - u64 global_id; /* our unique id in system */ - const char *secret; /* our secret key */ - unsigned want_keys; /* which services we want */ -}; - -extern struct ceph_auth_client *ceph_auth_init(const char *name, - const char *secret); -extern void ceph_auth_destroy(struct ceph_auth_client *ac); - -extern void ceph_auth_reset(struct ceph_auth_client *ac); - -extern int ceph_auth_build_hello(struct ceph_auth_client *ac, - void *buf, size_t len); -extern int ceph_handle_auth_reply(struct ceph_auth_client *ac, - void *buf, size_t len, - void *reply_buf, size_t reply_len); -extern int ceph_entity_name_encode(const char *name, void **p, void *end); - -extern int ceph_build_auth(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len); - -extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); - -#endif diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c deleted file mode 100644 index ad1dc21286c..00000000000 --- a/fs/ceph/auth_none.c +++ /dev/null @@ -1,131 +0,0 @@ - -#include "ceph_debug.h" - -#include <linux/err.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/slab.h> - -#include "auth_none.h" -#include "auth.h" -#include "decode.h" - -static void reset(struct ceph_auth_client *ac) -{ - struct ceph_auth_none_info *xi = ac->private; - - xi->starting = true; - xi->built_authorizer = false; -} - -static void destroy(struct ceph_auth_client *ac) -{ - kfree(ac->private); - ac->private = NULL; -} - -static int is_authenticated(struct ceph_auth_client *ac) -{ - struct ceph_auth_none_info *xi = ac->private; - - return !xi->starting; -} - -static int should_authenticate(struct ceph_auth_client *ac) -{ - struct ceph_auth_none_info *xi = ac->private; - - return xi->starting; -} - -/* - * the generic auth code decode the global_id, and we carry no actual - * authenticate state, so nothing happens here. - */ -static int handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) -{ - struct ceph_auth_none_info *xi = ac->private; - - xi->starting = false; - return result; -} - -/* - * build an 'authorizer' with our entity_name and global_id. we can - * reuse a single static copy since it is identical for all services - * we connect to. - */ -static int ceph_auth_none_create_authorizer( - struct ceph_auth_client *ac, int peer_type, - struct ceph_authorizer **a, - void **buf, size_t *len, - void **reply_buf, size_t *reply_len) -{ - struct ceph_auth_none_info *ai = ac->private; - struct ceph_none_authorizer *au = &ai->au; - void *p, *end; - int ret; - - if (!ai->built_authorizer) { - p = au->buf; - end = p + sizeof(au->buf); - ceph_encode_8(&p, 1); - ret = ceph_entity_name_encode(ac->name, &p, end - 8); - if (ret < 0) - goto bad; - ceph_decode_need(&p, end, sizeof(u64), bad2); - ceph_encode_64(&p, ac->global_id); - au->buf_len = p - (void *)au->buf; - ai->built_authorizer = true; - dout("built authorizer len %d\n", au->buf_len); - } - - *a = (struct ceph_authorizer *)au; - *buf = au->buf; - *len = au->buf_len; - *reply_buf = au->reply_buf; - *reply_len = sizeof(au->reply_buf); - return 0; - -bad2: - ret = -ERANGE; -bad: - return ret; -} - -static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) -{ - /* nothing to do */ -} - -static const struct ceph_auth_client_ops ceph_auth_none_ops = { - .name = "none", - .reset = reset, - .destroy = destroy, - .is_authenticated = is_authenticated, - .should_authenticate = should_authenticate, - .handle_reply = handle_reply, - .create_authorizer = ceph_auth_none_create_authorizer, - .destroy_authorizer = ceph_auth_none_destroy_authorizer, -}; - -int ceph_auth_none_init(struct ceph_auth_client *ac) -{ - struct ceph_auth_none_info *xi; - - dout("ceph_auth_none_init %p\n", ac); - xi = kzalloc(sizeof(*xi), GFP_NOFS); - if (!xi) - return -ENOMEM; - - xi->starting = true; - xi->built_authorizer = false; - - ac->protocol = CEPH_AUTH_NONE; - ac->private = xi; - ac->ops = &ceph_auth_none_ops; - return 0; -} - diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h deleted file mode 100644 index 8164df1a08b..00000000000 --- a/fs/ceph/auth_none.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _FS_CEPH_AUTH_NONE_H -#define _FS_CEPH_AUTH_NONE_H - -#include <linux/slab.h> - -#include "auth.h" - -/* - * null security mode. - * - * we use a single static authorizer that simply encodes our entity name - * and global id. - */ - -struct ceph_none_authorizer { - char buf[128]; - int buf_len; - char reply_buf[0]; -}; - -struct ceph_auth_none_info { - bool starting; - bool built_authorizer; - struct ceph_none_authorizer au; /* we only need one; it's static */ -}; - -extern int ceph_auth_none_init(struct ceph_auth_client *ac); - -#endif - diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c deleted file mode 100644 index 6d44053ecff..00000000000 --- a/fs/ceph/auth_x.c +++ /dev/null @@ -1,684 +0,0 @@ - -#include "ceph_debug.h" - -#include <linux/err.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/slab.h> - -#include "auth_x.h" -#include "auth_x_protocol.h" -#include "crypto.h" -#include "auth.h" -#include "decode.h" - -#define TEMP_TICKET_BUF_LEN 256 - -static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); - -static int ceph_x_is_authenticated(struct ceph_auth_client *ac) -{ - struct ceph_x_info *xi = ac->private; - int need; - - ceph_x_validate_tickets(ac, &need); - dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", - ac->want_keys, need, xi->have_keys); - return (ac->want_keys & xi->have_keys) == ac->want_keys; -} - -static int ceph_x_should_authenticate(struct ceph_auth_client *ac) -{ - struct ceph_x_info *xi = ac->private; - int need; - - ceph_x_validate_tickets(ac, &need); - dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", - ac->want_keys, need, xi->have_keys); - return need != 0; -} - -static int ceph_x_encrypt_buflen(int ilen) -{ - return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + - sizeof(u32); -} - -static int ceph_x_encrypt(struct ceph_crypto_key *secret, - void *ibuf, int ilen, void *obuf, size_t olen) -{ - struct ceph_x_encrypt_header head = { - .struct_v = 1, - .magic = cpu_to_le64(CEPHX_ENC_MAGIC) - }; - size_t len = olen - sizeof(u32); - int ret; - - ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len, - &head, sizeof(head), ibuf, ilen); - if (ret) - return ret; - ceph_encode_32(&obuf, len); - return len + sizeof(u32); -} - -static int ceph_x_decrypt(struct ceph_crypto_key *secret, - void **p, void *end, void *obuf, size_t olen) -{ - struct ceph_x_encrypt_header head; - size_t head_len = sizeof(head); - int len, ret; - - len = ceph_decode_32(p); - if (*p + len > end) - return -EINVAL; - - dout("ceph_x_decrypt len %d\n", len); - ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen, - *p, len); - if (ret) - return ret; - if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC) - return -EPERM; - *p += len; - return olen; -} - -/* - * get existing (or insert new) ticket handler - */ -struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac, - int service) -{ - struct ceph_x_ticket_handler *th; - struct ceph_x_info *xi = ac->private; - struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node; - - while (*p) { - parent = *p; - th = rb_entry(parent, struct ceph_x_ticket_handler, node); - if (service < th->service) - p = &(*p)->rb_left; - else if (service > th->service) - p = &(*p)->rb_right; - else - return th; - } - - /* add it */ - th = kzalloc(sizeof(*th), GFP_NOFS); - if (!th) - return ERR_PTR(-ENOMEM); - th->service = service; - rb_link_node(&th->node, parent, p); - rb_insert_color(&th->node, &xi->ticket_handlers); - return th; -} - -static void remove_ticket_handler(struct ceph_auth_client *ac, - struct ceph_x_ticket_handler *th) -{ - struct ceph_x_info *xi = ac->private; - - dout("remove_ticket_handler %p %d\n", th, th->service); - rb_erase(&th->node, &xi->ticket_handlers); - ceph_crypto_key_destroy(&th->session_key); - if (th->ticket_blob) - ceph_buffer_put(th->ticket_blob); - kfree(th); -} - -static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, - struct ceph_crypto_key *secret, - void *buf, void *end) -{ - struct ceph_x_info *xi = ac->private; - int num; - void *p = buf; - int ret; - char *dbuf; - char *ticket_buf; - u8 reply_struct_v; - - dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); - if (!dbuf) - return -ENOMEM; - - ret = -ENOMEM; - ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); - if (!ticket_buf) - goto out_dbuf; - - ceph_decode_need(&p, end, 1 + sizeof(u32), bad); - reply_struct_v = ceph_decode_8(&p); - if (reply_struct_v != 1) - goto bad; - num = ceph_decode_32(&p); - dout("%d tickets\n", num); - while (num--) { - int type; - u8 tkt_struct_v, blob_struct_v; - struct ceph_x_ticket_handler *th; - void *dp, *dend; - int dlen; - char is_enc; - struct timespec validity; - struct ceph_crypto_key old_key; - void *tp, *tpend; - struct ceph_timespec new_validity; - struct ceph_crypto_key new_session_key; - struct ceph_buffer *new_ticket_blob; - unsigned long new_expires, new_renew_after; - u64 new_secret_id; - - ceph_decode_need(&p, end, sizeof(u32) + 1, bad); - - type = ceph_decode_32(&p); - dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); - - tkt_struct_v = ceph_decode_8(&p); - if (tkt_struct_v != 1) - goto bad; - - th = get_ticket_handler(ac, type); - if (IS_ERR(th)) { - ret = PTR_ERR(th); - goto out; - } - - /* blob for me */ - dlen = ceph_x_decrypt(secret, &p, end, dbuf, - TEMP_TICKET_BUF_LEN); - if (dlen <= 0) { - ret = dlen; - goto out; - } - dout(" decrypted %d bytes\n", dlen); - dend = dbuf + dlen; - dp = dbuf; - - tkt_struct_v = ceph_decode_8(&dp); - if (tkt_struct_v != 1) - goto bad; - - memcpy(&old_key, &th->session_key, sizeof(old_key)); - ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); - if (ret) - goto out; - - ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); - ceph_decode_timespec(&validity, &new_validity); - new_expires = get_seconds() + validity.tv_sec; - new_renew_after = new_expires - (validity.tv_sec / 4); - dout(" expires=%lu renew_after=%lu\n", new_expires, - new_renew_after); - - /* ticket blob for service */ - ceph_decode_8_safe(&p, end, is_enc, bad); - tp = ticket_buf; - if (is_enc) { - /* encrypted */ - dout(" encrypted ticket\n"); - dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf, - TEMP_TICKET_BUF_LEN); - if (dlen < 0) { - ret = dlen; - goto out; - } - dlen = ceph_decode_32(&tp); - } else { - /* unencrypted */ - ceph_decode_32_safe(&p, end, dlen, bad); - ceph_decode_need(&p, end, dlen, bad); - ceph_decode_copy(&p, ticket_buf, dlen); - } - tpend = tp + dlen; - dout(" ticket blob is %d bytes\n", dlen); - ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); - blob_struct_v = ceph_decode_8(&tp); - new_secret_id = ceph_decode_64(&tp); - ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); - if (ret) - goto out; - - /* all is well, update our ticket */ - ceph_crypto_key_destroy(&th->session_key); - if (th->ticket_blob) - ceph_buffer_put(th->ticket_blob); - th->session_key = new_session_key; - th->ticket_blob = new_ticket_blob; - th->validity = new_validity; - th->secret_id = new_secret_id; - th->expires = new_expires; - th->renew_after = new_renew_after; - dout(" got ticket service %d (%s) secret_id %lld len %d\n", - type, ceph_entity_type_name(type), th->secret_id, - (int)th->ticket_blob->vec.iov_len); - xi->have_keys |= th->service; - } - - ret = 0; -out: - kfree(ticket_buf); -out_dbuf: - kfree(dbuf); - return ret; - -bad: - ret = -EINVAL; - goto out; -} - -static int ceph_x_build_authorizer(struct ceph_auth_client *ac, - struct ceph_x_ticket_handler *th, - struct ceph_x_authorizer *au) -{ - int maxlen; - struct ceph_x_authorize_a *msg_a; - struct ceph_x_authorize_b msg_b; - void *p, *end; - int ret; - int ticket_blob_len = - (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0); - - dout("build_authorizer for %s %p\n", - ceph_entity_type_name(th->service), au); - - maxlen = sizeof(*msg_a) + sizeof(msg_b) + - ceph_x_encrypt_buflen(ticket_blob_len); - dout(" need len %d\n", maxlen); - if (au->buf && au->buf->alloc_len < maxlen) { - ceph_buffer_put(au->buf); - au->buf = NULL; - } - if (!au->buf) { - au->buf = ceph_buffer_new(maxlen, GFP_NOFS); - if (!au->buf) - return -ENOMEM; - } - au->service = th->service; - - msg_a = au->buf->vec.iov_base; - msg_a->struct_v = 1; - msg_a->global_id = cpu_to_le64(ac->global_id); - msg_a->service_id = cpu_to_le32(th->service); - msg_a->ticket_blob.struct_v = 1; - msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id); - msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len); - if (ticket_blob_len) { - memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base, - th->ticket_blob->vec.iov_len); - } - dout(" th %p secret_id %lld %lld\n", th, th->secret_id, - le64_to_cpu(msg_a->ticket_blob.secret_id)); - - p = msg_a + 1; - p += ticket_blob_len; - end = au->buf->vec.iov_base + au->buf->vec.iov_len; - - get_random_bytes(&au->nonce, sizeof(au->nonce)); - msg_b.struct_v = 1; - msg_b.nonce = cpu_to_le64(au->nonce); - ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b), - p, end - p); - if (ret < 0) - goto out_buf; - p += ret; - au->buf->vec.iov_len = p - au->buf->vec.iov_base; - dout(" built authorizer nonce %llx len %d\n", au->nonce, - (int)au->buf->vec.iov_len); - BUG_ON(au->buf->vec.iov_len > maxlen); - return 0; - -out_buf: - ceph_buffer_put(au->buf); - au->buf = NULL; - return ret; -} - -static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th, - void **p, void *end) -{ - ceph_decode_need(p, end, 1 + sizeof(u64), bad); - ceph_encode_8(p, 1); - ceph_encode_64(p, th->secret_id); - if (th->ticket_blob) { - const char *buf = th->ticket_blob->vec.iov_base; - u32 len = th->ticket_blob->vec.iov_len; - - ceph_encode_32_safe(p, end, len, bad); - ceph_encode_copy_safe(p, end, buf, len, bad); - } else { - ceph_encode_32_safe(p, end, 0, bad); - } - - return 0; -bad: - return -ERANGE; -} - -static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) -{ - int want = ac->want_keys; - struct ceph_x_info *xi = ac->private; - int service; - - *pneed = ac->want_keys & ~(xi->have_keys); - - for (service = 1; service <= want; service <<= 1) { - struct ceph_x_ticket_handler *th; - - if (!(ac->want_keys & service)) - continue; - - if (*pneed & service) - continue; - - th = get_ticket_handler(ac, service); - - if (!th) { - *pneed |= service; - continue; - } - - if (get_seconds() >= th->renew_after) - *pneed |= service; - if (get_seconds() >= th->expires) - xi->have_keys &= ~service; - } -} - - -static int ceph_x_build_request(struct ceph_auth_client *ac, - void *buf, void *end) -{ - struct ceph_x_info *xi = ac->private; - int need; - struct ceph_x_request_header *head = buf; - int ret; - struct ceph_x_ticket_handler *th = - get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); - - ceph_x_validate_tickets(ac, &need); - - dout("build_request want %x have %x need %x\n", - ac->want_keys, xi->have_keys, need); - - if (need & CEPH_ENTITY_TYPE_AUTH) { - struct ceph_x_authenticate *auth = (void *)(head + 1); - void *p = auth + 1; - struct ceph_x_challenge_blob tmp; - char tmp_enc[40]; - u64 *u; - - if (p > end) - return -ERANGE; - - dout(" get_auth_session_key\n"); - head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY); - - /* encrypt and hash */ - get_random_bytes(&auth->client_challenge, sizeof(u64)); - tmp.client_challenge = auth->client_challenge; - tmp.server_challenge = cpu_to_le64(xi->server_challenge); - ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp), - tmp_enc, sizeof(tmp_enc)); - if (ret < 0) - return ret; - - auth->struct_v = 1; - auth->key = 0; - for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) - auth->key ^= *u; - dout(" server_challenge %llx client_challenge %llx key %llx\n", - xi->server_challenge, le64_to_cpu(auth->client_challenge), - le64_to_cpu(auth->key)); - - /* now encode the old ticket if exists */ - ret = ceph_x_encode_ticket(th, &p, end); - if (ret < 0) - return ret; - - return p - buf; - } - - if (need) { - void *p = head + 1; - struct ceph_x_service_ticket_request *req; - - if (p > end) - return -ERANGE; - head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); - - BUG_ON(!th); - ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); - if (ret) - return ret; - ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base, - xi->auth_authorizer.buf->vec.iov_len); - - req = p; - req->keys = cpu_to_le32(need); - p += sizeof(*req); - return p - buf; - } - - return 0; -} - -static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) -{ - struct ceph_x_info *xi = ac->private; - struct ceph_x_reply_header *head = buf; - struct ceph_x_ticket_handler *th; - int len = end - buf; - int op; - int ret; - - if (result) - return result; /* XXX hmm? */ - - if (xi->starting) { - /* it's a hello */ - struct ceph_x_server_challenge *sc = buf; - - if (len != sizeof(*sc)) - return -EINVAL; - xi->server_challenge = le64_to_cpu(sc->server_challenge); - dout("handle_reply got server challenge %llx\n", - xi->server_challenge); - xi->starting = false; - xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH; - return -EAGAIN; - } - - op = le16_to_cpu(head->op); - result = le32_to_cpu(head->result); - dout("handle_reply op %d result %d\n", op, result); - switch (op) { - case CEPHX_GET_AUTH_SESSION_KEY: - /* verify auth key */ - ret = ceph_x_proc_ticket_reply(ac, &xi->secret, - buf + sizeof(*head), end); - break; - - case CEPHX_GET_PRINCIPAL_SESSION_KEY: - th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); - BUG_ON(!th); - ret = ceph_x_proc_ticket_reply(ac, &th->session_key, - buf + sizeof(*head), end); - break; - - default: - return -EINVAL; - } - if (ret) - return ret; - if (ac->want_keys == xi->have_keys) - return 0; - return -EAGAIN; -} - -static int ceph_x_create_authorizer( - struct ceph_auth_client *ac, int peer_type, - struct ceph_authorizer **a, - void **buf, size_t *len, - void **reply_buf, size_t *reply_len) -{ - struct ceph_x_authorizer *au; - struct ceph_x_ticket_handler *th; - int ret; - - th = get_ticket_handler(ac, peer_type); - if (IS_ERR(th)) - return PTR_ERR(th); - - au = kzalloc(sizeof(*au), GFP_NOFS); - if (!au) - return -ENOMEM; - - ret = ceph_x_build_authorizer(ac, th, au); - if (ret) { - kfree(au); - return ret; - } - - *a = (struct ceph_authorizer *)au; - *buf = au->buf->vec.iov_base; - *len = au->buf->vec.iov_len; - *reply_buf = au->reply_buf; - *reply_len = sizeof(au->reply_buf); - return 0; -} - -static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a, size_t len) -{ - struct ceph_x_authorizer *au = (void *)a; - struct ceph_x_ticket_handler *th; - int ret = 0; - struct ceph_x_authorize_reply reply; - void *p = au->reply_buf; - void *end = p + sizeof(au->reply_buf); - - th = get_ticket_handler(ac, au->service); - if (!th) - return -EIO; /* hrm! */ - ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); - if (ret < 0) - return ret; - if (ret != sizeof(reply)) - return -EPERM; - - if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one)) - ret = -EPERM; - else - ret = 0; - dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", - au->nonce, le64_to_cpu(reply.nonce_plus_one), ret); - return ret; -} - -static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) -{ - struct ceph_x_authorizer *au = (void *)a; - - ceph_buffer_put(au->buf); - kfree(au); -} - - -static void ceph_x_reset(struct ceph_auth_client *ac) -{ - struct ceph_x_info *xi = ac->private; - - dout("reset\n"); - xi->starting = true; - xi->server_challenge = 0; -} - -static void ceph_x_destroy(struct ceph_auth_client *ac) -{ - struct ceph_x_info *xi = ac->private; - struct rb_node *p; - - dout("ceph_x_destroy %p\n", ac); - ceph_crypto_key_destroy(&xi->secret); - - while ((p = rb_first(&xi->ticket_handlers)) != NULL) { - struct ceph_x_ticket_handler *th = - rb_entry(p, struct ceph_x_ticket_handler, node); - remove_ticket_handler(ac, th); - } - - if (xi->auth_authorizer.buf) - ceph_buffer_put(xi->auth_authorizer.buf); - - kfree(ac->private); - ac->private = NULL; -} - -static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, - int peer_type) -{ - struct ceph_x_ticket_handler *th; - - th = get_ticket_handler(ac, peer_type); - if (th && !IS_ERR(th)) - remove_ticket_handler(ac, th); -} - - -static const struct ceph_auth_client_ops ceph_x_ops = { - .name = "x", - .is_authenticated = ceph_x_is_authenticated, - .should_authenticate = ceph_x_should_authenticate, - .build_request = ceph_x_build_request, - .handle_reply = ceph_x_handle_reply, - .create_authorizer = ceph_x_create_authorizer, - .verify_authorizer_reply = ceph_x_verify_authorizer_reply, - .destroy_authorizer = ceph_x_destroy_authorizer, - .invalidate_authorizer = ceph_x_invalidate_authorizer, - .reset = ceph_x_reset, - .destroy = ceph_x_destroy, -}; - - -int ceph_x_init(struct ceph_auth_client *ac) -{ - struct ceph_x_info *xi; - int ret; - - dout("ceph_x_init %p\n", ac); - ret = -ENOMEM; - xi = kzalloc(sizeof(*xi), GFP_NOFS); - if (!xi) - goto out; - - ret = -EINVAL; - if (!ac->secret) { - pr_err("no secret set (for auth_x protocol)\n"); - goto out_nomem; - } - - ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret); - if (ret) - goto out_nomem; - - xi->starting = true; - xi->ticket_handlers = RB_ROOT; - - ac->protocol = CEPH_AUTH_CEPHX; - ac->private = xi; - ac->ops = &ceph_x_ops; - return 0; - -out_nomem: - kfree(xi); -out: - return ret; -} - - diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h deleted file mode 100644 index ff6f8180e68..00000000000 --- a/fs/ceph/auth_x.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef _FS_CEPH_AUTH_X_H -#define _FS_CEPH_AUTH_X_H - -#include <linux/rbtree.h> - -#include "crypto.h" -#include "auth.h" -#include "auth_x_protocol.h" - -/* - * Handle ticket for a single service. - */ -struct ceph_x_ticket_handler { - struct rb_node node; - unsigned service; - - struct ceph_crypto_key session_key; - struct ceph_timespec validity; - - u64 secret_id; - struct ceph_buffer *ticket_blob; - - unsigned long renew_after, expires; -}; - - -struct ceph_x_authorizer { - struct ceph_buffer *buf; - unsigned service; - u64 nonce; - char reply_buf[128]; /* big enough for encrypted blob */ -}; - -struct ceph_x_info { - struct ceph_crypto_key secret; - - bool starting; - u64 server_challenge; - - unsigned have_keys; - struct rb_root ticket_handlers; - - struct ceph_x_authorizer auth_authorizer; -}; - -extern int ceph_x_init(struct ceph_auth_client *ac); - -#endif - diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h deleted file mode 100644 index 671d30576c4..00000000000 --- a/fs/ceph/auth_x_protocol.h +++ /dev/null @@ -1,90 +0,0 @@ -#ifndef __FS_CEPH_AUTH_X_PROTOCOL -#define __FS_CEPH_AUTH_X_PROTOCOL - -#define CEPHX_GET_AUTH_SESSION_KEY 0x0100 -#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200 -#define CEPHX_GET_ROTATING_KEY 0x0400 - -/* common bits */ -struct ceph_x_ticket_blob { - __u8 struct_v; - __le64 secret_id; - __le32 blob_len; - char blob[]; -} __attribute__ ((packed)); - - -/* common request/reply headers */ -struct ceph_x_request_header { - __le16 op; -} __attribute__ ((packed)); - -struct ceph_x_reply_header { - __le16 op; - __le32 result; -} __attribute__ ((packed)); - - -/* authenticate handshake */ - -/* initial hello (no reply header) */ -struct ceph_x_server_challenge { - __u8 struct_v; - __le64 server_challenge; -} __attribute__ ((packed)); - -struct ceph_x_authenticate { - __u8 struct_v; - __le64 client_challenge; - __le64 key; - /* ticket blob */ -} __attribute__ ((packed)); - -struct ceph_x_service_ticket_request { - __u8 struct_v; - __le32 keys; -} __attribute__ ((packed)); - -struct ceph_x_challenge_blob { - __le64 server_challenge; - __le64 client_challenge; -} __attribute__ ((packed)); - - - -/* authorize handshake */ - -/* - * The authorizer consists of two pieces: - * a - service id, ticket blob - * b - encrypted with session key - */ -struct ceph_x_authorize_a { - __u8 struct_v; - __le64 global_id; - __le32 service_id; - struct ceph_x_ticket_blob ticket_blob; -} __attribute__ ((packed)); - -struct ceph_x_authorize_b { - __u8 struct_v; - __le64 nonce; -} __attribute__ ((packed)); - -struct ceph_x_authorize_reply { - __u8 struct_v; - __le64 nonce_plus_one; -} __attribute__ ((packed)); - - -/* - * encyption bundle - */ -#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull - -struct ceph_x_encrypt_header { - __u8 struct_v; - __le64 magic; -} __attribute__ ((packed)); - -#endif diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c deleted file mode 100644 index c67535d70aa..00000000000 --- a/fs/ceph/buffer.c +++ /dev/null @@ -1,81 +0,0 @@ - -#include "ceph_debug.h" - -#include <linux/slab.h> - -#include "buffer.h" -#include "decode.h" - -struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) -{ - struct ceph_buffer *b; - - b = kmalloc(sizeof(*b), gfp); - if (!b) - return NULL; - - b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); - if (b->vec.iov_base) { - b->is_vmalloc = false; - } else { - b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); - if (!b->vec.iov_base) { - kfree(b); - return NULL; - } - b->is_vmalloc = true; - } - - kref_init(&b->kref); - b->alloc_len = len; - b->vec.iov_len = len; - dout("buffer_new %p\n", b); - return b; -} - -void ceph_buffer_release(struct kref *kref) -{ - struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); - - dout("buffer_release %p\n", b); - if (b->vec.iov_base) { - if (b->is_vmalloc) - vfree(b->vec.iov_base); - else - kfree(b->vec.iov_base); - } - kfree(b); -} - -int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp) -{ - b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); - if (b->vec.iov_base) { - b->is_vmalloc = false; - } else { - b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL); - b->is_vmalloc = true; - } - if (!b->vec.iov_base) - return -ENOMEM; - b->alloc_len = len; - b->vec.iov_len = len; - return 0; -} - -int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) -{ - size_t len; - - ceph_decode_need(p, end, sizeof(u32), bad); - len = ceph_decode_32(p); - dout("decode_buffer len %d\n", (int)len); - ceph_decode_need(p, end, len, bad); - *b = ceph_buffer_new(len, GFP_NOFS); - if (!*b) - return -ENOMEM; - ceph_decode_copy(p, (*b)->vec.iov_base, len); - return 0; -bad: - return -EINVAL; -} diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h deleted file mode 100644 index 58d19014068..00000000000 --- a/fs/ceph/buffer.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __FS_CEPH_BUFFER_H -#define __FS_CEPH_BUFFER_H - -#include <linux/kref.h> -#include <linux/mm.h> -#include <linux/vmalloc.h> -#include <linux/types.h> -#include <linux/uio.h> - -/* - * a simple reference counted buffer. - * - * use kmalloc for small sizes (<= one page), vmalloc for larger - * sizes. - */ -struct ceph_buffer { - struct kref kref; - struct kvec vec; - size_t alloc_len; - bool is_vmalloc; -}; - -extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp); -extern void ceph_buffer_release(struct kref *kref); - -static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b) -{ - kref_get(&b->kref); - return b; -} - -static inline void ceph_buffer_put(struct ceph_buffer *b) -{ - kref_put(&b->kref, ceph_buffer_release); -} - -extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end); - -#endif diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c new file mode 100644 index 00000000000..834f9f3723f --- /dev/null +++ b/fs/ceph/cache.c @@ -0,0 +1,402 @@ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include "super.h" +#include "cache.h" + +struct ceph_aux_inode { + struct timespec mtime; + loff_t size; +}; + +struct fscache_netfs ceph_cache_netfs = { + .name = "ceph", + .version = 0, +}; + +static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct ceph_fs_client* fsc = cookie_netfs_data; + uint16_t klen; + + klen = sizeof(fsc->client->fsid); + if (klen > maxbuf) + return 0; + + memcpy(buffer, &fsc->client->fsid, klen); + return klen; +} + +static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { + .name = "CEPH.fsid", + .type = FSCACHE_COOKIE_TYPE_INDEX, + .get_key = ceph_fscache_session_get_key, +}; + +int ceph_fscache_register(void) +{ + return fscache_register_netfs(&ceph_cache_netfs); +} + +void ceph_fscache_unregister(void) +{ + fscache_unregister_netfs(&ceph_cache_netfs); +} + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ + fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, + &ceph_fscache_fsid_object_def, + fsc, true); + + if (fsc->fscache == NULL) { + pr_err("Unable to resgister fsid: %p fscache cookie", fsc); + return 0; + } + + fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1); + if (fsc->revalidate_wq == NULL) + return -ENOMEM; + + return 0; +} + +static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data, + void *buffer, uint16_t maxbuf) +{ + const struct ceph_inode_info* ci = cookie_netfs_data; + uint16_t klen; + + /* use ceph virtual inode (id + snaphot) */ + klen = sizeof(ci->i_vino); + if (klen > maxbuf) + return 0; + + memcpy(buffer, &ci->i_vino, klen); + return klen; +} + +static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, + void *buffer, uint16_t bufmax) +{ + struct ceph_aux_inode aux; + const struct ceph_inode_info* ci = cookie_netfs_data; + const struct inode* inode = &ci->vfs_inode; + + memset(&aux, 0, sizeof(aux)); + aux.mtime = inode->i_mtime; + aux.size = inode->i_size; + + memcpy(buffer, &aux, sizeof(aux)); + + return sizeof(aux); +} + +static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, + uint64_t *size) +{ + const struct ceph_inode_info* ci = cookie_netfs_data; + const struct inode* inode = &ci->vfs_inode; + + *size = inode->i_size; +} + +static enum fscache_checkaux ceph_fscache_inode_check_aux( + void *cookie_netfs_data, const void *data, uint16_t dlen) +{ + struct ceph_aux_inode aux; + struct ceph_inode_info* ci = cookie_netfs_data; + struct inode* inode = &ci->vfs_inode; + + if (dlen != sizeof(aux)) + return FSCACHE_CHECKAUX_OBSOLETE; + + memset(&aux, 0, sizeof(aux)); + aux.mtime = inode->i_mtime; + aux.size = inode->i_size; + + if (memcmp(data, &aux, sizeof(aux)) != 0) + return FSCACHE_CHECKAUX_OBSOLETE; + + dout("ceph inode 0x%p cached okay", ci); + return FSCACHE_CHECKAUX_OKAY; +} + +static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) +{ + struct ceph_inode_info* ci = cookie_netfs_data; + struct pagevec pvec; + pgoff_t first; + int loop, nr_pages; + + pagevec_init(&pvec, 0); + first = 0; + + dout("ceph inode 0x%p now uncached", ci); + + while (1) { + nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, + PAGEVEC_SIZE - pagevec_count(&pvec)); + + if (!nr_pages) + break; + + for (loop = 0; loop < nr_pages; loop++) + ClearPageFsCache(pvec.pages[loop]); + + first = pvec.pages[nr_pages - 1]->index + 1; + + pvec.nr = nr_pages; + pagevec_release(&pvec); + cond_resched(); + } +} + +static const struct fscache_cookie_def ceph_fscache_inode_object_def = { + .name = "CEPH.inode", + .type = FSCACHE_COOKIE_TYPE_DATAFILE, + .get_key = ceph_fscache_inode_get_key, + .get_attr = ceph_fscache_inode_get_attr, + .get_aux = ceph_fscache_inode_get_aux, + .check_aux = ceph_fscache_inode_check_aux, + .now_uncached = ceph_fscache_inode_now_uncached, +}; + +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, + struct ceph_inode_info* ci) +{ + struct inode* inode = &ci->vfs_inode; + + /* No caching for filesystem */ + if (fsc->fscache == NULL) + return; + + /* Only cache for regular files that are read only */ + if ((ci->vfs_inode.i_mode & S_IFREG) == 0) + return; + + /* Avoid multiple racing open requests */ + mutex_lock(&inode->i_mutex); + + if (ci->fscache) + goto done; + + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + ci, true); + fscache_check_consistency(ci->fscache); +done: + mutex_unlock(&inode->i_mutex); + +} + +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ + struct fscache_cookie* cookie; + + if ((cookie = ci->fscache) == NULL) + return; + + ci->fscache = NULL; + + fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); + fscache_relinquish_cookie(cookie, 0); +} + +static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); +} + +static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +{ + if (!error) + SetPageUptodate(page); + + unlock_page(page); +} + +static inline int cache_valid(struct ceph_inode_info *ci) +{ + return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && + (ci->i_fscache_gen == ci->i_rdcache_gen)); +} + + +/* Atempt to read from the fscache, + * + * This function is called from the readpage_nounlock context. DO NOT attempt to + * unlock the page here (or in the callback). + */ +int ceph_readpage_from_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_page(ci->fscache, page, + ceph_vfs_readpage_complete, NULL, + GFP_KERNEL); + + switch (ret) { + case 0: /* Page found */ + dout("page read submitted\n"); + return 0; + case -ENOBUFS: /* Pages were not found, and can't be */ + case -ENODATA: /* Pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!cache_valid(ci)) + return -ENOBUFS; + + ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, + ceph_vfs_readpage_complete_unlock, + NULL, mapping_gfp_mask(mapping)); + + switch (ret) { + case 0: /* All pages found */ + dout("all-page read submitted\n"); + return 0; + case -ENOBUFS: /* Some pages were not found, and can't be */ + case -ENODATA: /* some pages were not found */ + dout("page/inode not in cache\n"); + return ret; + default: + dout("%s: unknown error ret = %i\n", __func__, ret); + return ret; + } +} + +void ceph_readpage_to_fscache(struct inode *inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!PageFsCache(page)) + return; + + if (!cache_valid(ci)) + return; + + ret = fscache_write_page(ci->fscache, page, GFP_KERNEL); + if (ret) + fscache_uncache_page(ci->fscache, page); +} + +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!PageFsCache(page)) + return; + + fscache_wait_on_page_write(ci->fscache, page); + fscache_uncache_page(ci->fscache, page); +} + +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ + if (fsc->revalidate_wq) + destroy_workqueue(fsc->revalidate_wq); + + fscache_relinquish_cookie(fsc->fscache, 0); + fsc->fscache = NULL; +} + +static void ceph_revalidate_work(struct work_struct *work) +{ + int issued; + u32 orig_gen; + struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, + i_revalidate_work); + struct inode *inode = &ci->vfs_inode; + + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + orig_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); + + if (!(issued & CEPH_CAP_FILE_CACHE)) { + dout("revalidate_work lost cache before validation %p\n", + inode); + goto out; + } + + if (!fscache_check_consistency(ci->fscache)) + fscache_invalidate(ci->fscache); + + spin_lock(&ci->i_ceph_lock); + /* Update the new valid generation (backwards sanity check too) */ + if (orig_gen > ci->i_fscache_gen) { + ci->i_fscache_gen = orig_gen; + } + spin_unlock(&ci->i_ceph_lock); + +out: + iput(&ci->vfs_inode); +} + +void ceph_queue_revalidate(struct inode *inode) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + + if (fsc->revalidate_wq == NULL || ci->fscache == NULL) + return; + + ihold(inode); + + if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq, + &ci->i_revalidate_work)) { + dout("ceph_queue_revalidate %p\n", inode); + } else { + dout("ceph_queue_revalidate %p failed\n)", inode); + iput(inode); + } +} + +void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ + ci->fscache = NULL; + /* The first load is verifed cookie open time */ + ci->i_fscache_gen = 1; + INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work); +} diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h new file mode 100644 index 00000000000..5ac591bd012 --- /dev/null +++ b/fs/ceph/cache.h @@ -0,0 +1,182 @@ +/* + * Ceph cache definitions. + * + * Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved. + * Written by Milosz Tanski (milosz@adfin.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#ifndef _CEPH_CACHE_H +#define _CEPH_CACHE_H + +#ifdef CONFIG_CEPH_FSCACHE + +extern struct fscache_netfs ceph_cache_netfs; + +int ceph_fscache_register(void); +void ceph_fscache_unregister(void); + +int ceph_fscache_register_fs(struct ceph_fs_client* fsc); +void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); + +void ceph_fscache_inode_init(struct ceph_inode_info *ci); +void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, + struct ceph_inode_info* ci); +void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); + +int ceph_readpage_from_fscache(struct inode *inode, struct page *page); +int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages); +void ceph_readpage_to_fscache(struct inode *inode, struct page *page); +void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); +void ceph_queue_revalidate(struct inode *inode); + +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + fscache_attr_changed(ci->fscache); +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ + fscache_invalidate(ceph_inode(inode)->fscache); +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_uncache_page(ci->fscache, page); +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + struct inode* inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_maybe_release_page(ci->fscache, page, gfp); +} + +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) + __fscache_uncache_page(ci->fscache, page); +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + return fscache_readpages_cancel(ci->fscache, pages); +} + +#else + +static inline int ceph_fscache_register(void) +{ + return 0; +} + +static inline void ceph_fscache_unregister(void) +{ +} + +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +{ + return 0; +} + +static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) +{ +} + +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ +} + +static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, + struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_uncache_page(struct inode *inode, + struct page *pages) +{ +} + +static inline int ceph_readpage_from_fscache(struct inode* inode, + struct page *page) +{ + return -ENOBUFS; +} + +static inline int ceph_readpages_from_fscache(struct inode *inode, + struct address_space *mapping, + struct list_head *pages, + unsigned *nr_pages) +{ + return -ENOBUFS; +} + +static inline void ceph_readpage_to_fscache(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_update_objectsize(struct inode *inode) +{ +} + +static inline void ceph_fscache_invalidate(struct inode *inode) +{ +} + +static inline void ceph_invalidate_fscache_page(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +{ + return 1; +} + +static inline void ceph_fscache_readpage_cancel(struct inode *inode, + struct page *page) +{ +} + +static inline void ceph_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ +} + +static inline void ceph_queue_revalidate(struct inode *inode) +{ +} + +#endif + +#endif diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 74144d6389f..1fde164b74b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -9,8 +9,10 @@ #include <linux/writeback.h> #include "super.h" -#include "decode.h" -#include "messenger.h" +#include "mds_client.h" +#include "cache.h" +#include <linux/ceph/decode.h> +#include <linux/ceph/messenger.h> /* * Capability management @@ -113,133 +115,114 @@ const char *ceph_cap_string(int caps) return cap_str[i]; } -/* - * Cap reservations - * - * Maintain a global pool of preallocated struct ceph_caps, referenced - * by struct ceph_caps_reservations. This ensures that we preallocate - * memory needed to successfully process an MDS response. (If an MDS - * sends us cap information and we fail to process it, we will have - * problems due to the client and MDS being out of sync.) - * - * Reservations are 'owned' by a ceph_cap_reservation context. - */ -static spinlock_t caps_list_lock; -static struct list_head caps_list; /* unused (reserved or unreserved) */ -static int caps_total_count; /* total caps allocated */ -static int caps_use_count; /* in use */ -static int caps_reserve_count; /* unused, reserved */ -static int caps_avail_count; /* unused, unreserved */ -static int caps_min_count; /* keep at least this many (unreserved) */ - -void __init ceph_caps_init(void) +void ceph_caps_init(struct ceph_mds_client *mdsc) { - INIT_LIST_HEAD(&caps_list); - spin_lock_init(&caps_list_lock); + INIT_LIST_HEAD(&mdsc->caps_list); + spin_lock_init(&mdsc->caps_list_lock); } -void ceph_caps_finalize(void) +void ceph_caps_finalize(struct ceph_mds_client *mdsc) { struct ceph_cap *cap; - spin_lock(&caps_list_lock); - while (!list_empty(&caps_list)) { - cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); + spin_lock(&mdsc->caps_list_lock); + while (!list_empty(&mdsc->caps_list)) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } - caps_total_count = 0; - caps_avail_count = 0; - caps_use_count = 0; - caps_reserve_count = 0; - caps_min_count = 0; - spin_unlock(&caps_list_lock); + mdsc->caps_total_count = 0; + mdsc->caps_avail_count = 0; + mdsc->caps_use_count = 0; + mdsc->caps_reserve_count = 0; + mdsc->caps_min_count = 0; + spin_unlock(&mdsc->caps_list_lock); } -void ceph_adjust_min_caps(int delta) +void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) { - spin_lock(&caps_list_lock); - caps_min_count += delta; - BUG_ON(caps_min_count < 0); - spin_unlock(&caps_list_lock); + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_min_count += delta; + BUG_ON(mdsc->caps_min_count < 0); + spin_unlock(&mdsc->caps_list_lock); } -int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) +void ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need) { int i; struct ceph_cap *cap; int have; int alloc = 0; LIST_HEAD(newcaps); - int ret = 0; dout("reserve caps ctx=%p need=%d\n", ctx, need); /* first reserve any caps that are already allocated */ - spin_lock(&caps_list_lock); - if (caps_avail_count >= need) + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_avail_count >= need) have = need; else - have = caps_avail_count; - caps_avail_count -= have; - caps_reserve_count += have; - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + have = mdsc->caps_avail_count; + mdsc->caps_avail_count -= have; + mdsc->caps_reserve_count += have; + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); for (i = have; i < need; i++) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); - if (!cap) { - ret = -ENOMEM; - goto out_alloc_count; - } + if (!cap) + break; list_add(&cap->caps_item, &newcaps); alloc++; } - BUG_ON(have + alloc != need); + /* we didn't manage to reserve as much as we needed */ + if (have + alloc != need) + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", + ctx, need, have + alloc); - spin_lock(&caps_list_lock); - caps_total_count += alloc; - caps_reserve_count += alloc; - list_splice(&newcaps, &caps_list); + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_total_count += alloc; + mdsc->caps_reserve_count += alloc; + list_splice(&newcaps, &mdsc->caps_list); - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); ctx->count = need; dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", - ctx, caps_total_count, caps_use_count, caps_reserve_count, - caps_avail_count); - return 0; - -out_alloc_count: - /* we didn't manage to reserve as much as we needed */ - pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n", - ctx, need, have); - return ret; + ctx, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); } -int ceph_unreserve_caps(struct ceph_cap_reservation *ctx) +int ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); if (ctx->count) { - spin_lock(&caps_list_lock); - BUG_ON(caps_reserve_count < ctx->count); - caps_reserve_count -= ctx->count; - caps_avail_count += ctx->count; + spin_lock(&mdsc->caps_list_lock); + BUG_ON(mdsc->caps_reserve_count < ctx->count); + mdsc->caps_reserve_count -= ctx->count; + mdsc->caps_avail_count += ctx->count; ctx->count = 0; dout("unreserve caps %d = %d used + %d resv + %d avail\n", - caps_total_count, caps_use_count, caps_reserve_count, - caps_avail_count); - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); } return 0; } -static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) +struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx) { struct ceph_cap *cap = NULL; @@ -247,77 +230,82 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { - caps_use_count++; - caps_total_count++; + spin_lock(&mdsc->caps_list_lock); + mdsc->caps_use_count++; + mdsc->caps_total_count++; + spin_unlock(&mdsc->caps_list_lock); } return cap; } - spin_lock(&caps_list_lock); + spin_lock(&mdsc->caps_list_lock); dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", - ctx, ctx->count, caps_total_count, caps_use_count, - caps_reserve_count, caps_avail_count); + ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(!ctx->count); - BUG_ON(ctx->count > caps_reserve_count); - BUG_ON(list_empty(&caps_list)); + BUG_ON(ctx->count > mdsc->caps_reserve_count); + BUG_ON(list_empty(&mdsc->caps_list)); ctx->count--; - caps_reserve_count--; - caps_use_count++; + mdsc->caps_reserve_count--; + mdsc->caps_use_count++; - cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); + cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); return cap; } -void ceph_put_cap(struct ceph_cap *cap) +void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) { - spin_lock(&caps_list_lock); + spin_lock(&mdsc->caps_list_lock); dout("put_cap %p %d = %d used + %d resv + %d avail\n", - cap, caps_total_count, caps_use_count, - caps_reserve_count, caps_avail_count); - caps_use_count--; + cap, mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + mdsc->caps_use_count--; /* * Keep some preallocated caps around (ceph_min_count), to * avoid lots of free/alloc churn. */ - if (caps_avail_count >= caps_reserve_count + caps_min_count) { - caps_total_count--; + if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + + mdsc->caps_min_count) { + mdsc->caps_total_count--; kmem_cache_free(ceph_cap_cachep, cap); } else { - caps_avail_count++; - list_add(&cap->caps_item, &caps_list); + mdsc->caps_avail_count++; + list_add(&cap->caps_item, &mdsc->caps_list); } - BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + - caps_avail_count); - spin_unlock(&caps_list_lock); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + mdsc->caps_avail_count); + spin_unlock(&mdsc->caps_list_lock); } -void ceph_reservation_status(struct ceph_client *client, +void ceph_reservation_status(struct ceph_fs_client *fsc, int *total, int *avail, int *used, int *reserved, int *min) { + struct ceph_mds_client *mdsc = fsc->mdsc; + if (total) - *total = caps_total_count; + *total = mdsc->caps_total_count; if (avail) - *avail = caps_avail_count; + *avail = mdsc->caps_avail_count; if (used) - *used = caps_use_count; + *used = mdsc->caps_use_count; if (reserved) - *reserved = caps_reserve_count; + *reserved = mdsc->caps_reserve_count; if (min) - *min = caps_min_count; + *min = mdsc->caps_min_count; } /* * Find ceph_cap for given mds, if any. * - * Called with i_lock held. + * Called with i_ceph_lock held. */ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { @@ -336,22 +324,29 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) return NULL; } +struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) +{ + struct ceph_cap *cap; + + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + spin_unlock(&ci->i_ceph_lock); + return cap; +} + /* - * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else - * -1. + * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. */ -static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) +static int __ceph_get_cap_mds(struct ceph_inode_info *ci) { struct ceph_cap *cap; int mds = -1; struct rb_node *p; - /* prefer mds with WR|WRBUFFER|EXCL caps */ + /* prefer mds with WR|BUFFER|EXCL caps */ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); mds = cap->mds; - if (mseq) - *mseq = cap->mseq; if (cap->issued & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_EXCL)) @@ -362,15 +357,16 @@ static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) int ceph_get_cap_mds(struct inode *inode) { + struct ceph_inode_info *ci = ceph_inode(inode); int mds; - spin_lock(&inode->i_lock); - mds = __ceph_get_cap_mds(ceph_inode(inode), NULL); - spin_unlock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); + mds = __ceph_get_cap_mds(ceph_inode(inode)); + spin_unlock(&ci->i_ceph_lock); return mds; } /* - * Called under i_lock. + * Called under i_ceph_lock. */ static void __insert_cap_node(struct ceph_inode_info *ci, struct ceph_cap *new) @@ -401,7 +397,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci, static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - struct ceph_mount_args *ma = mdsc->client->mount_args; + struct ceph_mount_options *ma = mdsc->fsc->mount_options; ci->i_hold_caps_min = round_jiffies(jiffies + ma->caps_wanted_delay_min * HZ); @@ -416,7 +412,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, * * If I_FLUSH is set, leave the inode at the front of the list. * - * Caller holds i_lock + * Caller holds i_ceph_lock * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, @@ -458,7 +454,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, /* * Cancel delayed work on cap. * - * Caller must hold i_lock. + * Caller must hold i_ceph_lock. */ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) @@ -483,12 +479,13 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ - if ((issued & CEPH_CAP_FILE_CACHE) && - (had & CEPH_CAP_FILE_CACHE) == 0) + if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; + } /* - * if we are newly issued FILE_SHARED, clear I_COMPLETE; we + * if we are newly issued FILE_SHARED, mark dir not complete; we * don't know what happened to this directory while we didn't * have the cap. */ @@ -497,7 +494,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, ci->i_shared_gen++; if (S_ISDIR(ci->vfs_inode.i_mode)) { dout(" marking %p NOT complete\n", &ci->vfs_inode); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + __ceph_dir_clear_complete(ci); } } } @@ -511,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, * it is < 0. (This is so we can atomically add the cap and add an * open file reference to it.) */ -int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned seq, unsigned mseq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation) +void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned seq, unsigned mseq, u64 realmino, int flags, + struct ceph_cap **new_cap) { - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap *new_cap = NULL; struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; @@ -534,42 +530,44 @@ int ceph_add_cap(struct inode *inode, if (fmode >= 0) wanted |= ceph_caps_for_mode(fmode); -retry: - spin_lock(&inode->i_lock); cap = __get_cap_for_mds(ci, mds); if (!cap) { - if (new_cap) { - cap = new_cap; - new_cap = NULL; - } else { - spin_unlock(&inode->i_lock); - new_cap = get_cap(caps_reservation); - if (new_cap == NULL) - return -ENOMEM; - goto retry; - } + cap = *new_cap; + *new_cap = NULL; cap->issued = 0; cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; + cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); - /* clear out old exporting info? (i.e. on cap import) */ - if (ci->i_cap_exporting_mds == mds) { - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; - } - /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; spin_unlock(&session->s_cap_lock); + } else { + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } } if (!ci->i_snap_realm) { @@ -588,6 +586,7 @@ retry: } else { pr_err("ceph_add_cap: couldn't find snap realm %llx\n", realmino); + WARN_ON(!realm); } } @@ -607,10 +606,15 @@ retry: __cap_delay_requeue(mdsc, ci); } - if (flags & CEPH_CAP_FLAG_AUTH) - ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) - ci->i_auth_cap = NULL; + if (flags & CEPH_CAP_FLAG_AUTH) { + if (ci->i_auth_cap == NULL || + ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { + ci->i_auth_cap = cap; + cap->mds_wanted = wanted; + } + } else { + WARN_ON(ci->i_auth_cap == cap); + } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), @@ -618,7 +622,10 @@ retry: cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; - cap->mds_wanted |= wanted; + if (ceph_seq_cmp(mseq, cap->mseq) > 0) + cap->mds_wanted = wanted; + else + cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; @@ -626,9 +633,6 @@ retry: if (fmode >= 0) __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); - wake_up(&ci->i_cap_wq); - return 0; } /* @@ -641,10 +645,10 @@ static int __cap_is_valid(struct ceph_cap *cap) unsigned long ttl; u32 gen; - spin_lock(&cap->session->s_cap_lock); + spin_lock(&cap->session->s_gen_ttl_lock); gen = cap->session->s_cap_gen; ttl = cap->session->s_cap_ttl; - spin_unlock(&cap->session->s_cap_lock); + spin_unlock(&cap->session->s_gen_ttl_lock); if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " @@ -663,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap) */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { - int have = ci->i_snap_caps | ci->i_cap_exporting_issued; + int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; @@ -679,6 +683,15 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) if (implemented) *implemented |= cap->implemented; } + /* + * exclude caps issued by non-auth MDS, but are been revoking + * by the auth MDS. The non-auth MDS should be revoking/exporting + * these caps, but the message is delayed. + */ + if (ci->i_auth_cap) { + cap = ci->i_auth_cap; + have &= ~cap->implemented | cap->issued; + } return have; } @@ -765,7 +778,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) if (touch) { struct rb_node *q; - /* touch this + preceeding caps */ + /* touch this + preceding caps */ __touch_cap(cap); for (q = rb_first(&ci->i_caps); q != p; q = rb_next(q)) { @@ -786,23 +799,29 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) /* * Return true if mask caps are currently being revoked by an MDS. */ -int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask) { - struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; struct rb_node *p; - int ret = 0; - spin_lock(&inode->i_lock); for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); - if (__cap_is_valid(cap) && - (cap->implemented & ~cap->issued & mask)) { - ret = 1; - break; - } + if (cap != ocap && + (cap->implemented & ~cap->issued & mask)) + return 1; } - spin_unlock(&inode->i_lock); + return 0; +} + +int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) +{ + struct inode *inode = &ci->vfs_inode; + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_caps_revoking_other(ci, NULL, mask); + spin_unlock(&ci->i_ceph_lock); dout("ceph_caps_revoking %p %s = %d\n", inode, ceph_cap_string(mask), ret); return ret; @@ -815,11 +834,11 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; - if (ci->i_rdcache_ref || ci->i_rdcache_gen) + if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; - if (ci->i_wrbuffer_ref) + if (ci->i_wb_ref || ci->i_wrbuffer_ref) used |= CEPH_CAP_FILE_BUFFER; return used; } @@ -831,7 +850,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { int want = 0; int mode; - for (mode = 0; mode < 4; mode++) + for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++) if (ci->i_nr_by_mode[mode]) want |= ceph_caps_for_mode(mode); return want; @@ -850,37 +869,62 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; - mds_wanted |= cap->mds_wanted; + if (cap == ci->i_auth_cap) + mds_wanted |= cap->mds_wanted; + else + mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } /* - * called under i_lock + * called under i_ceph_lock */ static int __ceph_is_any_caps(struct ceph_inode_info *ci) { - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0; + return !RB_EMPTY_ROOT(&ci->i_caps); +} + +int ceph_is_any_caps(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = __ceph_is_any_caps(ci); + spin_unlock(&ci->i_ceph_lock); + + return ret; } /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * - * caller should hold i_lock. + * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ -void __ceph_remove_cap(struct ceph_cap *cap) +void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; int removed = 0; dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); /* remove from session list */ spin_lock(&session->s_cap_lock); + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || + cap->cap_gen == session->s_cap_gen)) + __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, + cap->mseq, cap->issue_seq); + if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ dout("__ceph_remove_cap delaying %p removal from session %p\n", @@ -901,7 +945,7 @@ void __ceph_remove_cap(struct ceph_cap *cap) ci->i_auth_cap = NULL; if (removed) - ceph_put_cap(cap); + ceph_put_cap(mdsc, cap); if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { struct ceph_snap_realm *realm = ci->i_snap_realm; @@ -928,7 +972,7 @@ static int send_cap_msg(struct ceph_mds_session *session, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, - uid_t uid, gid_t gid, mode_t mode, + kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, u64 follows) @@ -944,7 +988,7 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); if (!msg) return -ENOMEM; @@ -972,8 +1016,8 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_timespec(&fc->atime, atime); fc->time_warp_seq = cpu_to_le32(time_warp_seq); - fc->uid = cpu_to_le32(uid); - fc->gid = cpu_to_le32(gid); + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); fc->mode = cpu_to_le32(mode); fc->xattr_version = cpu_to_le64(xattr_version); @@ -987,15 +1031,14 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -static void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) +void __queue_cap_release(struct ceph_mds_session *session, + u64 ino, u64 cap_id, u32 migrate_seq, + u32 issue_seq) { struct ceph_msg *msg; struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; - spin_lock(&session->s_cap_lock); BUG_ON(!session->s_num_cap_releases); msg = list_first_entry(&session->s_cap_releases, struct ceph_msg, list_head); @@ -1005,7 +1048,7 @@ static void __queue_cap_release(struct ceph_mds_session *session, BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); head = msg->front.iov_base; - head->num = cpu_to_le32(le32_to_cpu(head->num) + 1); + le32_add_cpu(&head->num, 1); item = msg->front.iov_base + msg->front.iov_len; item->ino = cpu_to_le64(ino); item->cap_id = cpu_to_le64(cap_id); @@ -1024,12 +1067,11 @@ static void __queue_cap_release(struct ceph_mds_session *session, (int)CEPH_CAPS_PER_RELEASE, (int)msg->front.iov_len); } - spin_unlock(&session->s_cap_lock); } /* * Queue cap releases when an inode is dropped from our cache. Since - * inode is about to be destroyed, there is no need for i_lock. + * inode is about to be destroyed, there is no need for i_ceph_lock. */ void ceph_queue_caps_release(struct inode *inode) { @@ -1039,18 +1081,14 @@ void ceph_queue_caps_release(struct inode *inode) p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); - struct ceph_mds_session *session = cap->session; - - __queue_cap_release(session, ceph_ino(inode), cap->cap_id, - cap->mseq, cap->issue_seq); p = rb_next(p); - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, true); } } /* * Send a cap msg on the given inode. Update our caps state, then - * drop i_lock and send the message. + * drop i_ceph_lock and send the message. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. @@ -1062,13 +1100,13 @@ void ceph_queue_caps_release(struct inode *inode) * Return non-zero if delayed release, or we experienced an error * such that the caller should requeue + retry later. * - * called with i_lock, then drops it. + * called with i_ceph_lock, then drops it. * caller should hold snap_rwsem (read), s_mutex. */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, int op, int used, int want, int retain, int flushing, unsigned *pflush_tid) - __releases(cap->ci->vfs_inode->i_lock) + __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; @@ -1078,11 +1116,12 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 size, max_size; struct timespec mtime, atime; int wake = 0; - mode_t mode; - uid_t uid; - gid_t gid; + umode_t mode; + kuid_t uid; + kgid_t gid; struct ceph_mds_session *session; u64 xattr_version = 0; + struct ceph_buffer *xattr_blob = NULL; int delayed = 0; u64 flush_tid = 0; int i; @@ -1143,6 +1182,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, for (i = 0; i < CEPH_CAP_BITS; i++) if (flushing & (1 << i)) ci->i_cap_flush_tid[i] = flush_tid; + + follows = ci->i_head_snapc->seq; + } else { + follows = 0; } keep = cap->implemented; @@ -1156,24 +1199,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, mtime = inode->i_mtime; atime = inode->i_atime; time_warp_seq = ci->i_time_warp_seq; - follows = ci->i_snap_realm->cached_context->seq; uid = inode->i_uid; gid = inode->i_gid; mode = inode->i_mode; - if (dropping & CEPH_CAP_XATTR_EXCL) { + if (flushing & CEPH_CAP_XATTR_EXCL) { __ceph_build_xattrs_blob(ci); - xattr_version = ci->i_xattrs.version + 1; + xattr_blob = ci->i_xattrs.blob; + xattr_version = ci->i_xattrs.version; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, - uid, gid, mode, - xattr_version, - (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL, + uid, gid, mode, xattr_version, xattr_blob, follows); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); @@ -1181,7 +1222,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, } if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return delayed; } @@ -1193,16 +1234,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Called under i_lock. Takes s_mutex as needed. + * Unless @again is true, skip cap_snaps that were already sent to + * the MDS (i.e., during this session). + * + * Called under i_ceph_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession) + struct ceph_mds_session **psession, + int again) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->vfs_inode; int mds; struct ceph_cap_snap *capsnap; u32 mseq; - struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; /* if session != NULL, we hold session->s_mutex */ u64 next_follows = 0; /* keep track of how far we've gotten through the @@ -1223,7 +1270,7 @@ retry: * pages to be written out. */ if (capsnap->dirty_pages || capsnap->writing) - continue; + break; /* * if cap writeback already occurred, we should have dropped @@ -1232,7 +1279,20 @@ retry: BUG_ON(capsnap->dirty == 0); /* pick mds, take s_mutex */ - mds = __ceph_get_cap_mds(ci, &mseq); + if (ci->i_auth_cap == NULL) { + dout("no auth cap (migrating?), doing nothing\n"); + goto out; + } + + /* only flush each capsnap once */ + if (!again && !list_empty(&capsnap->flushing_item)) { + dout("already flushed %p, skipping\n", capsnap); + continue; + } + + mds = ci->i_auth_cap->session->s_mds; + mseq = ci->i_auth_cap->mseq; + if (session && session->s_mds != mds) { dout("oops, wrong session %p mutex\n", session); mutex_unlock(&session->s_mutex); @@ -1240,7 +1300,7 @@ retry: session = NULL; } if (!session) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); mutex_lock(&mdsc->mutex); session = __ceph_lookup_mds_session(mdsc, mds); mutex_unlock(&mdsc->mutex); @@ -1251,10 +1311,10 @@ retry: } /* * if session == NULL, we raced against a cap - * deletion. retry, and we'll get a better - * @mds value next time. + * deletion or migration. retry, and we'll + * get a better @mds value next time. */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); goto retry; } @@ -1264,10 +1324,10 @@ retry: list_del_init(&capsnap->flushing_item); list_add_tail(&capsnap->flushing_item, &session->s_cap_snaps_flushing); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); - dout("flush_snaps %p cap_snap %p follows %lld size %llu\n", - inode, capsnap, next_follows, capsnap->size); + dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", + inode, capsnap, capsnap->follows, capsnap->flush_tid); send_cap_msg(session, ceph_vino(inode).ino, 0, CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, @@ -1275,13 +1335,13 @@ retry: &capsnap->mtime, &capsnap->atime, capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, - 0, NULL, + capsnap->xattr_version, capsnap->xattr_blob, capsnap->follows); next_follows = capsnap->follows + 1; ceph_put_cap_snap(capsnap); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); goto retry; } @@ -1290,6 +1350,7 @@ retry: list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); +out: if (psession) *psession = session; else if (session) { @@ -1300,21 +1361,20 @@ retry: static void ceph_flush_snaps(struct ceph_inode_info *ci) { - struct inode *inode = &ci->vfs_inode; - - spin_lock(&inode->i_lock); - __ceph_flush_snaps(ci, NULL); - spin_unlock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); + __ceph_flush_snaps(ci, NULL, 0); + spin_unlock(&ci->i_ceph_lock); } /* - * Mark caps dirty. If inode is newly dirty, add to the global dirty - * list. + * Mark caps dirty. If inode is newly dirty, return the dirty flags. + * Caller is then responsible for calling __mark_inode_dirty with the + * returned flags value. */ -void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct inode *inode = &ci->vfs_inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1324,13 +1384,18 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { - dout(" inode %p now dirty\n", &ci->vfs_inode); + if (!ci->i_head_snapc) + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + dout(" inode %p now dirty snapc %p auth cap %p\n", + &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); + WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &mdsc->cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { - igrab(inode); + ihold(inode); dirty |= I_DIRTY_SYNC; } } @@ -1338,21 +1403,20 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; - if (dirty) - __mark_inode_dirty(inode, dirty); __cap_delay_requeue(mdsc, ci); + return dirty; } /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * - * Called under i_lock. + * Called under i_ceph_lock. */ static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing; @@ -1390,32 +1454,21 @@ static int __mark_caps_flushing(struct inode *inode, /* * try to invalidate mapping pages without blocking. */ -static int mapping_is_empty(struct address_space *mapping) -{ - struct page *page = find_get_page(mapping, 0); - - if (!page) - return 1; - - put_page(page); - return 0; -} - static int try_nonblocking_invalidate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); invalidate_mapping_pages(&inode->i_data, 0, -1); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); - if (mapping_is_empty(&inode->i_data) && + if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ dout("try_nonblocking_invalidate %p success\n", inode); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } dout("try_nonblocking_invalidate %p failed\n", inode); @@ -1435,13 +1488,12 @@ static int try_nonblocking_invalidate(struct inode *inode) */ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) - __releases(session->s_mutex) { - struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - int file_wanted, used; + int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list @@ -1456,17 +1508,17 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, if (mdsc->stopping) is_delayed = 1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; /* flush snaps first time around only */ if (!list_empty(&ci->i_cap_snaps)) - __ceph_flush_snaps(ci, &session); + __ceph_flush_snaps(ci, &session, 0); goto retry_locked; retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); retry_locked: file_wanted = __ceph_caps_file_wanted(ci); used = __ceph_caps_used(ci); @@ -1508,13 +1560,15 @@ retry_locked: */ if ((!is_delayed || mdsc->stopping) && ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ - ci->i_rdcache_gen && /* may have cached pages */ + inode->i_data.nrpages && /* have cached pages */ (file_wanted == 0 || /* no open files */ - (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */ + (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ !tried_invalidate) { dout("check_caps trying to invalidate on %p\n", inode); if (try_nonblocking_invalidate(inode) < 0) { - if (revoking & CEPH_CAP_FILE_CACHE) { + if (revoking & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_LAZYIO)) { dout("check_caps queuing invalidate\n"); queue_invalidate = 1; ci->i_rdcache_revoking = ci->i_rdcache_gen; @@ -1542,10 +1596,16 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ + cap_used = used; + if (ci->i_auth_cap && cap != ci->i_auth_cap) + cap_used &= ~ci->i_auth_cap->issued; + revoking = cap->implemented & ~cap->issued; - if (revoking) - dout(" mds%d revoking %s\n", cap->mds, - ceph_cap_string(revoking)); + dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", + cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap_used), + ceph_cap_string(cap->implemented), + ceph_cap_string(revoking)); if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { @@ -1571,7 +1631,7 @@ retry_locked: } /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & used) == 0) { + if (revoking && (revoking & cap_used) == 0) { dout("completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; @@ -1617,7 +1677,7 @@ ack: if (mutex_trylock(&session->s_mutex) == 0) { dout("inverting session/ino locks on %p\n", session); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (took_snap_rwsem) { up_read(&mdsc->snap_rwsem); took_snap_rwsem = 0; @@ -1631,7 +1691,7 @@ ack: if (down_read_trylock(&mdsc->snap_rwsem) == 0) { dout("inverting snap/in locks on %p\n", inode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); down_read(&mdsc->snap_rwsem); took_snap_rwsem = 1; goto retry; @@ -1641,14 +1701,16 @@ ack: if (cap == ci->i_auth_cap && ci->i_dirty_caps) flushing = __mark_caps_flushing(inode, session); + else + flushing = 0; mds = cap->mds; /* remember mds, so we don't repeat */ sent++; - /* __send_cap drops i_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, - retain, flushing, NULL); - goto retry; /* retake i_lock and restart our cap scan. */ + /* __send_cap drops i_ceph_lock */ + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, + want, retain, flushing, NULL); + goto retry; /* retake i_ceph_lock and restart our cap scan. */ } /* @@ -1662,7 +1724,7 @@ ack: else if (!is_delayed || force_requeue) __cap_delay_requeue(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (queue_invalidate) ceph_queue_invalidate(inode); @@ -1676,16 +1738,15 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, - unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, unsigned *flush_tid) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int unlock_session = session ? 0 : 1; int flushing = 0; + struct ceph_mds_session *session = NULL; retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); goto out; @@ -1696,32 +1757,33 @@ retry: int want = __ceph_caps_wanted(ci); int delayed; - if (!session) { - spin_unlock(&inode->i_lock); + if (!session || session != cap->session) { + spin_unlock(&ci->i_ceph_lock); + if (session) + mutex_unlock(&session->s_mutex); session = cap->session; mutex_lock(&session->s_mutex); goto retry; } - BUG_ON(session != cap->session); if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; flushing = __mark_caps_flushing(inode, session); - /* __send_cap drops i_lock */ + /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, cap->issued | cap->implemented, flushing, flush_tid); if (!delayed) goto out_unlocked; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); out_unlocked: - if (session && unlock_session) + if (session) mutex_unlock(&session->s_mutex); return flushing; } @@ -1734,7 +1796,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) struct ceph_inode_info *ci = ceph_inode(inode); int i, ret = 1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_CAP_BITS; i++) if ((ci->i_flushing_caps & (1 << i)) && ci->i_cap_flush_tid[i] <= tid) { @@ -1742,7 +1804,7 @@ static int caps_are_flushed(struct inode *inode, unsigned tid) ret = 0; break; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1790,7 +1852,7 @@ out: spin_unlock(&ci->i_unsafe_lock); } -int ceph_fsync(struct file *file, int datasync) +int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); @@ -1801,11 +1863,12 @@ int ceph_fsync(struct file *file, int datasync) dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); sync_write_wait(inode); - ret = filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) return ret; + mutex_lock(&inode->i_mutex); - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); /* @@ -1820,6 +1883,7 @@ int ceph_fsync(struct file *file, int datasync) } dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); + mutex_unlock(&inode->i_mutex); return ret; } @@ -1839,18 +1903,18 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) dout("write_inode %p wait=%d\n", inode, wait); if (wait) { - dirty = try_flush_caps(inode, NULL, &flush_tid); + dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } else { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (__ceph_caps_dirty(ci)) __cap_delay_requeue_front(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } return err; } @@ -1873,17 +1937,17 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (cap && cap->session == session) { dout("kick_flushing_caps %p cap %p capsnap %p\n", inode, cap, capsnap); - __ceph_flush_snaps(ci, &session); + __ceph_flush_snaps(ci, &session, 1); } else { pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } @@ -1900,7 +1964,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_cap *cap; int delayed = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (cap && cap->session == session) { dout("kick_flushing_caps %p cap %p %s\n", inode, @@ -1911,15 +1975,51 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, cap->issued | cap->implemented, ci->i_flushing_caps, NULL); if (delayed) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } } else { pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); + } + } +} + +static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; + int delayed = 0; + + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, + ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + + __ceph_flush_snaps(ci, &session, 1); + + if (ci->i_flushing_caps) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &cap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + ci->i_flushing_caps, NULL); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); } + } else { + spin_unlock(&ci->i_ceph_lock); } } @@ -1928,7 +2028,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, * Take references to capabilities we hold, so that we don't release * them to the MDS prematurely. * - * Protected by i_lock. + * Protected by i_ceph_lock. */ static void __take_cap_refs(struct ceph_inode_info *ci, int got) { @@ -1941,11 +2041,11 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) if (got & CEPH_CAP_FILE_WR) ci->i_wr_ref++; if (got & CEPH_CAP_FILE_BUFFER) { - if (ci->i_wrbuffer_ref == 0) - igrab(&ci->vfs_inode); - ci->i_wrbuffer_ref++; - dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n", - &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref); + if (ci->i_wb_ref == 0) + ihold(&ci->vfs_inode); + ci->i_wb_ref++; + dout("__take_cap_refs %p wb %d -> %d (?)\n", + &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); } } @@ -1966,7 +2066,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); @@ -1978,11 +2078,20 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } - if (need & CEPH_CAP_FILE_WR) { + /* finish pending truncate */ + while (ci->i_truncate_pending) { + spin_unlock(&ci->i_ceph_lock); + __ceph_do_pending_vmtruncate(inode); + spin_lock(&ci->i_ceph_lock); + } + + have = __ceph_caps_issued(ci, &implemented); + + if (have & need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); - if (endoff > ci->i_wanted_max_size) { + if (endoff > ci->i_requested_max_size) { *check_max = 1; ret = 1; } @@ -1997,13 +2106,6 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, goto out; } } - have = __ceph_caps_issued(ci, &implemented); - - /* - * disallow writes while a truncate is pending - */ - if (ci->i_truncate_pending) - have &= ~CEPH_CAP_FILE_WR; if ((have & need) == need) { /* @@ -2027,7 +2129,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, ceph_cap_string(have), ceph_cap_string(need)); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); return ret; @@ -2044,16 +2146,19 @@ static void check_max_size(struct inode *inode, loff_t endoff) int check = 0; /* do we need to explicitly request a larger max_size? */ - spin_lock(&inode->i_lock); - if ((endoff >= ci->i_max_size || - endoff > (inode->i_size << 1)) && - endoff > ci->i_wanted_max_size) { + spin_lock(&ci->i_ceph_lock); + if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { dout("write %p at large endoff %llu, req max_size\n", inode, endoff); ci->i_wanted_max_size = endoff; - check = 1; } - spin_unlock(&inode->i_lock); + /* duplicate ceph_check_caps()'s logic */ + if (ci->i_auth_cap && + (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && + ci->i_wanted_max_size > ci->i_max_size && + ci->i_wanted_max_size > ci->i_requested_max_size) + check = 1; + spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } @@ -2090,9 +2195,9 @@ retry: */ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); __take_cap_refs(ci, caps); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -2110,7 +2215,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) int last = 0, put = 0, flushsnaps = 0, wake = 0; struct ceph_cap_snap *capsnap; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) --ci->i_pin_ref; if (had & CEPH_CAP_FILE_RD) @@ -2120,12 +2225,12 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (--ci->i_rdcache_ref == 0) last++; if (had & CEPH_CAP_FILE_BUFFER) { - if (--ci->i_wrbuffer_ref == 0) { + if (--ci->i_wb_ref == 0) { last++; put++; } - dout("put_cap_refs %p wrbuffer %d -> %d (?)\n", - inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref); + dout("put_cap_refs %p wb %d -> %d (?)\n", + inode, ci->i_wb_ref+1, ci->i_wb_ref); } if (had & CEPH_CAP_FILE_WR) if (--ci->i_wr_ref == 0) { @@ -2143,7 +2248,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) } } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); @@ -2153,7 +2258,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) else if (flushsnaps) ceph_flush_snaps(ci); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (put) iput(inode); } @@ -2175,13 +2280,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, int found = 0; struct ceph_cap_snap *capsnap = NULL; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; last = !ci->i_wrbuffer_ref; if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; - if (!ci->i_wrbuffer_ref_head) { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } @@ -2222,66 +2329,111 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (last) { ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); iput(inode); } else if (complete_capsnap) { ceph_flush_snaps(ci); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); } if (drop_capsnap) iput(inode); } /* + * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. + */ +static void invalidate_aliases(struct inode *inode) +{ + struct dentry *dn, *prev = NULL; + + dout("invalidate_aliases inode %p\n", inode); + d_prune_aliases(inode); + /* + * For non-directory inode, d_find_alias() only returns + * hashed dentry. After calling d_invalidate(), the + * dentry becomes unhashed. + * + * For directory inode, d_find_alias() can return + * unhashed dentry. But directory inode should have + * one alias at most. + */ + while ((dn = d_find_alias(inode))) { + if (dn == prev) { + dput(dn); + break; + } + d_invalidate(dn); + if (prev) + dput(prev); + prev = dn; + } + if (prev) + dput(prev); +} + +/* * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * - * caller holds s_mutex and i_lock, we drop both. - * - * return value: - * 0 - ok - * 1 - check_caps on auth cap only (writeback) - * 2 - check_caps (ack revoke) + * caller holds s_mutex and i_ceph_lock, we drop both. */ -static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, +static void handle_cap_grant(struct ceph_mds_client *mdsc, + struct inode *inode, struct ceph_mds_caps *grant, + void *snaptrace, int snaptrace_len, + struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, - struct ceph_buffer *xattr_buf) - __releases(inode->i_lock) - __releases(session->s_mutex) + struct ceph_cap *cap, int issued) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); - int issued, implemented, used, wanted, dirty; + int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); struct timespec mtime, atime, ctime; int check_caps = 0; - int wake = 0; - int writeback = 0; - int revoked_rdcache = 0; - int queue_invalidate = 0; + bool wake = 0; + bool writeback = 0; + bool queue_trunc = 0; + bool queue_invalidate = 0; + bool queue_revalidate = 0; + bool deleted_inode = 0; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); + + /* + * auth mds of the inode changed. we received the cap export message, + * but still haven't received the cap import message. handle_cap_export + * updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message + * that was sent before the cap import message. So don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + WARN_ON(cap != ci->i_auth_cap); + WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); + seq = cap->seq; + newcaps |= cap->issued; + } + /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { - if (try_nonblocking_invalidate(inode) == 0) { - revoked_rdcache = 1; - } else { + if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { @@ -2289,27 +2441,33 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ci->i_rdcache_revoking = ci->i_rdcache_gen; } } + + ceph_fscache_invalidate(inode); } /* side effects now are allowed */ - - issued = __ceph_caps_issued(ci, &implemented); - issued |= implemented | __ceph_caps_dirty(ci); - cap->cap_gen = session->s_cap_gen; + cap->seq = seq; __check_cap_issue(ci, cap, newcaps); - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(grant->mode); - inode->i_uid = le32_to_cpu(grant->uid); - inode->i_gid = le32_to_cpu(grant->gid); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - inode->i_uid, inode->i_gid); + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(grant->nlink); + if ((newcaps & CEPH_CAP_AUTH_SHARED) && + (issued & CEPH_CAP_LINK_EXCL) == 0) { + set_nlink(inode, le32_to_cpu(grant->nlink)); + if (inode->i_nlink == 0 && + (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + deleted_inode = 1; + } if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); @@ -2322,29 +2480,44 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; + ceph_forget_all_cached_acls(inode); } } - /* size/ctime/mtime/atime? */ - ceph_fill_file_size(inode, issued, - le32_to_cpu(grant->truncate_seq), - le64_to_cpu(grant->truncate_size), size); - ceph_decode_timespec(&mtime, &grant->mtime); - ceph_decode_timespec(&atime, &grant->atime); - ceph_decode_timespec(&ctime, &grant->ctime); - ceph_fill_file_time(inode, issued, - le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, - &atime); - - /* max size increase? */ - if (max_size != ci->i_max_size) { - dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); - ci->i_max_size = max_size; - if (max_size >= ci->i_wanted_max_size) { - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; + /* Do we need to revalidate our fscache cookie. Don't bother on the + * first cache cap as we already validate at cookie creation time. */ + if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) + queue_revalidate = 1; + + if (newcaps & CEPH_CAP_ANY_RD) { + /* ctime/mtime/atime? */ + ceph_decode_timespec(&mtime, &grant->mtime); + ceph_decode_timespec(&atime, &grant->atime); + ceph_decode_timespec(&ctime, &grant->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(grant->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { + /* file layout may have changed */ + ci->i_layout = grant->layout; + /* size/truncate_seq? */ + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(grant->truncate_seq), + le64_to_cpu(grant->truncate_size), + size); + /* max size increase? */ + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { + dout("max_size %lld -> %llu\n", + ci->i_max_size, max_size); + ci->i_max_size = max_size; + if (max_size >= ci->i_wanted_max_size) { + ci->i_wanted_max_size = 0; /* reset */ + ci->i_requested_max_size = 0; + } + wake = 1; } - wake = 1; } /* check cap bits */ @@ -2359,25 +2532,29 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, dout("mds wanted %s -> %s\n", ceph_cap_string(le32_to_cpu(grant->wanted)), ceph_cap_string(wanted)); - grant->wanted = cpu_to_le32(wanted); + /* imported cap may not have correct mds_wanted */ + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) + check_caps = 1; } - cap->seq = seq; - - /* file layout may have changed */ - ci->i_layout = grant->layout; - /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { - dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued), - ceph_cap_string(newcaps)); - if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER) - writeback = 1; /* will delay ack */ - else if (dirty & ~newcaps) - check_caps = 1; /* initiate writeback in check_caps */ - else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 || - revoked_rdcache) - check_caps = 2; /* send revoke ack in check_caps */ + int revoking = cap->issued & ~newcaps; + + dout("revocation: %s -> %s (revoking %s)\n", + ceph_cap_string(cap->issued), + ceph_cap_string(newcaps), + ceph_cap_string(revoking)); + if (revoking & used & CEPH_CAP_FILE_BUFFER) + writeback = 1; /* initiate writeback; will delay ack */ + else if (revoking == CEPH_CAP_FILE_CACHE && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && + queue_invalidate) + ; /* do nothing yet, invalidation will be queued */ + else if (cap == ci->i_auth_cap) + check_caps = 1; /* check auth cap only */ + else + check_caps = 2; /* check all caps */ cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { @@ -2386,6 +2563,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } else { dout("grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); + /* non-auth MDS is revoking the newly grant caps ? */ + if (cap == ci->i_auth_cap && + __ceph_caps_revoking_other(ci, cap, newcaps)) + check_caps = 2; + cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a @@ -2394,7 +2576,25 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } BUG_ON(cap->issued & ~cap->implemented); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); + + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, false); + downgrade_write(&mdsc->snap_rwsem); + kick_flushing_inode_caps(mdsc, session, inode); + up_read(&mdsc->snap_rwsem); + if (newcaps & ~issued) + wake = 1; + } + + if (queue_trunc) { + ceph_queue_vmtruncate(inode); + ceph_queue_revalidate(inode); + } else if (queue_revalidate) + ceph_queue_revalidate(inode); + if (writeback) /* * queue inode for writeback: we can't actually call @@ -2404,8 +2604,10 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); + if (deleted_inode) + invalidate_aliases(inode); if (wake) - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (check_caps == 1) ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, @@ -2424,10 +2626,10 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session, struct ceph_cap *cap) - __releases(inode->i_lock) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; @@ -2460,22 +2662,27 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up(&mdsc->cap_flushing_wq); + wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = 1; + if (ci->i_wrbuffer_ref_head == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } } else { BUG_ON(list_empty(&ci->i_dirty_item)); } } spin_unlock(&mdsc->cap_dirty_lock); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (drop) iput(inode); } @@ -2498,7 +2705,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n", inode, ci, session->s_mds, follows); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->follows == follows) { if (capsnap->flush_tid != flush_tid) { @@ -2521,7 +2728,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, capsnap, capsnap->follows); } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (drop) iput(inode); } @@ -2534,7 +2741,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, static void handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, struct ceph_mds_session *session) - __releases(inode->i_lock) + __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2553,10 +2760,12 @@ static void handle_cap_trunc(struct inode *inode, inode, mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); - if (queue_trunc) + if (queue_trunc) { ceph_queue_vmtruncate(inode); + ceph_fscache_invalidate(inode); + } } /* @@ -2568,89 +2777,200 @@ static void handle_cap_trunc(struct inode *inode, * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, + struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session) { + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_session *tsession = NULL; + struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); - int mds = session->s_mds; + u64 t_cap_id; unsigned mseq = le32_to_cpu(ex->migrate_seq); - struct ceph_cap *cap = NULL, *t; - struct rb_node *p; - int remember = 1; + unsigned t_seq, t_mseq; + int target, issued; + int mds = session->s_mds; - dout("handle_cap_export inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + if (ph) { + t_cap_id = le64_to_cpu(ph->cap_id); + t_seq = le32_to_cpu(ph->seq); + t_mseq = le32_to_cpu(ph->mseq); + target = le32_to_cpu(ph->mds); + } else { + t_cap_id = t_seq = t_mseq = 0; + target = -1; + } - spin_lock(&inode->i_lock); + dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n", + inode, ci, mds, mseq, target); +retry: + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) + goto out_unlock; - /* make sure we haven't seen a higher mseq */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - t = rb_entry(p, struct ceph_cap, ci_node); - if (ceph_seq_cmp(t->mseq, mseq) > 0) { - dout(" higher mseq on cap from mds%d\n", - t->session->s_mds); - remember = 0; + if (target < 0) { + __ceph_remove_cap(cap, false); + goto out_unlock; + } + + /* + * now we know we haven't received the cap import message yet + * because the exported cap still exist. + */ + + issued = cap->issued; + WARN_ON(issued != cap->implemented); + + tcap = __get_cap_for_mds(ci, target); + if (tcap) { + /* already have caps from the target */ + if (tcap->cap_id != t_cap_id || + ceph_seq_cmp(tcap->seq, t_seq) < 0) { + dout(" updating import cap %p mds%d\n", tcap, target); + tcap->cap_id = t_cap_id; + tcap->seq = t_seq - 1; + tcap->issue_seq = t_seq - 1; + tcap->mseq = t_mseq; + tcap->issued |= issued; + tcap->implemented |= issued; + if (cap == ci->i_auth_cap) + ci->i_auth_cap = tcap; + if (ci->i_flushing_caps && ci->i_auth_cap == tcap) { + spin_lock(&mdsc->cap_dirty_lock); + list_move_tail(&ci->i_flushing_item, + &tcap->session->s_cap_flushing); + spin_unlock(&mdsc->cap_dirty_lock); + } } - if (t->session->s_mds == mds) - cap = t; + __ceph_remove_cap(cap, false); + goto out_unlock; + } else if (tsession) { + /* add placeholder for the export tagert */ + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; + ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, + t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); + + __ceph_remove_cap(cap, false); + goto out_unlock; } - if (cap) { - if (remember) { - /* make note */ - ci->i_cap_exporting_mds = mds; - ci->i_cap_exporting_mseq = mseq; - ci->i_cap_exporting_issued = cap->issued; + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + + /* open target session */ + tsession = ceph_mdsc_open_export_target_session(mdsc, target); + if (!IS_ERR(tsession)) { + if (mds > target) { + mutex_lock(&session->s_mutex); + mutex_lock_nested(&tsession->s_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&tsession->s_mutex); + mutex_lock_nested(&session->s_mutex, + SINGLE_DEPTH_NESTING); } - __ceph_remove_cap(cap); + ceph_add_cap_releases(mdsc, tsession); + new_cap = ceph_get_cap(mdsc, NULL); + } else { + WARN_ON(1); + tsession = NULL; + target = -1; } - /* else, we already released it */ + goto retry; - spin_unlock(&inode->i_lock); +out_unlock: + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&session->s_mutex); + if (tsession) { + mutex_unlock(&tsession->s_mutex); + ceph_put_mds_session(tsession); + } + if (new_cap) + ceph_put_cap(mdsc, new_cap); } /* - * Handle cap IMPORT. If there are temp bits from an older EXPORT, - * clean them up. + * Handle cap IMPORT. * - * caller holds s_mutex. + * caller holds s_mutex. acquires i_ceph_lock */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, - void *snaptrace, int snaptrace_len) + struct ceph_cap **target_cap, int *old_issued) + __acquires(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; - unsigned issued = le32_to_cpu(im->caps); + int issued; + unsigned caps = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); unsigned seq = le32_to_cpu(im->seq); unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + int peer; - if (ci->i_cap_exporting_mds >= 0 && - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { - dout("handle_cap_import inode %p ci %p mds%d mseq %d" - " - cleared exporting from mds%d\n", - inode, ci, mds, mseq, - ci->i_cap_exporting_mds); - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); } else { - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + p_cap_id = 0; + peer = -1; } - down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, - false); - downgrade_write(&mdsc->snap_rwsem); - ceph_add_cap(inode, session, cap_id, -1, - issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, - NULL /* no caps context */); - try_flush_caps(inode, session, NULL); - up_read(&mdsc->snap_rwsem); + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", + inode, ci, mds, mseq, peer); + +retry: + spin_lock(&ci->i_ceph_lock); + cap = __get_cap_for_mds(ci, mds); + if (!cap) { + if (!new_cap) { + spin_unlock(&ci->i_ceph_lock); + new_cap = ceph_get_cap(mdsc, NULL); + goto retry; + } + cap = new_cap; + } else { + if (new_cap) { + ceph_put_cap(mdsc, new_cap); + new_cap = NULL; + } + } + + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + + ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, + realmino, CEPH_CAP_FLAG_AUTH, &new_cap); + + ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; + if (ocap && ocap->cap_id == p_cap_id) { + dout(" remove export cap %p mds%d flags %d\n", + ocap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (ocap->seq != le32_to_cpu(ph->seq) || + ocap->mseq != le32_to_cpu(ph->mseq))) { + pr_err("handle_cap_import: mismatched seq/mseq: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "importer mds%d has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, ocap->seq, + ocap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); + } + __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); + } + + /* make sure we re-request max_size, if necessary */ + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + + *old_issued = issued; + *target_cap = cap; } /* @@ -2663,27 +2983,33 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; struct inode *inode; + struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; + struct ceph_mds_cap_peer *peer = NULL; int mds = session->s_mds; - int op; + int op, issued; u32 seq, mseq; struct ceph_vino vino; u64 cap_id; u64 size, max_size; u64 tid; void *snaptrace; + size_t snaptrace_len; + void *flock; + void *end; + u32 flock_len; dout("handle_caps from mds%d\n", mds); /* decode */ + end = msg->front.iov_base + msg->front.iov_len; tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len < sizeof(*h)) goto bad; h = msg->front.iov_base; - snaptrace = h + 1; op = le32_to_cpu(h->op); vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; @@ -2693,30 +3019,55 @@ void ceph_handle_caps(struct ceph_mds_session *session, size = le64_to_cpu(h->size); max_size = le64_to_cpu(h->max_size); + snaptrace = h + 1; + snaptrace_len = le32_to_cpu(h->snap_trace_len); + + if (le16_to_cpu(msg->hdr.version) >= 2) { + void *p = snaptrace + snaptrace_len; + ceph_decode_32_safe(&p, end, flock_len, bad); + if (p + flock_len > end) + goto bad; + flock = p; + } else { + flock = NULL; + flock_len = 0; + } + + if (le16_to_cpu(msg->hdr.version) >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + void *p = flock + flock_len; + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + } else if (op == CEPH_CAP_OP_EXPORT) { + /* recorded in unused fields */ + peer = (void *)&h->size; + } + } + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); + if (op == CEPH_CAP_OP_IMPORT) + ceph_add_cap_releases(mdsc, session); + /* lookup ino */ inode = ceph_find_inode(sb, vino); + ci = ceph_inode(inode); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); if (!inode) { dout(" i don't have ino %llx\n", vino.ino); - if (op == CEPH_CAP_OP_IMPORT) + if (op == CEPH_CAP_OP_IMPORT) { + spin_lock(&session->s_cap_lock); __queue_cap_release(session, vino.ino, cap_id, mseq, seq); - - /* - * send any full release message to try to move things - * along for the mds (who clearly thinks we still have this - * cap). - */ - ceph_add_cap_releases(mdsc, session, -1); - ceph_send_cap_releases(mdsc, session); - goto done; + spin_unlock(&session->s_cap_lock); + } + goto flush_cap_releases; } /* these will work even if we don't have a cap yet */ @@ -2726,32 +3077,35 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_EXPORT: - handle_cap_export(inode, h, session); - goto done; + handle_cap_export(inode, h, peer, session); + goto done_unlocked; case CEPH_CAP_OP_IMPORT: - handle_cap_import(mdsc, inode, h, session, - snaptrace, le32_to_cpu(h->snap_trace_len)); - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, - session); + handle_cap_import(mdsc, inode, h, peer, session, + &cap, &issued); + handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, + msg->middle, session, cap, issued); goto done_unlocked; } /* the rest require a cap */ - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), mds); if (!cap) { dout(" no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), mds); - spin_unlock(&inode->i_lock); - goto done; + spin_unlock(&ci->i_ceph_lock); + goto flush_cap_releases; } - /* note that each of these drops i_lock for us */ + /* note that each of these drops i_ceph_lock for us */ switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: - handle_cap_grant(inode, h, session, cap, msg->middle); + __ceph_caps_issued(ci, &issued); + issued |= __ceph_caps_dirty(ci); + handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, + session, cap, issued); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: @@ -2763,11 +3117,22 @@ void ceph_handle_caps(struct ceph_mds_session *session, break; default: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); pr_err("ceph_handle_caps: unknown cap op %d %s\n", op, ceph_cap_op_name(op)); } + goto done; + +flush_cap_releases: + /* + * send any full release message to try to move things + * along for the mds (who clearly thinks we still have this + * cap). + */ + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); + done: mutex_unlock(&session->s_mutex); done_unlocked: @@ -2813,47 +3178,24 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) */ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { - struct ceph_inode_info *ci, *nci = NULL; - struct inode *inode, *ninode = NULL; - struct list_head *p, *n; + struct ceph_inode_info *ci; + struct inode *inode; dout("flush_dirty_caps\n"); spin_lock(&mdsc->cap_dirty_lock); - list_for_each_safe(p, n, &mdsc->cap_dirty) { - if (nci) { - ci = nci; - inode = ninode; - ci->i_ceph_flags &= ~CEPH_I_NOFLUSH; - dout("flush_dirty_caps inode %p (was next inode)\n", - inode); - } else { - ci = list_entry(p, struct ceph_inode_info, - i_dirty_item); - inode = igrab(&ci->vfs_inode); - BUG_ON(!inode); - dout("flush_dirty_caps inode %p\n", inode); - } - if (n != &mdsc->cap_dirty) { - nci = list_entry(n, struct ceph_inode_info, - i_dirty_item); - ninode = igrab(&nci->vfs_inode); - BUG_ON(!ninode); - nci->i_ceph_flags |= CEPH_I_NOFLUSH; - dout("flush_dirty_caps next inode %p, noflush\n", - ninode); - } else { - nci = NULL; - ninode = NULL; - } + while (!list_empty(&mdsc->cap_dirty)) { + ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, + i_dirty_item); + inode = &ci->vfs_inode; + ihold(inode); + dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); - if (inode) { - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, - NULL); - iput(inode); - } + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); + dout("flush_dirty_caps done\n"); } /* @@ -2866,13 +3208,13 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) struct inode *inode = &ci->vfs_inode; int last = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode, ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1); BUG_ON(ci->i_nr_by_mode[fmode] == 0); if (--ci->i_nr_by_mode[fmode] == 0) last++; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); if (last && ci->i_vino.snap == CEPH_NOSNAP) ceph_check_caps(ci, 0, NULL); @@ -2895,7 +3237,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, int used, dirty; int ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); @@ -2913,21 +3255,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode, (cap->issued & unless) == 0)) { if ((cap->issued & drop) && (cap->issued & unless) == 0) { - dout("encode_inode_release %p cap %p %s -> " - "%s\n", inode, cap, + int wanted = __ceph_caps_wanted(ci); + if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) + wanted |= cap->mds_wanted; + dout("encode_inode_release %p cap %p " + "%s -> %s, wanted %s -> %s\n", inode, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & ~drop)); + ceph_cap_string(cap->issued & ~drop), + ceph_cap_string(cap->mds_wanted), + ceph_cap_string(wanted)); + cap->issued &= ~drop; cap->implemented &= ~drop; - if (ci->i_ceph_flags & CEPH_I_NODELAY) { - int wanted = __ceph_caps_wanted(ci); - dout(" wanted %s -> %s (act %s)\n", - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(cap->mds_wanted & - ~wanted), - ceph_cap_string(wanted)); - cap->mds_wanted &= wanted; - } + cap->mds_wanted = wanted; } else { dout("encode_inode_release %p cap %p %s" " (force)\n", inode, cap, @@ -2939,7 +3279,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq), rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; @@ -2950,7 +3290,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, inode, cap, ceph_cap_string(cap->issued)); } } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -2965,7 +3305,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, /* * force an record for the directory caps if we have a dentry lease. - * this is racy (can't take i_lock and d_lock together), but it + * this is racy (can't take i_ceph_lock and d_lock together), but it * doesn't have to be perfect; the mds will revoke anything we don't * release. */ @@ -2984,6 +3324,7 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, memcpy(*p, dentry->d_name.name, dentry->d_name.len); *p += dentry->d_name.len; rel->dname_seq = cpu_to_le32(di->lease_seq); + __ceph_mdsc_drop_dentry_lease(dentry); } spin_unlock(&dentry->d_lock); return ret; diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h deleted file mode 100644 index 1818c230561..00000000000 --- a/fs/ceph/ceph_debug.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef _FS_CEPH_DEBUG_H -#define _FS_CEPH_DEBUG_H - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#ifdef CONFIG_CEPH_FS_PRETTYDEBUG - -/* - * wrap pr_debug to include a filename:lineno prefix on each line. - * this incurs some overhead (kernel size and execution time) due to - * the extra function call at each call site. - */ - -# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) -extern const char *ceph_file_part(const char *s, int len); -# define dout(fmt, ...) \ - pr_debug(" %12.12s:%-4d : " fmt, \ - ceph_file_part(__FILE__, sizeof(__FILE__)), \ - __LINE__, ##__VA_ARGS__) -# else -/* faux printk call just to see any compiler warnings. */ -# define dout(fmt, ...) do { \ - if (0) \ - printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ - } while (0) -# endif - -#else - -/* - * or, just wrap pr_debug - */ -# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__) - -#endif - -#endif diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c index ab6cf35c409..bdce8b1fbd0 100644 --- a/fs/ceph/ceph_frag.c +++ b/fs/ceph/ceph_frag.c @@ -1,7 +1,8 @@ /* * Ceph 'frag' type */ -#include "types.h" +#include <linux/module.h> +#include <linux/ceph/types.h> int ceph_frag_compare(__u32 a, __u32 b) { diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h deleted file mode 100644 index 793f50cb7c2..00000000000 --- a/fs/ceph/ceph_frag.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef _FS_CEPH_FRAG_H -#define _FS_CEPH_FRAG_H - -/* - * "Frags" are a way to describe a subset of a 32-bit number space, - * using a mask and a value to match against that mask. Any given frag - * (subset of the number space) can be partitioned into 2^n sub-frags. - * - * Frags are encoded into a 32-bit word: - * 8 upper bits = "bits" - * 24 lower bits = "value" - * (We could go to 5+27 bits, but who cares.) - * - * We use the _most_ significant bits of the 24 bit value. This makes - * values logically sort. - * - * Unfortunately, because the "bits" field is still in the high bits, we - * can't sort encoded frags numerically. However, it does allow you - * to feed encoded frags as values into frag_contains_value. - */ -static inline __u32 ceph_frag_make(__u32 b, __u32 v) -{ - return (b << 24) | - (v & (0xffffffu << (24-b)) & 0xffffffu); -} -static inline __u32 ceph_frag_bits(__u32 f) -{ - return f >> 24; -} -static inline __u32 ceph_frag_value(__u32 f) -{ - return f & 0xffffffu; -} -static inline __u32 ceph_frag_mask(__u32 f) -{ - return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu; -} -static inline __u32 ceph_frag_mask_shift(__u32 f) -{ - return 24 - ceph_frag_bits(f); -} - -static inline int ceph_frag_contains_value(__u32 f, __u32 v) -{ - return (v & ceph_frag_mask(f)) == ceph_frag_value(f); -} -static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) -{ - /* is sub as specific as us, and contained by us? */ - return ceph_frag_bits(sub) >= ceph_frag_bits(f) && - (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); -} - -static inline __u32 ceph_frag_parent(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f) - 1, - ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); -} -static inline int ceph_frag_is_left_child(__u32 f) -{ - return ceph_frag_bits(f) > 0 && - (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; -} -static inline int ceph_frag_is_right_child(__u32 f) -{ - return ceph_frag_bits(f) > 0 && - (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1; -} -static inline __u32 ceph_frag_sibling(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f), - ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); -} -static inline __u32 ceph_frag_left_child(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); -} -static inline __u32 ceph_frag_right_child(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f)+1, - ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); -} -static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) -{ - int newbits = ceph_frag_bits(f) + by; - return ceph_frag_make(newbits, - ceph_frag_value(f) | (i << (24 - newbits))); -} -static inline int ceph_frag_is_leftmost(__u32 f) -{ - return ceph_frag_value(f) == 0; -} -static inline int ceph_frag_is_rightmost(__u32 f) -{ - return ceph_frag_value(f) == ceph_frag_mask(f); -} -static inline __u32 ceph_frag_next(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f), - ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f))); -} - -/* - * comparator to sort frags logically, as when traversing the - * number space in ascending order... - */ -int ceph_frag_compare(__u32 a, __u32 b); - -#endif diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c deleted file mode 100644 index 79d76bc4303..00000000000 --- a/fs/ceph/ceph_fs.c +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Some non-inline ceph helpers - */ -#include "types.h" - -/* - * return true if @layout appears to be valid - */ -int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) -{ - __u32 su = le32_to_cpu(layout->fl_stripe_unit); - __u32 sc = le32_to_cpu(layout->fl_stripe_count); - __u32 os = le32_to_cpu(layout->fl_object_size); - - /* stripe unit, object size must be non-zero, 64k increment */ - if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) - return 0; - if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1))) - return 0; - /* object size must be a multiple of stripe unit */ - if (os < su || os % su) - return 0; - /* stripe count must be non-zero */ - if (!sc) - return 0; - return 1; -} - - -int ceph_flags_to_mode(int flags) -{ -#ifdef O_DIRECTORY /* fixme */ - if ((flags & O_DIRECTORY) == O_DIRECTORY) - return CEPH_FILE_MODE_PIN; -#endif -#ifdef O_LAZY - if (flags & O_LAZY) - return CEPH_FILE_MODE_LAZY; -#endif - if ((flags & O_APPEND) == O_APPEND) - flags |= O_WRONLY; - - flags &= O_ACCMODE; - if ((flags & O_RDWR) == O_RDWR) - return CEPH_FILE_MODE_RDWR; - if ((flags & O_WRONLY) == O_WRONLY) - return CEPH_FILE_MODE_WR; - return CEPH_FILE_MODE_RD; -} - -int ceph_caps_for_mode(int mode) -{ - switch (mode) { - case CEPH_FILE_MODE_PIN: - return CEPH_CAP_PIN; - case CEPH_FILE_MODE_RD: - return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; - case CEPH_FILE_MODE_RDWR: - return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | - CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | - CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | - CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | - CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; - case CEPH_FILE_MODE_WR: - return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | - CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | - CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | - CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; - } - return 0; -} diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h deleted file mode 100644 index 2fa992eaf7d..00000000000 --- a/fs/ceph/ceph_fs.h +++ /dev/null @@ -1,705 +0,0 @@ -/* - * ceph_fs.h - Ceph constants and data types to share between kernel and - * user space. - * - * Most types in this file are defined as little-endian, and are - * primarily intended to describe data structures that pass over the - * wire or that are stored on disk. - * - * LGPL2 - */ - -#ifndef _FS_CEPH_CEPH_FS_H -#define _FS_CEPH_CEPH_FS_H - -#include "msgr.h" -#include "rados.h" - -/* - * Ceph release version - */ -#define CEPH_VERSION_MAJOR 0 -#define CEPH_VERSION_MINOR 20 -#define CEPH_VERSION_PATCH 0 - -#define _CEPH_STRINGIFY(x) #x -#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x) -#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \ - "." CEPH_STRINGIFY(z) -#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \ - CEPH_VERSION_MINOR, CEPH_VERSION_PATCH) - -/* - * subprotocol versions. when specific messages types or high-level - * protocols change, bump the affected components. we keep rev - * internal cluster protocols separately from the public, - * client-facing protocol. - */ -#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ -#define CEPH_MDS_PROTOCOL 12 /* cluster internal */ -#define CEPH_MON_PROTOCOL 5 /* cluster internal */ -#define CEPH_OSDC_PROTOCOL 24 /* server/client */ -#define CEPH_MDSC_PROTOCOL 32 /* server/client */ -#define CEPH_MONC_PROTOCOL 15 /* server/client */ - - -#define CEPH_INO_ROOT 1 -#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ - -/* arbitrary limit on max # of monitors (cluster of 3 is typical) */ -#define CEPH_MAX_MON 31 - - -/* - * feature bits - */ -#define CEPH_FEATURE_UID 1 -#define CEPH_FEATURE_NOSRCADDR 2 -#define CEPH_FEATURE_FLOCK 4 - -#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR -#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID -#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK -#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID -#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR -#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID -#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR -#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR - - -/* - * ceph_file_layout - describe data layout for a file/inode - */ -struct ceph_file_layout { - /* file -> object mapping */ - __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple - of page size. */ - __le32 fl_stripe_count; /* over this many objects */ - __le32 fl_object_size; /* until objects are this big, then move to - new objects */ - __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ - - /* pg -> disk layout */ - __le32 fl_object_stripe_unit; /* for per-object parity, if any */ - - /* object -> pg layout */ - __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ - __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ -} __attribute__ ((packed)); - -#define CEPH_MIN_STRIPE_UNIT 65536 - -int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); - - -/* crypto algorithms */ -#define CEPH_CRYPTO_NONE 0x0 -#define CEPH_CRYPTO_AES 0x1 - -/* security/authentication protocols */ -#define CEPH_AUTH_UNKNOWN 0x0 -#define CEPH_AUTH_NONE 0x1 -#define CEPH_AUTH_CEPHX 0x2 - -#define CEPH_AUTH_UID_DEFAULT ((__u64) -1) - - -/********************************************* - * message layer - */ - -/* - * message types - */ - -/* misc */ -#define CEPH_MSG_SHUTDOWN 1 -#define CEPH_MSG_PING 2 - -/* client <-> monitor */ -#define CEPH_MSG_MON_MAP 4 -#define CEPH_MSG_MON_GET_MAP 5 -#define CEPH_MSG_STATFS 13 -#define CEPH_MSG_STATFS_REPLY 14 -#define CEPH_MSG_MON_SUBSCRIBE 15 -#define CEPH_MSG_MON_SUBSCRIBE_ACK 16 -#define CEPH_MSG_AUTH 17 -#define CEPH_MSG_AUTH_REPLY 18 - -/* client <-> mds */ -#define CEPH_MSG_MDS_MAP 21 - -#define CEPH_MSG_CLIENT_SESSION 22 -#define CEPH_MSG_CLIENT_RECONNECT 23 - -#define CEPH_MSG_CLIENT_REQUEST 24 -#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25 -#define CEPH_MSG_CLIENT_REPLY 26 -#define CEPH_MSG_CLIENT_CAPS 0x310 -#define CEPH_MSG_CLIENT_LEASE 0x311 -#define CEPH_MSG_CLIENT_SNAP 0x312 -#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 - -/* pool ops */ -#define CEPH_MSG_POOLOP_REPLY 48 -#define CEPH_MSG_POOLOP 49 - - -/* osd */ -#define CEPH_MSG_OSD_MAP 41 -#define CEPH_MSG_OSD_OP 42 -#define CEPH_MSG_OSD_OPREPLY 43 - -/* pool operations */ -enum { - POOL_OP_CREATE = 0x01, - POOL_OP_DELETE = 0x02, - POOL_OP_AUID_CHANGE = 0x03, - POOL_OP_CREATE_SNAP = 0x11, - POOL_OP_DELETE_SNAP = 0x12, - POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, - POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, -}; - -struct ceph_mon_request_header { - __le64 have_version; - __le16 session_mon; - __le64 session_mon_tid; -} __attribute__ ((packed)); - -struct ceph_mon_statfs { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; -} __attribute__ ((packed)); - -struct ceph_statfs { - __le64 kb, kb_used, kb_avail; - __le64 num_objects; -} __attribute__ ((packed)); - -struct ceph_mon_statfs_reply { - struct ceph_fsid fsid; - __le64 version; - struct ceph_statfs st; -} __attribute__ ((packed)); - -const char *ceph_pool_op_name(int op); - -struct ceph_mon_poolop { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; - __le32 pool; - __le32 op; - __le64 auid; - __le64 snapid; - __le32 name_len; -} __attribute__ ((packed)); - -struct ceph_mon_poolop_reply { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; - __le32 reply_code; - __le32 epoch; - char has_data; - char data[0]; -} __attribute__ ((packed)); - -struct ceph_mon_unmanaged_snap { - __le64 snapid; -} __attribute__ ((packed)); - -struct ceph_osd_getmap { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; - __le32 start; -} __attribute__ ((packed)); - -struct ceph_mds_getmap { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; -} __attribute__ ((packed)); - -struct ceph_client_mount { - struct ceph_mon_request_header monhdr; -} __attribute__ ((packed)); - -struct ceph_mon_subscribe_item { - __le64 have_version; __le64 have; - __u8 onetime; -} __attribute__ ((packed)); - -struct ceph_mon_subscribe_ack { - __le32 duration; /* seconds */ - struct ceph_fsid fsid; -} __attribute__ ((packed)); - -/* - * mds states - * > 0 -> in - * <= 0 -> out - */ -#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */ -#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. - empty log. */ -#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */ -#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */ -#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ -#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ -#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ - -#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ -#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed - operations (import, rename, etc.) */ -#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */ -#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */ -#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */ -#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */ -#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */ - -extern const char *ceph_mds_state_name(int s); - - -/* - * metadata lock types. - * - these are bitmasks.. we can compose them - * - they also define the lock ordering by the MDS - * - a few of these are internal to the mds - */ -#define CEPH_LOCK_DVERSION 1 -#define CEPH_LOCK_DN 2 -#define CEPH_LOCK_ISNAP 16 -#define CEPH_LOCK_IVERSION 32 /* mds internal */ -#define CEPH_LOCK_IFILE 64 -#define CEPH_LOCK_IAUTH 128 -#define CEPH_LOCK_ILINK 256 -#define CEPH_LOCK_IDFT 512 /* dir frag tree */ -#define CEPH_LOCK_INEST 1024 /* mds internal */ -#define CEPH_LOCK_IXATTR 2048 -#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ - -/* client_session ops */ -enum { - CEPH_SESSION_REQUEST_OPEN, - CEPH_SESSION_OPEN, - CEPH_SESSION_REQUEST_CLOSE, - CEPH_SESSION_CLOSE, - CEPH_SESSION_REQUEST_RENEWCAPS, - CEPH_SESSION_RENEWCAPS, - CEPH_SESSION_STALE, - CEPH_SESSION_RECALL_STATE, -}; - -extern const char *ceph_session_op_name(int op); - -struct ceph_mds_session_head { - __le32 op; - __le64 seq; - struct ceph_timespec stamp; - __le32 max_caps, max_leases; -} __attribute__ ((packed)); - -/* client_request */ -/* - * metadata ops. - * & 0x001000 -> write op - * & 0x010000 -> follow symlink (e.g. stat(), not lstat()). - & & 0x100000 -> use weird ino/path trace - */ -#define CEPH_MDS_OP_WRITE 0x001000 -enum { - CEPH_MDS_OP_LOOKUP = 0x00100, - CEPH_MDS_OP_GETATTR = 0x00101, - CEPH_MDS_OP_LOOKUPHASH = 0x00102, - CEPH_MDS_OP_LOOKUPPARENT = 0x00103, - - CEPH_MDS_OP_SETXATTR = 0x01105, - CEPH_MDS_OP_RMXATTR = 0x01106, - CEPH_MDS_OP_SETLAYOUT = 0x01107, - CEPH_MDS_OP_SETATTR = 0x01108, - - CEPH_MDS_OP_MKNOD = 0x01201, - CEPH_MDS_OP_LINK = 0x01202, - CEPH_MDS_OP_UNLINK = 0x01203, - CEPH_MDS_OP_RENAME = 0x01204, - CEPH_MDS_OP_MKDIR = 0x01220, - CEPH_MDS_OP_RMDIR = 0x01221, - CEPH_MDS_OP_SYMLINK = 0x01222, - - CEPH_MDS_OP_CREATE = 0x01301, - CEPH_MDS_OP_OPEN = 0x00302, - CEPH_MDS_OP_READDIR = 0x00305, - - CEPH_MDS_OP_LOOKUPSNAP = 0x00400, - CEPH_MDS_OP_MKSNAP = 0x01400, - CEPH_MDS_OP_RMSNAP = 0x01401, - CEPH_MDS_OP_LSSNAP = 0x00402, -}; - -extern const char *ceph_mds_op_name(int op); - - -#define CEPH_SETATTR_MODE 1 -#define CEPH_SETATTR_UID 2 -#define CEPH_SETATTR_GID 4 -#define CEPH_SETATTR_MTIME 8 -#define CEPH_SETATTR_ATIME 16 -#define CEPH_SETATTR_SIZE 32 -#define CEPH_SETATTR_CTIME 64 - -union ceph_mds_request_args { - struct { - __le32 mask; /* CEPH_CAP_* */ - } __attribute__ ((packed)) getattr; - struct { - __le32 mode; - __le32 uid; - __le32 gid; - struct ceph_timespec mtime; - struct ceph_timespec atime; - __le64 size, old_size; /* old_size needed by truncate */ - __le32 mask; /* CEPH_SETATTR_* */ - } __attribute__ ((packed)) setattr; - struct { - __le32 frag; /* which dir fragment */ - __le32 max_entries; /* how many dentries to grab */ - __le32 max_bytes; - } __attribute__ ((packed)) readdir; - struct { - __le32 mode; - __le32 rdev; - } __attribute__ ((packed)) mknod; - struct { - __le32 mode; - } __attribute__ ((packed)) mkdir; - struct { - __le32 flags; - __le32 mode; - __le32 stripe_unit; /* layout for newly created file */ - __le32 stripe_count; /* ... */ - __le32 object_size; - __le32 file_replication; - __le32 preferred; - } __attribute__ ((packed)) open; - struct { - __le32 flags; - } __attribute__ ((packed)) setxattr; - struct { - struct ceph_file_layout layout; - } __attribute__ ((packed)) setlayout; -} __attribute__ ((packed)); - -#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ -#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ - -struct ceph_mds_request_head { - __le64 oldest_client_tid; - __le32 mdsmap_epoch; /* on client */ - __le32 flags; /* CEPH_MDS_FLAG_* */ - __u8 num_retry, num_fwd; /* count retry, fwd attempts */ - __le16 num_releases; /* # include cap/lease release records */ - __le32 op; /* mds op code */ - __le32 caller_uid, caller_gid; - __le64 ino; /* use this ino for openc, mkdir, mknod, - etc. (if replaying) */ - union ceph_mds_request_args args; -} __attribute__ ((packed)); - -/* cap/lease release record */ -struct ceph_mds_request_release { - __le64 ino, cap_id; /* ino and unique cap id */ - __le32 caps, wanted; /* new issued, wanted */ - __le32 seq, issue_seq, mseq; - __le32 dname_seq; /* if releasing a dentry lease, a */ - __le32 dname_len; /* string follows. */ -} __attribute__ ((packed)); - -/* client reply */ -struct ceph_mds_reply_head { - __le32 op; - __le32 result; - __le32 mdsmap_epoch; - __u8 safe; /* true if committed to disk */ - __u8 is_dentry, is_target; /* true if dentry, target inode records - are included with reply */ -} __attribute__ ((packed)); - -/* one for each node split */ -struct ceph_frag_tree_split { - __le32 frag; /* this frag splits... */ - __le32 by; /* ...by this many bits */ -} __attribute__ ((packed)); - -struct ceph_frag_tree_head { - __le32 nsplits; /* num ceph_frag_tree_split records */ - struct ceph_frag_tree_split splits[]; -} __attribute__ ((packed)); - -/* capability issue, for bundling with mds reply */ -struct ceph_mds_reply_cap { - __le32 caps, wanted; /* caps issued, wanted */ - __le64 cap_id; - __le32 seq, mseq; - __le64 realm; /* snap realm */ - __u8 flags; /* CEPH_CAP_FLAG_* */ -} __attribute__ ((packed)); - -#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ - -/* inode record, for bundling with mds reply */ -struct ceph_mds_reply_inode { - __le64 ino; - __le64 snapid; - __le32 rdev; - __le64 version; /* inode version */ - __le64 xattr_version; /* version for xattr blob */ - struct ceph_mds_reply_cap cap; /* caps issued for this inode */ - struct ceph_file_layout layout; - struct ceph_timespec ctime, mtime, atime; - __le32 time_warp_seq; - __le64 size, max_size, truncate_size; - __le32 truncate_seq; - __le32 mode, uid, gid; - __le32 nlink; - __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */ - struct ceph_timespec rctime; - struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ -} __attribute__ ((packed)); -/* followed by frag array, then symlink string, then xattr blob */ - -/* reply_lease follows dname, and reply_inode */ -struct ceph_mds_reply_lease { - __le16 mask; /* lease type(s) */ - __le32 duration_ms; /* lease duration */ - __le32 seq; -} __attribute__ ((packed)); - -struct ceph_mds_reply_dirfrag { - __le32 frag; /* fragment */ - __le32 auth; /* auth mds, if this is a delegation point */ - __le32 ndist; /* number of mds' this is replicated on */ - __le32 dist[]; -} __attribute__ ((packed)); - -/* file access modes */ -#define CEPH_FILE_MODE_PIN 0 -#define CEPH_FILE_MODE_RD 1 -#define CEPH_FILE_MODE_WR 2 -#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */ -#define CEPH_FILE_MODE_LAZY 4 /* lazy io */ -#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */ - -int ceph_flags_to_mode(int flags); - - -/* capability bits */ -#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ - -/* generic cap bits */ -#define CEPH_CAP_GSHARED 1 /* client can reads */ -#define CEPH_CAP_GEXCL 2 /* client can read and update */ -#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */ -#define CEPH_CAP_GRD 8 /* (file) client can read */ -#define CEPH_CAP_GWR 16 /* (file) client can write */ -#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */ -#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ -#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ - -/* per-lock shift */ -#define CEPH_CAP_SAUTH 2 -#define CEPH_CAP_SLINK 4 -#define CEPH_CAP_SXATTR 6 -#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */ - -#define CEPH_CAP_BITS 16 - -/* composed values */ -#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) -#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH) -#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK) -#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK) -#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR) -#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR) -#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE) -#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE) -#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE) -#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE) -#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE) -#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE) -#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) -#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) -#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) - -/* cap masks (for getattr) */ -#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN -#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */ -#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN -#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED -#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED -#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED -#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED -#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED -#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED -#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED -#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */ -#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED -#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \ - CEPH_CAP_AUTH_SHARED | \ - CEPH_CAP_LINK_SHARED | \ - CEPH_CAP_FILE_SHARED | \ - CEPH_CAP_XATTR_SHARED) - -#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ - CEPH_CAP_LINK_SHARED | \ - CEPH_CAP_XATTR_SHARED | \ - CEPH_CAP_FILE_SHARED) -#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \ - CEPH_CAP_FILE_CACHE) - -#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \ - CEPH_CAP_LINK_EXCL | \ - CEPH_CAP_XATTR_EXCL | \ - CEPH_CAP_FILE_EXCL) -#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \ - CEPH_CAP_FILE_EXCL) -#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) -#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ - CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN) - -#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ - CEPH_LOCK_IXATTR) - -int ceph_caps_for_mode(int mode); - -enum { - CEPH_CAP_OP_GRANT, /* mds->client grant */ - CEPH_CAP_OP_REVOKE, /* mds->client revoke */ - CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */ - CEPH_CAP_OP_EXPORT, /* mds has exported the cap */ - CEPH_CAP_OP_IMPORT, /* mds has imported the cap */ - CEPH_CAP_OP_UPDATE, /* client->mds update */ - CEPH_CAP_OP_DROP, /* client->mds drop cap bits */ - CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */ - CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */ - CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */ - CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */ - CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */ - CEPH_CAP_OP_RENEW, /* client->mds renewal request */ -}; - -extern const char *ceph_cap_op_name(int op); - -/* - * caps message, used for capability callbacks, acks, requests, etc. - */ -struct ceph_mds_caps { - __le32 op; /* CEPH_CAP_OP_* */ - __le64 ino, realm; - __le64 cap_id; - __le32 seq, issue_seq; - __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */ - __le32 migrate_seq; - __le64 snap_follows; - __le32 snap_trace_len; - - /* authlock */ - __le32 uid, gid, mode; - - /* linklock */ - __le32 nlink; - - /* xattrlock */ - __le32 xattr_len; - __le64 xattr_version; - - /* filelock */ - __le64 size, max_size, truncate_size; - __le32 truncate_seq; - struct ceph_timespec mtime, atime, ctime; - struct ceph_file_layout layout; - __le32 time_warp_seq; -} __attribute__ ((packed)); - -/* cap release msg head */ -struct ceph_mds_cap_release { - __le32 num; /* number of cap_items that follow */ -} __attribute__ ((packed)); - -struct ceph_mds_cap_item { - __le64 ino; - __le64 cap_id; - __le32 migrate_seq, seq; -} __attribute__ ((packed)); - -#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */ -#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */ -#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */ -#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */ - -extern const char *ceph_lease_op_name(int o); - -/* lease msg header */ -struct ceph_mds_lease { - __u8 action; /* CEPH_MDS_LEASE_* */ - __le16 mask; /* which lease */ - __le64 ino; - __le64 first, last; /* snap range */ - __le32 seq; - __le32 duration_ms; /* duration of renewal */ -} __attribute__ ((packed)); -/* followed by a __le32+string for dname */ - -/* client reconnect */ -struct ceph_mds_cap_reconnect { - __le64 cap_id; - __le32 wanted; - __le32 issued; - __le64 size; - struct ceph_timespec mtime, atime; - __le64 snaprealm; - __le64 pathbase; /* base ino for our path to this ino */ -} __attribute__ ((packed)); -/* followed by encoded string */ - -struct ceph_mds_snaprealm_reconnect { - __le64 ino; /* snap realm base */ - __le64 seq; /* snap seq for this snap realm */ - __le64 parent; /* parent realm */ -} __attribute__ ((packed)); - -/* - * snaps - */ -enum { - CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */ - CEPH_SNAP_OP_CREATE, - CEPH_SNAP_OP_DESTROY, - CEPH_SNAP_OP_SPLIT, -}; - -extern const char *ceph_snap_op_name(int o); - -/* snap msg header */ -struct ceph_mds_snap_head { - __le32 op; /* CEPH_SNAP_OP_* */ - __le64 split; /* ino to split off, if any */ - __le32 num_split_inos; /* # inos belonging to new child realm */ - __le32 num_split_realms; /* # child realms udner new child realm */ - __le32 trace_len; /* size of snap trace blob */ -} __attribute__ ((packed)); -/* followed by split ino list, then split realms, then the trace blob */ - -/* - * encode info about a snaprealm, as viewed by a client - */ -struct ceph_mds_snap_realm { - __le64 ino; /* ino */ - __le64 created; /* snap: when created */ - __le64 parent; /* ino: parent realm */ - __le64 parent_since; /* snap: same parent since */ - __le64 seq; /* snap: version */ - __le32 num_snaps; - __le32 num_prior_parent_snaps; -} __attribute__ ((packed)); -/* followed by my snap list, then prior parent snap list */ - -#endif diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c deleted file mode 100644 index bd570015d14..00000000000 --- a/fs/ceph/ceph_hash.c +++ /dev/null @@ -1,118 +0,0 @@ - -#include "types.h" - -/* - * Robert Jenkin's hash function. - * http://burtleburtle.net/bob/hash/evahash.html - * This is in the public domain. - */ -#define mix(a, b, c) \ - do { \ - a = a - b; a = a - c; a = a ^ (c >> 13); \ - b = b - c; b = b - a; b = b ^ (a << 8); \ - c = c - a; c = c - b; c = c ^ (b >> 13); \ - a = a - b; a = a - c; a = a ^ (c >> 12); \ - b = b - c; b = b - a; b = b ^ (a << 16); \ - c = c - a; c = c - b; c = c ^ (b >> 5); \ - a = a - b; a = a - c; a = a ^ (c >> 3); \ - b = b - c; b = b - a; b = b ^ (a << 10); \ - c = c - a; c = c - b; c = c ^ (b >> 15); \ - } while (0) - -unsigned ceph_str_hash_rjenkins(const char *str, unsigned length) -{ - const unsigned char *k = (const unsigned char *)str; - __u32 a, b, c; /* the internal state */ - __u32 len; /* how many key bytes still need mixing */ - - /* Set up the internal state */ - len = length; - a = 0x9e3779b9; /* the golden ratio; an arbitrary value */ - b = a; - c = 0; /* variable initialization of internal state */ - - /* handle most of the key */ - while (len >= 12) { - a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) + - ((__u32)k[3] << 24)); - b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) + - ((__u32)k[7] << 24)); - c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) + - ((__u32)k[11] << 24)); - mix(a, b, c); - k = k + 12; - len = len - 12; - } - - /* handle the last 11 bytes */ - c = c + length; - switch (len) { /* all the case statements fall through */ - case 11: - c = c + ((__u32)k[10] << 24); - case 10: - c = c + ((__u32)k[9] << 16); - case 9: - c = c + ((__u32)k[8] << 8); - /* the first byte of c is reserved for the length */ - case 8: - b = b + ((__u32)k[7] << 24); - case 7: - b = b + ((__u32)k[6] << 16); - case 6: - b = b + ((__u32)k[5] << 8); - case 5: - b = b + k[4]; - case 4: - a = a + ((__u32)k[3] << 24); - case 3: - a = a + ((__u32)k[2] << 16); - case 2: - a = a + ((__u32)k[1] << 8); - case 1: - a = a + k[0]; - /* case 0: nothing left to add */ - } - mix(a, b, c); - - return c; -} - -/* - * linux dcache hash - */ -unsigned ceph_str_hash_linux(const char *str, unsigned length) -{ - unsigned long hash = 0; - unsigned char c; - - while (length--) { - c = *str++; - hash = (hash + (c << 4) + (c >> 4)) * 11; - } - return hash; -} - - -unsigned ceph_str_hash(int type, const char *s, unsigned len) -{ - switch (type) { - case CEPH_STR_HASH_LINUX: - return ceph_str_hash_linux(s, len); - case CEPH_STR_HASH_RJENKINS: - return ceph_str_hash_rjenkins(s, len); - default: - return -1; - } -} - -const char *ceph_str_hash_name(int type) -{ - switch (type) { - case CEPH_STR_HASH_LINUX: - return "linux"; - case CEPH_STR_HASH_RJENKINS: - return "rjenkins"; - default: - return "unknown"; - } -} diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h deleted file mode 100644 index 5ac470c433c..00000000000 --- a/fs/ceph/ceph_hash.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _FS_CEPH_HASH_H -#define _FS_CEPH_HASH_H - -#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ -#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ - -extern unsigned ceph_str_hash_linux(const char *s, unsigned len); -extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len); - -extern unsigned ceph_str_hash(int type, const char *s, unsigned len); -extern const char *ceph_str_hash_name(int type); - -#endif diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c deleted file mode 100644 index fabd302e577..00000000000 --- a/fs/ceph/crush/crush.c +++ /dev/null @@ -1,151 +0,0 @@ - -#ifdef __KERNEL__ -# include <linux/slab.h> -#else -# include <stdlib.h> -# include <assert.h> -# define kfree(x) do { if (x) free(x); } while (0) -# define BUG_ON(x) assert(!(x)) -#endif - -#include "crush.h" - -const char *crush_bucket_alg_name(int alg) -{ - switch (alg) { - case CRUSH_BUCKET_UNIFORM: return "uniform"; - case CRUSH_BUCKET_LIST: return "list"; - case CRUSH_BUCKET_TREE: return "tree"; - case CRUSH_BUCKET_STRAW: return "straw"; - default: return "unknown"; - } -} - -/** - * crush_get_bucket_item_weight - Get weight of an item in given bucket - * @b: bucket pointer - * @p: item index in bucket - */ -int crush_get_bucket_item_weight(struct crush_bucket *b, int p) -{ - if (p >= b->size) - return 0; - - switch (b->alg) { - case CRUSH_BUCKET_UNIFORM: - return ((struct crush_bucket_uniform *)b)->item_weight; - case CRUSH_BUCKET_LIST: - return ((struct crush_bucket_list *)b)->item_weights[p]; - case CRUSH_BUCKET_TREE: - if (p & 1) - return ((struct crush_bucket_tree *)b)->node_weights[p]; - return 0; - case CRUSH_BUCKET_STRAW: - return ((struct crush_bucket_straw *)b)->item_weights[p]; - } - return 0; -} - -/** - * crush_calc_parents - Calculate parent vectors for the given crush map. - * @map: crush_map pointer - */ -void crush_calc_parents(struct crush_map *map) -{ - int i, b, c; - - for (b = 0; b < map->max_buckets; b++) { - if (map->buckets[b] == NULL) - continue; - for (i = 0; i < map->buckets[b]->size; i++) { - c = map->buckets[b]->items[i]; - BUG_ON(c >= map->max_devices || - c < -map->max_buckets); - if (c >= 0) - map->device_parents[c] = map->buckets[b]->id; - else - map->bucket_parents[-1-c] = map->buckets[b]->id; - } - } -} - -void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) -{ - kfree(b->h.perm); - kfree(b->h.items); - kfree(b); -} - -void crush_destroy_bucket_list(struct crush_bucket_list *b) -{ - kfree(b->item_weights); - kfree(b->sum_weights); - kfree(b->h.perm); - kfree(b->h.items); - kfree(b); -} - -void crush_destroy_bucket_tree(struct crush_bucket_tree *b) -{ - kfree(b->node_weights); - kfree(b); -} - -void crush_destroy_bucket_straw(struct crush_bucket_straw *b) -{ - kfree(b->straws); - kfree(b->item_weights); - kfree(b->h.perm); - kfree(b->h.items); - kfree(b); -} - -void crush_destroy_bucket(struct crush_bucket *b) -{ - switch (b->alg) { - case CRUSH_BUCKET_UNIFORM: - crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b); - break; - case CRUSH_BUCKET_LIST: - crush_destroy_bucket_list((struct crush_bucket_list *)b); - break; - case CRUSH_BUCKET_TREE: - crush_destroy_bucket_tree((struct crush_bucket_tree *)b); - break; - case CRUSH_BUCKET_STRAW: - crush_destroy_bucket_straw((struct crush_bucket_straw *)b); - break; - } -} - -/** - * crush_destroy - Destroy a crush_map - * @map: crush_map pointer - */ -void crush_destroy(struct crush_map *map) -{ - int b; - - /* buckets */ - if (map->buckets) { - for (b = 0; b < map->max_buckets; b++) { - if (map->buckets[b] == NULL) - continue; - crush_destroy_bucket(map->buckets[b]); - } - kfree(map->buckets); - } - - /* rules */ - if (map->rules) { - for (b = 0; b < map->max_rules; b++) - kfree(map->rules[b]); - kfree(map->rules); - } - - kfree(map->bucket_parents); - kfree(map->device_parents); - kfree(map); -} - - diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h deleted file mode 100644 index dcd7e752370..00000000000 --- a/fs/ceph/crush/crush.h +++ /dev/null @@ -1,180 +0,0 @@ -#ifndef _CRUSH_CRUSH_H -#define _CRUSH_CRUSH_H - -#include <linux/types.h> - -/* - * CRUSH is a pseudo-random data distribution algorithm that - * efficiently distributes input values (typically, data objects) - * across a heterogeneous, structured storage cluster. - * - * The algorithm was originally described in detail in this paper - * (although the algorithm has evolved somewhat since then): - * - * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf - * - * LGPL2 - */ - - -#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ - - -#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ -#define CRUSH_MAX_SET 10 /* max size of a mapping result */ - - -/* - * CRUSH uses user-defined "rules" to describe how inputs should be - * mapped to devices. A rule consists of sequence of steps to perform - * to generate the set of output devices. - */ -struct crush_rule_step { - __u32 op; - __s32 arg1; - __s32 arg2; -}; - -/* step op codes */ -enum { - CRUSH_RULE_NOOP = 0, - CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */ - CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */ - /* arg2 = type */ - CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ - CRUSH_RULE_EMIT = 4, /* no args */ - CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, - CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, -}; - -/* - * for specifying choose num (arg1) relative to the max parameter - * passed to do_rule - */ -#define CRUSH_CHOOSE_N 0 -#define CRUSH_CHOOSE_N_MINUS(x) (-(x)) - -/* - * The rule mask is used to describe what the rule is intended for. - * Given a ruleset and size of output set, we search through the - * rule list for a matching rule_mask. - */ -struct crush_rule_mask { - __u8 ruleset; - __u8 type; - __u8 min_size; - __u8 max_size; -}; - -struct crush_rule { - __u32 len; - struct crush_rule_mask mask; - struct crush_rule_step steps[0]; -}; - -#define crush_rule_size(len) (sizeof(struct crush_rule) + \ - (len)*sizeof(struct crush_rule_step)) - - - -/* - * A bucket is a named container of other items (either devices or - * other buckets). Items within a bucket are chosen using one of a - * few different algorithms. The table summarizes how the speed of - * each option measures up against mapping stability when items are - * added or removed. - * - * Bucket Alg Speed Additions Removals - * ------------------------------------------------ - * uniform O(1) poor poor - * list O(n) optimal poor - * tree O(log n) good good - * straw O(n) optimal optimal - */ -enum { - CRUSH_BUCKET_UNIFORM = 1, - CRUSH_BUCKET_LIST = 2, - CRUSH_BUCKET_TREE = 3, - CRUSH_BUCKET_STRAW = 4 -}; -extern const char *crush_bucket_alg_name(int alg); - -struct crush_bucket { - __s32 id; /* this'll be negative */ - __u16 type; /* non-zero; type=0 is reserved for devices */ - __u8 alg; /* one of CRUSH_BUCKET_* */ - __u8 hash; /* which hash function to use, CRUSH_HASH_* */ - __u32 weight; /* 16-bit fixed point */ - __u32 size; /* num items */ - __s32 *items; - - /* - * cached random permutation: used for uniform bucket and for - * the linear search fallback for the other bucket types. - */ - __u32 perm_x; /* @x for which *perm is defined */ - __u32 perm_n; /* num elements of *perm that are permuted/defined */ - __u32 *perm; -}; - -struct crush_bucket_uniform { - struct crush_bucket h; - __u32 item_weight; /* 16-bit fixed point; all items equally weighted */ -}; - -struct crush_bucket_list { - struct crush_bucket h; - __u32 *item_weights; /* 16-bit fixed point */ - __u32 *sum_weights; /* 16-bit fixed point. element i is sum - of weights 0..i, inclusive */ -}; - -struct crush_bucket_tree { - struct crush_bucket h; /* note: h.size is _tree_ size, not number of - actual items */ - __u8 num_nodes; - __u32 *node_weights; -}; - -struct crush_bucket_straw { - struct crush_bucket h; - __u32 *item_weights; /* 16-bit fixed point */ - __u32 *straws; /* 16-bit fixed point */ -}; - - - -/* - * CRUSH map includes all buckets, rules, etc. - */ -struct crush_map { - struct crush_bucket **buckets; - struct crush_rule **rules; - - /* - * Parent pointers to identify the parent bucket a device or - * bucket in the hierarchy. If an item appears more than - * once, this is the _last_ time it appeared (where buckets - * are processed in bucket id order, from -1 on down to - * -max_buckets. - */ - __u32 *bucket_parents; - __u32 *device_parents; - - __s32 max_buckets; - __u32 max_rules; - __s32 max_devices; -}; - - -/* crush.c */ -extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos); -extern void crush_calc_parents(struct crush_map *map); -extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); -extern void crush_destroy_bucket_list(struct crush_bucket_list *b); -extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); -extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); -extern void crush_destroy_bucket(struct crush_bucket *b); -extern void crush_destroy(struct crush_map *map); - -#endif diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c deleted file mode 100644 index 5873aed694b..00000000000 --- a/fs/ceph/crush/hash.c +++ /dev/null @@ -1,149 +0,0 @@ - -#include <linux/types.h> -#include "hash.h" - -/* - * Robert Jenkins' function for mixing 32-bit values - * http://burtleburtle.net/bob/hash/evahash.html - * a, b = random bits, c = input and output - */ -#define crush_hashmix(a, b, c) do { \ - a = a-b; a = a-c; a = a^(c>>13); \ - b = b-c; b = b-a; b = b^(a<<8); \ - c = c-a; c = c-b; c = c^(b>>13); \ - a = a-b; a = a-c; a = a^(c>>12); \ - b = b-c; b = b-a; b = b^(a<<16); \ - c = c-a; c = c-b; c = c^(b>>5); \ - a = a-b; a = a-c; a = a^(c>>3); \ - b = b-c; b = b-a; b = b^(a<<10); \ - c = c-a; c = c-b; c = c^(b>>15); \ - } while (0) - -#define crush_hash_seed 1315423911 - -static __u32 crush_hash32_rjenkins1(__u32 a) -{ - __u32 hash = crush_hash_seed ^ a; - __u32 b = a; - __u32 x = 231232; - __u32 y = 1232; - crush_hashmix(b, x, hash); - crush_hashmix(y, a, hash); - return hash; -} - -static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b) -{ - __u32 hash = crush_hash_seed ^ a ^ b; - __u32 x = 231232; - __u32 y = 1232; - crush_hashmix(a, b, hash); - crush_hashmix(x, a, hash); - crush_hashmix(b, y, hash); - return hash; -} - -static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c) -{ - __u32 hash = crush_hash_seed ^ a ^ b ^ c; - __u32 x = 231232; - __u32 y = 1232; - crush_hashmix(a, b, hash); - crush_hashmix(c, x, hash); - crush_hashmix(y, a, hash); - crush_hashmix(b, x, hash); - crush_hashmix(y, c, hash); - return hash; -} - -static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d) -{ - __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d; - __u32 x = 231232; - __u32 y = 1232; - crush_hashmix(a, b, hash); - crush_hashmix(c, d, hash); - crush_hashmix(a, x, hash); - crush_hashmix(y, b, hash); - crush_hashmix(c, x, hash); - crush_hashmix(y, d, hash); - return hash; -} - -static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d, - __u32 e) -{ - __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e; - __u32 x = 231232; - __u32 y = 1232; - crush_hashmix(a, b, hash); - crush_hashmix(c, d, hash); - crush_hashmix(e, x, hash); - crush_hashmix(y, a, hash); - crush_hashmix(b, x, hash); - crush_hashmix(y, c, hash); - crush_hashmix(d, x, hash); - crush_hashmix(y, e, hash); - return hash; -} - - -__u32 crush_hash32(int type, __u32 a) -{ - switch (type) { - case CRUSH_HASH_RJENKINS1: - return crush_hash32_rjenkins1(a); - default: - return 0; - } -} - -__u32 crush_hash32_2(int type, __u32 a, __u32 b) -{ - switch (type) { - case CRUSH_HASH_RJENKINS1: - return crush_hash32_rjenkins1_2(a, b); - default: - return 0; - } -} - -__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c) -{ - switch (type) { - case CRUSH_HASH_RJENKINS1: - return crush_hash32_rjenkins1_3(a, b, c); - default: - return 0; - } -} - -__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d) -{ - switch (type) { - case CRUSH_HASH_RJENKINS1: - return crush_hash32_rjenkins1_4(a, b, c, d); - default: - return 0; - } -} - -__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e) -{ - switch (type) { - case CRUSH_HASH_RJENKINS1: - return crush_hash32_rjenkins1_5(a, b, c, d, e); - default: - return 0; - } -} - -const char *crush_hash_name(int type) -{ - switch (type) { - case CRUSH_HASH_RJENKINS1: - return "rjenkins1"; - default: - return "unknown"; - } -} diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h deleted file mode 100644 index ff48e110e4b..00000000000 --- a/fs/ceph/crush/hash.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _CRUSH_HASH_H -#define _CRUSH_HASH_H - -#define CRUSH_HASH_RJENKINS1 0 - -#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1 - -extern const char *crush_hash_name(int type); - -extern __u32 crush_hash32(int type, __u32 a); -extern __u32 crush_hash32_2(int type, __u32 a, __u32 b); -extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c); -extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d); -extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, - __u32 e); - -#endif diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c deleted file mode 100644 index a4eec133258..00000000000 --- a/fs/ceph/crush/mapper.c +++ /dev/null @@ -1,609 +0,0 @@ - -#ifdef __KERNEL__ -# include <linux/string.h> -# include <linux/slab.h> -# include <linux/bug.h> -# include <linux/kernel.h> -# ifndef dprintk -# define dprintk(args...) -# endif -#else -# include <string.h> -# include <stdio.h> -# include <stdlib.h> -# include <assert.h> -# define BUG_ON(x) assert(!(x)) -# define dprintk(args...) /* printf(args) */ -# define kmalloc(x, f) malloc(x) -# define kfree(x) free(x) -#endif - -#include "crush.h" -#include "hash.h" - -/* - * Implement the core CRUSH mapping algorithm. - */ - -/** - * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. - * @map: the crush_map - * @ruleset: the storage ruleset id (user defined) - * @type: storage ruleset type (user defined) - * @size: output set size - */ -int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) -{ - int i; - - for (i = 0; i < map->max_rules; i++) { - if (map->rules[i] && - map->rules[i]->mask.ruleset == ruleset && - map->rules[i]->mask.type == type && - map->rules[i]->mask.min_size <= size && - map->rules[i]->mask.max_size >= size) - return i; - } - return -1; -} - - -/* - * bucket choose methods - * - * For each bucket algorithm, we have a "choose" method that, given a - * crush input @x and replica position (usually, position in output set) @r, - * will produce an item in the bucket. - */ - -/* - * Choose based on a random permutation of the bucket. - * - * We used to use some prime number arithmetic to do this, but it - * wasn't very random, and had some other bad behaviors. Instead, we - * calculate an actual random permutation of the bucket members. - * Since this is expensive, we optimize for the r=0 case, which - * captures the vast majority of calls. - */ -static int bucket_perm_choose(struct crush_bucket *bucket, - int x, int r) -{ - unsigned pr = r % bucket->size; - unsigned i, s; - - /* start a new permutation if @x has changed */ - if (bucket->perm_x != x || bucket->perm_n == 0) { - dprintk("bucket %d new x=%d\n", bucket->id, x); - bucket->perm_x = x; - - /* optimize common r=0 case */ - if (pr == 0) { - s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % - bucket->size; - bucket->perm[0] = s; - bucket->perm_n = 0xffff; /* magic value, see below */ - goto out; - } - - for (i = 0; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm_n = 0; - } else if (bucket->perm_n == 0xffff) { - /* clean up after the r=0 case above */ - for (i = 1; i < bucket->size; i++) - bucket->perm[i] = i; - bucket->perm[bucket->perm[0]] = 0; - bucket->perm_n = 1; - } - - /* calculate permutation up to pr */ - for (i = 0; i < bucket->perm_n; i++) - dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); - while (bucket->perm_n <= pr) { - unsigned p = bucket->perm_n; - /* no point in swapping the final entry */ - if (p < bucket->size - 1) { - i = crush_hash32_3(bucket->hash, x, bucket->id, p) % - (bucket->size - p); - if (i) { - unsigned t = bucket->perm[p + i]; - bucket->perm[p + i] = bucket->perm[p]; - bucket->perm[p] = t; - } - dprintk(" perm_choose swap %d with %d\n", p, p+i); - } - bucket->perm_n++; - } - for (i = 0; i < bucket->size; i++) - dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); - - s = bucket->perm[pr]; -out: - dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, - bucket->size, x, r, pr, s); - return bucket->items[s]; -} - -/* uniform */ -static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, - int x, int r) -{ - return bucket_perm_choose(&bucket->h, x, r); -} - -/* list */ -static int bucket_list_choose(struct crush_bucket_list *bucket, - int x, int r) -{ - int i; - - for (i = bucket->h.size-1; i >= 0; i--) { - __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], - r, bucket->h.id); - w &= 0xffff; - dprintk("list_choose i=%d x=%d r=%d item %d weight %x " - "sw %x rand %llx", - i, x, r, bucket->h.items[i], bucket->item_weights[i], - bucket->sum_weights[i], w); - w *= bucket->sum_weights[i]; - w = w >> 16; - /*dprintk(" scaled %llx\n", w);*/ - if (w < bucket->item_weights[i]) - return bucket->h.items[i]; - } - - BUG_ON(1); - return 0; -} - - -/* (binary) tree */ -static int height(int n) -{ - int h = 0; - while ((n & 1) == 0) { - h++; - n = n >> 1; - } - return h; -} - -static int left(int x) -{ - int h = height(x); - return x - (1 << (h-1)); -} - -static int right(int x) -{ - int h = height(x); - return x + (1 << (h-1)); -} - -static int terminal(int x) -{ - return x & 1; -} - -static int bucket_tree_choose(struct crush_bucket_tree *bucket, - int x, int r) -{ - int n, l; - __u32 w; - __u64 t; - - /* start at root */ - n = bucket->num_nodes >> 1; - - while (!terminal(n)) { - /* pick point in [0, w) */ - w = bucket->node_weights[n]; - t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, - bucket->h.id) * (__u64)w; - t = t >> 32; - - /* descend to the left or right? */ - l = left(n); - if (t < bucket->node_weights[l]) - n = l; - else - n = right(n); - } - - return bucket->h.items[n >> 1]; -} - - -/* straw */ - -static int bucket_straw_choose(struct crush_bucket_straw *bucket, - int x, int r) -{ - int i; - int high = 0; - __u64 high_draw = 0; - __u64 draw; - - for (i = 0; i < bucket->h.size; i++) { - draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r); - draw &= 0xffff; - draw *= bucket->straws[i]; - if (i == 0 || draw > high_draw) { - high = i; - high_draw = draw; - } - } - return bucket->h.items[high]; -} - -static int crush_bucket_choose(struct crush_bucket *in, int x, int r) -{ - dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); - switch (in->alg) { - case CRUSH_BUCKET_UNIFORM: - return bucket_uniform_choose((struct crush_bucket_uniform *)in, - x, r); - case CRUSH_BUCKET_LIST: - return bucket_list_choose((struct crush_bucket_list *)in, - x, r); - case CRUSH_BUCKET_TREE: - return bucket_tree_choose((struct crush_bucket_tree *)in, - x, r); - case CRUSH_BUCKET_STRAW: - return bucket_straw_choose((struct crush_bucket_straw *)in, - x, r); - default: - BUG_ON(1); - return in->items[0]; - } -} - -/* - * true if device is marked "out" (failed, fully offloaded) - * of the cluster - */ -static int is_out(struct crush_map *map, __u32 *weight, int item, int x) -{ - if (weight[item] >= 0x10000) - return 0; - if (weight[item] == 0) - return 1; - if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) - < weight[item]) - return 0; - return 1; -} - -/** - * crush_choose - choose numrep distinct items of given type - * @map: the crush_map - * @bucket: the bucket we are choose an item from - * @x: crush input value - * @numrep: the number of items to choose - * @type: the type of item to choose - * @out: pointer to output vector - * @outpos: our position in that vector - * @firstn: true if choosing "first n" items, false if choosing "indep" - * @recurse_to_leaf: true if we want one device under each item of given type - * @out2: second output vector for leaf items (if @recurse_to_leaf) - */ -static int crush_choose(struct crush_map *map, - struct crush_bucket *bucket, - __u32 *weight, - int x, int numrep, int type, - int *out, int outpos, - int firstn, int recurse_to_leaf, - int *out2) -{ - int rep; - int ftotal, flocal; - int retry_descent, retry_bucket, skip_rep; - struct crush_bucket *in = bucket; - int r; - int i; - int item = 0; - int itemtype; - int collide, reject; - const int orig_tries = 5; /* attempts before we fall back to search */ - - dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", - bucket->id, x, outpos, numrep); - - for (rep = outpos; rep < numrep; rep++) { - /* keep trying until we get a non-out, non-colliding item */ - ftotal = 0; - skip_rep = 0; - do { - retry_descent = 0; - in = bucket; /* initial bucket */ - - /* choose through intervening buckets */ - flocal = 0; - do { - collide = 0; - retry_bucket = 0; - r = rep; - if (in->alg == CRUSH_BUCKET_UNIFORM) { - /* be careful */ - if (firstn || numrep >= in->size) - /* r' = r + f_total */ - r += ftotal; - else if (in->size % numrep == 0) - /* r'=r+(n+1)*f_local */ - r += (numrep+1) * - (flocal+ftotal); - else - /* r' = r + n*f_local */ - r += numrep * (flocal+ftotal); - } else { - if (firstn) - /* r' = r + f_total */ - r += ftotal; - else - /* r' = r + n*f_local */ - r += numrep * (flocal+ftotal); - } - - /* bucket choose */ - if (in->size == 0) { - reject = 1; - goto reject; - } - if (flocal >= (in->size>>1) && - flocal > orig_tries) - item = bucket_perm_choose(in, x, r); - else - item = crush_bucket_choose(in, x, r); - BUG_ON(item >= map->max_devices); - - /* desired type? */ - if (item < 0) - itemtype = map->buckets[-1-item]->type; - else - itemtype = 0; - dprintk(" item %d type %d\n", item, itemtype); - - /* keep going? */ - if (itemtype != type) { - BUG_ON(item >= 0 || - (-1-item) >= map->max_buckets); - in = map->buckets[-1-item]; - retry_bucket = 1; - continue; - } - - /* collision? */ - for (i = 0; i < outpos; i++) { - if (out[i] == item) { - collide = 1; - break; - } - } - - reject = 0; - if (recurse_to_leaf) { - if (item < 0) { - if (crush_choose(map, - map->buckets[-1-item], - weight, - x, outpos+1, 0, - out2, outpos, - firstn, 0, - NULL) <= outpos) - /* didn't get leaf */ - reject = 1; - } else { - /* we already have a leaf! */ - out2[outpos] = item; - } - } - - if (!reject) { - /* out? */ - if (itemtype == 0) - reject = is_out(map, weight, - item, x); - else - reject = 0; - } - -reject: - if (reject || collide) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - /* retry locally a few times */ - retry_bucket = 1; - else if (flocal < in->size + orig_tries) - /* exhaustive bucket search */ - retry_bucket = 1; - else if (ftotal < 20) - /* then retry descent */ - retry_descent = 1; - else - /* else give up */ - skip_rep = 1; - dprintk(" reject %d collide %d " - "ftotal %d flocal %d\n", - reject, collide, ftotal, - flocal); - } - } while (retry_bucket); - } while (retry_descent); - - if (skip_rep) { - dprintk("skip rep\n"); - continue; - } - - dprintk("CHOOSE got %d\n", item); - out[outpos] = item; - outpos++; - } - - dprintk("CHOOSE returns %d\n", outpos); - return outpos; -} - - -/** - * crush_do_rule - calculate a mapping with the given input and rule - * @map: the crush_map - * @ruleno: the rule id - * @x: hash input - * @result: pointer to result vector - * @result_max: maximum result size - * @force: force initial replica choice; -1 for none - */ -int crush_do_rule(struct crush_map *map, - int ruleno, int x, int *result, int result_max, - int force, __u32 *weight) -{ - int result_len; - int force_context[CRUSH_MAX_DEPTH]; - int force_pos = -1; - int a[CRUSH_MAX_SET]; - int b[CRUSH_MAX_SET]; - int c[CRUSH_MAX_SET]; - int recurse_to_leaf; - int *w; - int wsize = 0; - int *o; - int osize; - int *tmp; - struct crush_rule *rule; - int step; - int i, j; - int numrep; - int firstn; - int rc = -1; - - BUG_ON(ruleno >= map->max_rules); - - rule = map->rules[ruleno]; - result_len = 0; - w = a; - o = b; - - /* - * determine hierarchical context of force, if any. note - * that this may or may not correspond to the specific types - * referenced by the crush rule. - */ - if (force >= 0) { - if (force >= map->max_devices || - map->device_parents[force] == 0) { - /*dprintk("CRUSH: forcefed device dne\n");*/ - rc = -1; /* force fed device dne */ - goto out; - } - if (!is_out(map, weight, force, x)) { - while (1) { - force_context[++force_pos] = force; - if (force >= 0) - force = map->device_parents[force]; - else - force = map->bucket_parents[-1-force]; - if (force == 0) - break; - } - } - } - - for (step = 0; step < rule->len; step++) { - firstn = 0; - switch (rule->steps[step].op) { - case CRUSH_RULE_TAKE: - w[0] = rule->steps[step].arg1; - if (force_pos >= 0) { - BUG_ON(force_context[force_pos] != w[0]); - force_pos--; - } - wsize = 1; - break; - - case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: - case CRUSH_RULE_CHOOSE_FIRSTN: - firstn = 1; - case CRUSH_RULE_CHOOSE_LEAF_INDEP: - case CRUSH_RULE_CHOOSE_INDEP: - BUG_ON(wsize == 0); - - recurse_to_leaf = - rule->steps[step].op == - CRUSH_RULE_CHOOSE_LEAF_FIRSTN || - rule->steps[step].op == - CRUSH_RULE_CHOOSE_LEAF_INDEP; - - /* reset output */ - osize = 0; - - for (i = 0; i < wsize; i++) { - /* - * see CRUSH_N, CRUSH_N_MINUS macros. - * basically, numrep <= 0 means relative to - * the provided result_max - */ - numrep = rule->steps[step].arg1; - if (numrep <= 0) { - numrep += result_max; - if (numrep <= 0) - continue; - } - j = 0; - if (osize == 0 && force_pos >= 0) { - /* skip any intermediate types */ - while (force_pos && - force_context[force_pos] < 0 && - rule->steps[step].arg2 != - map->buckets[-1 - - force_context[force_pos]]->type) - force_pos--; - o[osize] = force_context[force_pos]; - if (recurse_to_leaf) - c[osize] = force_context[0]; - j++; - force_pos--; - } - osize += crush_choose(map, - map->buckets[-1-w[i]], - weight, - x, numrep, - rule->steps[step].arg2, - o+osize, j, - firstn, - recurse_to_leaf, c+osize); - } - - if (recurse_to_leaf) - /* copy final _leaf_ values to output set */ - memcpy(o, c, osize*sizeof(*o)); - - /* swap t and w arrays */ - tmp = o; - o = w; - w = tmp; - wsize = osize; - break; - - - case CRUSH_RULE_EMIT: - for (i = 0; i < wsize && result_len < result_max; i++) { - result[result_len] = w[i]; - result_len++; - } - wsize = 0; - break; - - default: - BUG_ON(1); - } - } - rc = result_len; - -out: - return rc; -} - - diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h deleted file mode 100644 index 98e90046fd9..00000000000 --- a/fs/ceph/crush/mapper.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _CRUSH_MAPPER_H -#define _CRUSH_MAPPER_H - -/* - * CRUSH functions for find rules and then mapping an input to an - * output set. - * - * LGPL2 - */ - -#include "crush.h" - -extern int crush_find_rule(struct crush_map *map, int pool, int type, int size); -extern int crush_do_rule(struct crush_map *map, - int ruleno, - int x, int *result, int result_max, - int forcefeed, /* -1 for none */ - __u32 *weights); - -#endif diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c deleted file mode 100644 index f704b3b6242..00000000000 --- a/fs/ceph/crypto.c +++ /dev/null @@ -1,409 +0,0 @@ - -#include "ceph_debug.h" - -#include <linux/err.h> -#include <linux/scatterlist.h> -#include <linux/slab.h> -#include <crypto/hash.h> - -#include "crypto.h" -#include "decode.h" - -int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) -{ - if (*p + sizeof(u16) + sizeof(key->created) + - sizeof(u16) + key->len > end) - return -ERANGE; - ceph_encode_16(p, key->type); - ceph_encode_copy(p, &key->created, sizeof(key->created)); - ceph_encode_16(p, key->len); - ceph_encode_copy(p, key->key, key->len); - return 0; -} - -int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) -{ - ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad); - key->type = ceph_decode_16(p); - ceph_decode_copy(p, &key->created, sizeof(key->created)); - key->len = ceph_decode_16(p); - ceph_decode_need(p, end, key->len, bad); - key->key = kmalloc(key->len, GFP_NOFS); - if (!key->key) - return -ENOMEM; - ceph_decode_copy(p, key->key, key->len); - return 0; - -bad: - dout("failed to decode crypto key\n"); - return -EINVAL; -} - -int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) -{ - int inlen = strlen(inkey); - int blen = inlen * 3 / 4; - void *buf, *p; - int ret; - - dout("crypto_key_unarmor %s\n", inkey); - buf = kmalloc(blen, GFP_NOFS); - if (!buf) - return -ENOMEM; - blen = ceph_unarmor(buf, inkey, inkey+inlen); - if (blen < 0) { - kfree(buf); - return blen; - } - - p = buf; - ret = ceph_crypto_key_decode(key, &p, p + blen); - kfree(buf); - if (ret) - return ret; - dout("crypto_key_unarmor key %p type %d len %d\n", key, - key->type, key->len); - return 0; -} - - - -#define AES_KEY_SIZE 16 - -static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) -{ - return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); -} - -const u8 *aes_iv = "cephsageyudagreg"; - -int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, - const void *src, size_t src_len) -{ - struct scatterlist sg_in[2], sg_out[1]; - struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); - struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; - int ret; - void *iv; - int ivsize; - size_t zero_padding = (0x10 - (src_len & 0x0f)); - char pad[16]; - - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - memset(pad, zero_padding, zero_padding); - - *dst_len = src_len + zero_padding; - - crypto_blkcipher_setkey((void *)tfm, key, key_len); - sg_init_table(sg_in, 2); - sg_set_buf(&sg_in[0], src, src_len); - sg_set_buf(&sg_in[1], pad, zero_padding); - sg_init_table(sg_out, 1); - sg_set_buf(sg_out, dst, *dst_len); - iv = crypto_blkcipher_crt(tfm)->iv; - ivsize = crypto_blkcipher_ivsize(tfm); - - memcpy(iv, aes_iv, ivsize); - /* - print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, - key, key_len, 1); - print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1, - src, src_len, 1); - print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, - pad, zero_padding, 1); - */ - ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in, - src_len + zero_padding); - crypto_free_blkcipher(tfm); - if (ret < 0) - pr_err("ceph_aes_crypt failed %d\n", ret); - /* - print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, - dst, *dst_len, 1); - */ - return 0; -} - -int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len, - const void *src1, size_t src1_len, - const void *src2, size_t src2_len) -{ - struct scatterlist sg_in[3], sg_out[1]; - struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); - struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; - int ret; - void *iv; - int ivsize; - size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f)); - char pad[16]; - - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - memset(pad, zero_padding, zero_padding); - - *dst_len = src1_len + src2_len + zero_padding; - - crypto_blkcipher_setkey((void *)tfm, key, key_len); - sg_init_table(sg_in, 3); - sg_set_buf(&sg_in[0], src1, src1_len); - sg_set_buf(&sg_in[1], src2, src2_len); - sg_set_buf(&sg_in[2], pad, zero_padding); - sg_init_table(sg_out, 1); - sg_set_buf(sg_out, dst, *dst_len); - iv = crypto_blkcipher_crt(tfm)->iv; - ivsize = crypto_blkcipher_ivsize(tfm); - - memcpy(iv, aes_iv, ivsize); - /* - print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, - key, key_len, 1); - print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1, - src1, src1_len, 1); - print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1, - src2, src2_len, 1); - print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, - pad, zero_padding, 1); - */ - ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in, - src1_len + src2_len + zero_padding); - crypto_free_blkcipher(tfm); - if (ret < 0) - pr_err("ceph_aes_crypt2 failed %d\n", ret); - /* - print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, - dst, *dst_len, 1); - */ - return 0; -} - -int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, - const void *src, size_t src_len) -{ - struct scatterlist sg_in[1], sg_out[2]; - struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); - struct blkcipher_desc desc = { .tfm = tfm }; - char pad[16]; - void *iv; - int ivsize; - int ret; - int last_byte; - - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - crypto_blkcipher_setkey((void *)tfm, key, key_len); - sg_init_table(sg_in, 1); - sg_init_table(sg_out, 2); - sg_set_buf(sg_in, src, src_len); - sg_set_buf(&sg_out[0], dst, *dst_len); - sg_set_buf(&sg_out[1], pad, sizeof(pad)); - - iv = crypto_blkcipher_crt(tfm)->iv; - ivsize = crypto_blkcipher_ivsize(tfm); - - memcpy(iv, aes_iv, ivsize); - - /* - print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, - key, key_len, 1); - print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, - src, src_len, 1); - */ - - ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len); - crypto_free_blkcipher(tfm); - if (ret < 0) { - pr_err("ceph_aes_decrypt failed %d\n", ret); - return ret; - } - - if (src_len <= *dst_len) - last_byte = ((char *)dst)[src_len - 1]; - else - last_byte = pad[src_len - *dst_len - 1]; - if (last_byte <= 16 && src_len >= last_byte) { - *dst_len = src_len - last_byte; - } else { - pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n", - last_byte, (int)src_len); - return -EPERM; /* bad padding */ - } - /* - print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1, - dst, *dst_len, 1); - */ - return 0; -} - -int ceph_aes_decrypt2(const void *key, int key_len, - void *dst1, size_t *dst1_len, - void *dst2, size_t *dst2_len, - const void *src, size_t src_len) -{ - struct scatterlist sg_in[1], sg_out[3]; - struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); - struct blkcipher_desc desc = { .tfm = tfm }; - char pad[16]; - void *iv; - int ivsize; - int ret; - int last_byte; - - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - sg_init_table(sg_in, 1); - sg_set_buf(sg_in, src, src_len); - sg_init_table(sg_out, 3); - sg_set_buf(&sg_out[0], dst1, *dst1_len); - sg_set_buf(&sg_out[1], dst2, *dst2_len); - sg_set_buf(&sg_out[2], pad, sizeof(pad)); - - crypto_blkcipher_setkey((void *)tfm, key, key_len); - iv = crypto_blkcipher_crt(tfm)->iv; - ivsize = crypto_blkcipher_ivsize(tfm); - - memcpy(iv, aes_iv, ivsize); - - /* - print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, - key, key_len, 1); - print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, - src, src_len, 1); - */ - - ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len); - crypto_free_blkcipher(tfm); - if (ret < 0) { - pr_err("ceph_aes_decrypt failed %d\n", ret); - return ret; - } - - if (src_len <= *dst1_len) - last_byte = ((char *)dst1)[src_len - 1]; - else if (src_len <= *dst1_len + *dst2_len) - last_byte = ((char *)dst2)[src_len - *dst1_len - 1]; - else - last_byte = pad[src_len - *dst1_len - *dst2_len - 1]; - if (last_byte <= 16 && src_len >= last_byte) { - src_len -= last_byte; - } else { - pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n", - last_byte, (int)src_len); - return -EPERM; /* bad padding */ - } - - if (src_len < *dst1_len) { - *dst1_len = src_len; - *dst2_len = 0; - } else { - *dst2_len = src_len - *dst1_len; - } - /* - print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1, - dst1, *dst1_len, 1); - print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1, - dst2, *dst2_len, 1); - */ - - return 0; -} - - -int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, - const void *src, size_t src_len) -{ - switch (secret->type) { - case CEPH_CRYPTO_NONE: - if (*dst_len < src_len) - return -ERANGE; - memcpy(dst, src, src_len); - *dst_len = src_len; - return 0; - - case CEPH_CRYPTO_AES: - return ceph_aes_decrypt(secret->key, secret->len, dst, - dst_len, src, src_len); - - default: - return -EINVAL; - } -} - -int ceph_decrypt2(struct ceph_crypto_key *secret, - void *dst1, size_t *dst1_len, - void *dst2, size_t *dst2_len, - const void *src, size_t src_len) -{ - size_t t; - - switch (secret->type) { - case CEPH_CRYPTO_NONE: - if (*dst1_len + *dst2_len < src_len) - return -ERANGE; - t = min(*dst1_len, src_len); - memcpy(dst1, src, t); - *dst1_len = t; - src += t; - src_len -= t; - if (src_len) { - t = min(*dst2_len, src_len); - memcpy(dst2, src, t); - *dst2_len = t; - } - return 0; - - case CEPH_CRYPTO_AES: - return ceph_aes_decrypt2(secret->key, secret->len, - dst1, dst1_len, dst2, dst2_len, - src, src_len); - - default: - return -EINVAL; - } -} - -int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, - const void *src, size_t src_len) -{ - switch (secret->type) { - case CEPH_CRYPTO_NONE: - if (*dst_len < src_len) - return -ERANGE; - memcpy(dst, src, src_len); - *dst_len = src_len; - return 0; - - case CEPH_CRYPTO_AES: - return ceph_aes_encrypt(secret->key, secret->len, dst, - dst_len, src, src_len); - - default: - return -EINVAL; - } -} - -int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, - const void *src1, size_t src1_len, - const void *src2, size_t src2_len) -{ - switch (secret->type) { - case CEPH_CRYPTO_NONE: - if (*dst_len < src1_len + src2_len) - return -ERANGE; - memcpy(dst, src1, src1_len); - memcpy(dst + src1_len, src2, src2_len); - *dst_len = src1_len + src2_len; - return 0; - - case CEPH_CRYPTO_AES: - return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len, - src1, src1_len, src2, src2_len); - - default: - return -EINVAL; - } -} diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h deleted file mode 100644 index 40b502e6bd8..00000000000 --- a/fs/ceph/crypto.h +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef _FS_CEPH_CRYPTO_H -#define _FS_CEPH_CRYPTO_H - -#include "types.h" -#include "buffer.h" - -/* - * cryptographic secret - */ -struct ceph_crypto_key { - int type; - struct ceph_timespec created; - int len; - void *key; -}; - -static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) -{ - kfree(key->key); -} - -extern int ceph_crypto_key_encode(struct ceph_crypto_key *key, - void **p, void *end); -extern int ceph_crypto_key_decode(struct ceph_crypto_key *key, - void **p, void *end); -extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in); - -/* crypto.c */ -extern int ceph_decrypt(struct ceph_crypto_key *secret, - void *dst, size_t *dst_len, - const void *src, size_t src_len); -extern int ceph_encrypt(struct ceph_crypto_key *secret, - void *dst, size_t *dst_len, - const void *src, size_t src_len); -extern int ceph_decrypt2(struct ceph_crypto_key *secret, - void *dst1, size_t *dst1_len, - void *dst2, size_t *dst2_len, - const void *src, size_t src_len); -extern int ceph_encrypt2(struct ceph_crypto_key *secret, - void *dst, size_t *dst_len, - const void *src1, size_t src1_len, - const void *src2, size_t src2_len); - -/* armor.c */ -extern int ceph_armor(char *dst, const void *src, const void *end); -extern int ceph_unarmor(void *dst, const char *src, const char *end); - -#endif diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f2f5332ddbb..5a743ac141a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/device.h> #include <linux/slab.h> @@ -7,143 +7,49 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/ceph/libceph.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> + #include "super.h" -#include "mds_client.h" -#include "mon_client.h" -#include "auth.h" #ifdef CONFIG_DEBUG_FS -/* - * Implement /sys/kernel/debug/ceph fun - * - * /sys/kernel/debug/ceph/client* - an instance of the ceph client - * .../osdmap - current osdmap - * .../mdsmap - current mdsmap - * .../monmap - current monmap - * .../osdc - active osd requests - * .../mdsc - active mds requests - * .../monc - mon client state - * .../dentry_lru - dump contents of dentry lru - * .../caps - expose cap (reservation) stats - * .../bdi - symlink to ../../bdi/something - */ - -static struct dentry *ceph_debugfs_dir; - -static int monmap_show(struct seq_file *s, void *p) -{ - int i; - struct ceph_client *client = s->private; - - if (client->monc.monmap == NULL) - return 0; - - seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); - for (i = 0; i < client->monc.monmap->num_mon; i++) { - struct ceph_entity_inst *inst = - &client->monc.monmap->mon_inst[i]; - - seq_printf(s, "\t%s%lld\t%s\n", - ENTITY_NAME(inst->name), - pr_addr(&inst->addr.in_addr)); - } - return 0; -} +#include "mds_client.h" static int mdsmap_show(struct seq_file *s, void *p) { int i; - struct ceph_client *client = s->private; + struct ceph_fs_client *fsc = s->private; - if (client->mdsc.mdsmap == NULL) + if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) return 0; - seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); - seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); + seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); + seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); seq_printf(s, "session_timeout %d\n", - client->mdsc.mdsmap->m_session_timeout); + fsc->mdsc->mdsmap->m_session_timeout); seq_printf(s, "session_autoclose %d\n", - client->mdsc.mdsmap->m_session_autoclose); - for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { + fsc->mdsc->mdsmap->m_session_autoclose); + for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { struct ceph_entity_addr *addr = - &client->mdsc.mdsmap->m_info[i].addr; - int state = client->mdsc.mdsmap->m_info[i].state; + &fsc->mdsc->mdsmap->m_info[i].addr; + int state = fsc->mdsc->mdsmap->m_info[i].state; - seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), + seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, + ceph_pr_addr(&addr->in_addr), ceph_mds_state_name(state)); } return 0; } -static int osdmap_show(struct seq_file *s, void *p) -{ - int i; - struct ceph_client *client = s->private; - struct rb_node *n; - - if (client->osdc.osdmap == NULL) - return 0; - seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); - seq_printf(s, "flags%s%s\n", - (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? - " NEARFULL" : "", - (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? - " FULL" : ""); - for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { - struct ceph_pg_pool_info *pool = - rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", - pool->id, pool->v.pg_num, pool->pg_num_mask, - pool->v.lpg_num, pool->lpg_num_mask); - } - for (i = 0; i < client->osdc.osdmap->max_osd; i++) { - struct ceph_entity_addr *addr = - &client->osdc.osdmap->osd_addr[i]; - int state = client->osdc.osdmap->osd_state[i]; - char sb[64]; - - seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", - i, pr_addr(&addr->in_addr), - ((client->osdc.osdmap->osd_weight[i]*100) >> 16), - ceph_osdmap_state_str(sb, sizeof(sb), state)); - } - return 0; -} - -static int monc_show(struct seq_file *s, void *p) -{ - struct ceph_client *client = s->private; - struct ceph_mon_generic_request *req; - struct ceph_mon_client *monc = &client->monc; - struct rb_node *rp; - - mutex_lock(&monc->mutex); - - if (monc->have_mdsmap) - seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap); - if (monc->have_osdmap) - seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap); - if (monc->want_next_osdmap) - seq_printf(s, "want next osdmap\n"); - - for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { - __u16 op; - req = rb_entry(rp, struct ceph_mon_generic_request, node); - op = le16_to_cpu(req->request->hdr.type); - if (op == CEPH_MSG_STATFS) - seq_printf(s, "%lld statfs\n", req->tid); - else - seq_printf(s, "%lld unknown\n", req->tid); - } - - mutex_unlock(&monc->mutex); - return 0; -} - +/* + * mdsc debugfs + */ static int mdsc_show(struct seq_file *s, void *p) { - struct ceph_client *client = s->private; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; int pathlen; @@ -154,23 +60,28 @@ static int mdsc_show(struct seq_file *s, void *p) for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { req = rb_entry(rp, struct ceph_mds_request, r_node); - if (req->r_request) - seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); - else + if (req->r_request && req->r_session) + seq_printf(s, "%lld\tmds%d\t", req->r_tid, + req->r_session->s_mds); + else if (!req->r_request) seq_printf(s, "%lld\t(no request)\t", req->r_tid); + else + seq_printf(s, "%lld\t(no session)\t", req->r_tid); seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); if (req->r_got_unsafe) - seq_printf(s, "\t(unsafe)"); + seq_puts(s, "\t(unsafe)"); else - seq_printf(s, "\t"); + seq_puts(s, "\t"); if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); } else if (req->r_dentry) { path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) + path = NULL; spin_lock(&req->r_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", ceph_ino(req->r_dentry->d_parent->d_inode), @@ -182,14 +93,19 @@ static int mdsc_show(struct seq_file *s, void *p) } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); + } else { + seq_printf(s, " #%llx", req->r_ino1.ino); } if (req->r_old_dentry) { path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) + path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", - ceph_ino(req->r_old_dentry->d_parent->d_inode), + req->r_old_dentry_dir ? + ceph_ino(req->r_old_dentry_dir) : 0, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, path ? path : ""); @@ -203,68 +119,19 @@ static int mdsc_show(struct seq_file *s, void *p) seq_printf(s, " %s", req->r_path2); } - seq_printf(s, "\n"); + seq_puts(s, "\n"); } mutex_unlock(&mdsc->mutex); return 0; } -static int osdc_show(struct seq_file *s, void *pp) -{ - struct ceph_client *client = s->private; - struct ceph_osd_client *osdc = &client->osdc; - struct rb_node *p; - - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - struct ceph_osd_request *req; - struct ceph_osd_request_head *head; - struct ceph_osd_op *op; - int num_ops; - int opcode, olen; - int i; - - req = rb_entry(p, struct ceph_osd_request, r_node); - - seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1, - le32_to_cpu(req->r_pgid.pool), - le16_to_cpu(req->r_pgid.ps)); - - head = req->r_request->front.iov_base; - op = (void *)(head + 1); - - num_ops = le16_to_cpu(head->num_ops); - olen = le32_to_cpu(head->object_len); - seq_printf(s, "%.*s", olen, - (const char *)(head->ops + num_ops)); - - if (req->r_reassert_version.epoch) - seq_printf(s, "\t%u'%llu", - (unsigned)le32_to_cpu(req->r_reassert_version.epoch), - le64_to_cpu(req->r_reassert_version.version)); - else - seq_printf(s, "\t"); - - for (i = 0; i < num_ops; i++) { - opcode = le16_to_cpu(op->op); - seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); - op++; - } - - seq_printf(s, "\n"); - } - mutex_unlock(&osdc->request_mutex); - return 0; -} - static int caps_show(struct seq_file *s, void *p) { - struct ceph_client *client = s->private; + struct ceph_fs_client *fsc = s->private; int total, avail, used, reserved, min; - ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); + ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" "avail\t\t%d\n" "used\t\t%d\n" @@ -276,8 +143,8 @@ static int caps_show(struct seq_file *s, void *p) static int dentry_lru_show(struct seq_file *s, void *ptr) { - struct ceph_client *client = s->private; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_dentry_info *di; spin_lock(&mdsc->dentry_lru_lock); @@ -291,199 +158,120 @@ static int dentry_lru_show(struct seq_file *s, void *ptr) return 0; } -#define DEFINE_SHOW_FUNC(name) \ -static int name##_open(struct inode *inode, struct file *file) \ -{ \ - struct seq_file *sf; \ - int ret; \ - \ - ret = single_open(file, name, NULL); \ - sf = file->private_data; \ - sf->private = inode->i_private; \ - return ret; \ -} \ - \ -static const struct file_operations name##_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -DEFINE_SHOW_FUNC(monmap_show) -DEFINE_SHOW_FUNC(mdsmap_show) -DEFINE_SHOW_FUNC(osdmap_show) -DEFINE_SHOW_FUNC(monc_show) -DEFINE_SHOW_FUNC(mdsc_show) -DEFINE_SHOW_FUNC(osdc_show) -DEFINE_SHOW_FUNC(dentry_lru_show) -DEFINE_SHOW_FUNC(caps_show) +CEPH_DEFINE_SHOW_FUNC(mdsmap_show) +CEPH_DEFINE_SHOW_FUNC(mdsc_show) +CEPH_DEFINE_SHOW_FUNC(caps_show) +CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) + +/* + * debugfs + */ static int congestion_kb_set(void *data, u64 val) { - struct ceph_client *client = (struct ceph_client *)data; - - if (client) - client->mount_args->congestion_kb = (int)val; + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + fsc->mount_options->congestion_kb = (int)val; return 0; } static int congestion_kb_get(void *data, u64 *val) { - struct ceph_client *client = (struct ceph_client *)data; - - if (client) - *val = (u64)client->mount_args->congestion_kb; + struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; + *val = (u64)fsc->mount_options->congestion_kb; return 0; } - DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, congestion_kb_set, "%llu\n"); -int __init ceph_debugfs_init(void) -{ - ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); - if (!ceph_debugfs_dir) - return -ENOMEM; - return 0; -} -void ceph_debugfs_cleanup(void) +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { - debugfs_remove(ceph_debugfs_dir); + dout("ceph_fs_debugfs_cleanup\n"); + debugfs_remove(fsc->debugfs_bdi); + debugfs_remove(fsc->debugfs_congestion_kb); + debugfs_remove(fsc->debugfs_mdsmap); + debugfs_remove(fsc->debugfs_caps); + debugfs_remove(fsc->debugfs_mdsc); + debugfs_remove(fsc->debugfs_dentry_lru); } -int ceph_debugfs_client_init(struct ceph_client *client) +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - int ret = 0; - char name[80]; - - snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", - PR_FSID(&client->fsid), client->monc.auth->global_id); - - client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); - if (!client->debugfs_dir) - goto out; - - client->monc.debugfs_file = debugfs_create_file("monc", - 0600, - client->debugfs_dir, - client, - &monc_show_fops); - if (!client->monc.debugfs_file) - goto out; - - client->mdsc.debugfs_file = debugfs_create_file("mdsc", - 0600, - client->debugfs_dir, - client, - &mdsc_show_fops); - if (!client->mdsc.debugfs_file) + char name[100]; + int err = -ENOMEM; + + dout("ceph_fs_debugfs_init\n"); + BUG_ON(!fsc->client->debugfs_dir); + fsc->debugfs_congestion_kb = + debugfs_create_file("writeback_congestion_kb", + 0600, + fsc->client->debugfs_dir, + fsc, + &congestion_kb_fops); + if (!fsc->debugfs_congestion_kb) goto out; - client->osdc.debugfs_file = debugfs_create_file("osdc", - 0600, - client->debugfs_dir, - client, - &osdc_show_fops); - if (!client->osdc.debugfs_file) + snprintf(name, sizeof(name), "../../bdi/%s", + dev_name(fsc->backing_dev_info.dev)); + fsc->debugfs_bdi = + debugfs_create_symlink("bdi", + fsc->client->debugfs_dir, + name); + if (!fsc->debugfs_bdi) goto out; - client->debugfs_monmap = debugfs_create_file("monmap", + fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 0600, - client->debugfs_dir, - client, - &monmap_show_fops); - if (!client->debugfs_monmap) - goto out; - - client->debugfs_mdsmap = debugfs_create_file("mdsmap", - 0600, - client->debugfs_dir, - client, + fsc->client->debugfs_dir, + fsc, &mdsmap_show_fops); - if (!client->debugfs_mdsmap) - goto out; - - client->debugfs_osdmap = debugfs_create_file("osdmap", - 0600, - client->debugfs_dir, - client, - &osdmap_show_fops); - if (!client->debugfs_osdmap) + if (!fsc->debugfs_mdsmap) goto out; - client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", - 0600, - client->debugfs_dir, - client, - &dentry_lru_show_fops); - if (!client->debugfs_dentry_lru) + fsc->debugfs_mdsc = debugfs_create_file("mdsc", + 0600, + fsc->client->debugfs_dir, + fsc, + &mdsc_show_fops); + if (!fsc->debugfs_mdsc) goto out; - client->debugfs_caps = debugfs_create_file("caps", + fsc->debugfs_caps = debugfs_create_file("caps", 0400, - client->debugfs_dir, - client, + fsc->client->debugfs_dir, + fsc, &caps_show_fops); - if (!client->debugfs_caps) + if (!fsc->debugfs_caps) goto out; - client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", - 0600, - client->debugfs_dir, - client, - &congestion_kb_fops); - if (!client->debugfs_congestion_kb) + fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", + 0600, + fsc->client->debugfs_dir, + fsc, + &dentry_lru_show_fops); + if (!fsc->debugfs_dentry_lru) goto out; - sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev)); - client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir, - name); - return 0; out: - ceph_debugfs_client_cleanup(client); - return ret; + ceph_fs_debugfs_cleanup(fsc); + return err; } -void ceph_debugfs_client_cleanup(struct ceph_client *client) -{ - debugfs_remove(client->debugfs_bdi); - debugfs_remove(client->debugfs_caps); - debugfs_remove(client->debugfs_dentry_lru); - debugfs_remove(client->debugfs_osdmap); - debugfs_remove(client->debugfs_mdsmap); - debugfs_remove(client->debugfs_monmap); - debugfs_remove(client->osdc.debugfs_file); - debugfs_remove(client->mdsc.debugfs_file); - debugfs_remove(client->monc.debugfs_file); - debugfs_remove(client->debugfs_congestion_kb); - debugfs_remove(client->debugfs_dir); -} - -#else // CONFIG_DEBUG_FS - -int __init ceph_debugfs_init(void) -{ - return 0; -} -void ceph_debugfs_cleanup(void) -{ -} +#else /* CONFIG_DEBUG_FS */ -int ceph_debugfs_client_init(struct ceph_client *client) +int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { return 0; } -void ceph_debugfs_client_cleanup(struct ceph_client *client) +void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { } -#endif // CONFIG_DEBUG_FS +#endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h deleted file mode 100644 index 65b3e022eaf..00000000000 --- a/fs/ceph/decode.h +++ /dev/null @@ -1,194 +0,0 @@ -#ifndef __CEPH_DECODE_H -#define __CEPH_DECODE_H - -#include <asm/unaligned.h> -#include <linux/time.h> - -#include "types.h" - -/* - * in all cases, - * void **p pointer to position pointer - * void *end pointer to end of buffer (last byte + 1) - */ - -static inline u64 ceph_decode_64(void **p) -{ - u64 v = get_unaligned_le64(*p); - *p += sizeof(u64); - return v; -} -static inline u32 ceph_decode_32(void **p) -{ - u32 v = get_unaligned_le32(*p); - *p += sizeof(u32); - return v; -} -static inline u16 ceph_decode_16(void **p) -{ - u16 v = get_unaligned_le16(*p); - *p += sizeof(u16); - return v; -} -static inline u8 ceph_decode_8(void **p) -{ - u8 v = *(u8 *)*p; - (*p)++; - return v; -} -static inline void ceph_decode_copy(void **p, void *pv, size_t n) -{ - memcpy(pv, *p, n); - *p += n; -} - -/* - * bounds check input. - */ -#define ceph_decode_need(p, end, n, bad) \ - do { \ - if (unlikely(*(p) + (n) > (end))) \ - goto bad; \ - } while (0) - -#define ceph_decode_64_safe(p, end, v, bad) \ - do { \ - ceph_decode_need(p, end, sizeof(u64), bad); \ - v = ceph_decode_64(p); \ - } while (0) -#define ceph_decode_32_safe(p, end, v, bad) \ - do { \ - ceph_decode_need(p, end, sizeof(u32), bad); \ - v = ceph_decode_32(p); \ - } while (0) -#define ceph_decode_16_safe(p, end, v, bad) \ - do { \ - ceph_decode_need(p, end, sizeof(u16), bad); \ - v = ceph_decode_16(p); \ - } while (0) -#define ceph_decode_8_safe(p, end, v, bad) \ - do { \ - ceph_decode_need(p, end, sizeof(u8), bad); \ - v = ceph_decode_8(p); \ - } while (0) - -#define ceph_decode_copy_safe(p, end, pv, n, bad) \ - do { \ - ceph_decode_need(p, end, n, bad); \ - ceph_decode_copy(p, pv, n); \ - } while (0) - -/* - * struct ceph_timespec <-> struct timespec - */ -static inline void ceph_decode_timespec(struct timespec *ts, - const struct ceph_timespec *tv) -{ - ts->tv_sec = le32_to_cpu(tv->tv_sec); - ts->tv_nsec = le32_to_cpu(tv->tv_nsec); -} -static inline void ceph_encode_timespec(struct ceph_timespec *tv, - const struct timespec *ts) -{ - tv->tv_sec = cpu_to_le32(ts->tv_sec); - tv->tv_nsec = cpu_to_le32(ts->tv_nsec); -} - -/* - * sockaddr_storage <-> ceph_sockaddr - */ -static inline void ceph_encode_addr(struct ceph_entity_addr *a) -{ - a->in_addr.ss_family = htons(a->in_addr.ss_family); -} -static inline void ceph_decode_addr(struct ceph_entity_addr *a) -{ - a->in_addr.ss_family = ntohs(a->in_addr.ss_family); - WARN_ON(a->in_addr.ss_family == 512); -} - -/* - * encoders - */ -static inline void ceph_encode_64(void **p, u64 v) -{ - put_unaligned_le64(v, (__le64 *)*p); - *p += sizeof(u64); -} -static inline void ceph_encode_32(void **p, u32 v) -{ - put_unaligned_le32(v, (__le32 *)*p); - *p += sizeof(u32); -} -static inline void ceph_encode_16(void **p, u16 v) -{ - put_unaligned_le16(v, (__le16 *)*p); - *p += sizeof(u16); -} -static inline void ceph_encode_8(void **p, u8 v) -{ - *(u8 *)*p = v; - (*p)++; -} -static inline void ceph_encode_copy(void **p, const void *s, int len) -{ - memcpy(*p, s, len); - *p += len; -} - -/* - * filepath, string encoders - */ -static inline void ceph_encode_filepath(void **p, void *end, - u64 ino, const char *path) -{ - u32 len = path ? strlen(path) : 0; - BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end); - ceph_encode_8(p, 1); - ceph_encode_64(p, ino); - ceph_encode_32(p, len); - if (len) - memcpy(*p, path, len); - *p += len; -} - -static inline void ceph_encode_string(void **p, void *end, - const char *s, u32 len) -{ - BUG_ON(*p + sizeof(len) + len > end); - ceph_encode_32(p, len); - if (len) - memcpy(*p, s, len); - *p += len; -} - -#define ceph_encode_need(p, end, n, bad) \ - do { \ - if (unlikely(*(p) + (n) > (end))) \ - goto bad; \ - } while (0) - -#define ceph_encode_64_safe(p, end, v, bad) \ - do { \ - ceph_encode_need(p, end, sizeof(u64), bad); \ - ceph_encode_64(p, v); \ - } while (0) -#define ceph_encode_32_safe(p, end, v, bad) \ - do { \ - ceph_encode_need(p, end, sizeof(u32), bad); \ - ceph_encode_32(p, v); \ - } while (0) -#define ceph_encode_16_safe(p, end, v, bad) \ - do { \ - ceph_encode_need(p, end, sizeof(u16), bad); \ - ceph_encode_16(p, v); \ - } while (0) - -#define ceph_encode_copy_safe(p, end, pv, n, bad) \ - do { \ - ceph_encode_need(p, end, n, bad); \ - ceph_encode_copy(p, pv, n); \ - } while (0) - - -#endif diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f85719310db..c29d6ae6887 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/spinlock.h> #include <linux/fs_struct.h> @@ -7,6 +7,7 @@ #include <linux/sched.h> #include "super.h" +#include "mds_client.h" /* * Directory operations: readdir, lookup, create, link, unlink, @@ -27,7 +28,7 @@ const struct inode_operations ceph_dir_iops; const struct file_operations ceph_dir_fops; -struct dentry_operations ceph_dentry_ops; +const struct dentry_operations ceph_dentry_ops; /* * Initialize ceph dentry state. @@ -39,14 +40,7 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) - dentry->d_op = &ceph_dentry_ops; - else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) - dentry->d_op = &ceph_snapdir_dentry_ops; - else - dentry->d_op = &ceph_snap_dentry_ops; - - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ @@ -56,16 +50,41 @@ int ceph_init_dentry(struct dentry *dentry) kmem_cache_free(ceph_dentry_cachep, di); goto out_unlock; } + + if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + d_set_d_op(dentry, &ceph_dentry_ops); + else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + d_set_d_op(dentry, &ceph_snapdir_dentry_ops); + else + d_set_d_op(dentry, &ceph_snap_dentry_ops); + di->dentry = dentry; di->lease_session = NULL; - dentry->d_fsdata = di; dentry->d_time = jiffies; + /* avoid reordering d_fsdata setup so that the check above is safe */ + smp_mb(); + dentry->d_fsdata = di; ceph_dentry_lru_add(dentry); out_unlock: spin_unlock(&dentry->d_lock); return 0; } +struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) +{ + struct inode *inode = NULL; + + if (!dentry) + return NULL; + + spin_lock(&dentry->d_lock); + if (!IS_ROOT(dentry)) { + inode = dentry->d_parent->d_inode; + ihold(inode); + } + spin_unlock(&dentry->d_lock); + return inode; +} /* @@ -81,6 +100,14 @@ static unsigned fpos_off(loff_t p) return p & 0xffffffff; } +static int fpos_cmp(loff_t l, loff_t r) +{ + int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r)); + if (v) + return v; + return (int)(fpos_off(l) - fpos_off(r)); +} + /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on @@ -88,16 +115,15 @@ static unsigned fpos_off(loff_t p) * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * - * I_COMPLETE tells indicates we have all dentries in the dir. It is + * Complete dir indicates that we have all dentries in the dir. It is * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by * the MDS if/when the directory is modified). */ -static int __dcache_readdir(struct file *filp, - void *dirent, filldir_t filldir) +static int __dcache_readdir(struct file *file, struct dir_context *ctx, + u32 shared_gen) { - struct inode *inode = filp->f_dentry->d_inode; - struct ceph_file_info *fi = filp->private_data; - struct dentry *parent = filp->f_dentry; + struct ceph_file_info *fi = file->private_data; + struct dentry *parent = file->f_dentry; struct inode *dir = parent->d_inode; struct list_head *p; struct dentry *dentry, *last; @@ -108,14 +134,14 @@ static int __dcache_readdir(struct file *filp, last = fi->dentry; fi->dentry = NULL; - dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos, - last); + dout("__dcache_readdir %p v%u at %llu (last %p)\n", + dir, shared_gen, ctx->pos, last); - spin_lock(&dcache_lock); + spin_lock(&parent->d_lock); /* start at beginning? */ - if (filp->f_pos == 2 || (last && - filp->f_pos < ceph_dentry(last)->offset)) { + if (ctx->pos == 2 || last == NULL || + fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; @@ -132,72 +158,68 @@ more: d_unhashed(dentry) ? "!hashed" : "hashed", parent->d_subdirs.prev, parent->d_subdirs.next); if (p == &parent->d_subdirs) { - fi->at_end = 1; + fi->flags |= CEPH_F_ATEND; goto out_unlock; } - if (!d_unhashed(dentry) && dentry->d_inode && + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + if (di->lease_shared_gen == shared_gen && + !d_unhashed(dentry) && dentry->d_inode && ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && - filp->f_pos <= di->offset) + fpos_cmp(ctx->pos, di->offset) <= 0) break; dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, dentry->d_name.len, dentry->d_name.name, di->offset, - filp->f_pos, d_unhashed(dentry) ? " unhashed" : "", + ctx->pos, d_unhashed(dentry) ? " unhashed" : "", !dentry->d_inode ? " null" : ""); + spin_unlock(&dentry->d_lock); p = p->prev; dentry = list_entry(p, struct dentry, d_u.d_child); di = ceph_dentry(dentry); } - atomic_inc(&dentry->d_count); - spin_unlock(&dcache_lock); - spin_unlock(&inode->i_lock); + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&parent->d_lock); - dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, - dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - filp->f_pos = di->offset; - err = filldir(dirent, dentry->d_name.name, - dentry->d_name.len, di->offset, - dentry->d_inode->i_ino, - dentry->d_inode->i_mode >> 12); + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); + dput(dentry); + err = -EAGAIN; + goto out; + } - if (last) { - if (err < 0) { + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, + dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, + ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), + dentry->d_inode->i_mode >> 12)) { + if (last) { /* remember our position */ fi->dentry = last; - fi->next_offset = di->offset; - } else { - dput(last); + fi->next_offset = fpos_off(di->offset); } - last = NULL; + dput(dentry); + return 0; } - spin_lock(&inode->i_lock); - spin_lock(&dcache_lock); + ctx->pos = di->offset + 1; + if (last) + dput(last); last = dentry; - if (err < 0) - goto out_unlock; - - p = p->prev; - filp->f_pos++; - - /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ - if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) - goto more; - dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); - err = -EAGAIN; + spin_lock(&parent->d_lock); + p = p->prev; /* advance to next dentry */ + goto more; out_unlock: - spin_unlock(&dcache_lock); - - if (last) { - spin_unlock(&inode->i_lock); + spin_unlock(&parent->d_lock); +out: + if (last) dput(last); - spin_lock(&inode->i_lock); - } - return err; } @@ -220,61 +242,65 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, return 0; } -static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int ceph_readdir(struct file *file, struct dir_context *ctx) { - struct ceph_file_info *fi = filp->private_data; - struct inode *inode = filp->f_dentry->d_inode; + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = &client->mdsc; - unsigned frag = fpos_frag(filp->f_pos); - int off = fpos_off(filp->f_pos); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + unsigned frag = fpos_frag(ctx->pos); + int off = fpos_off(ctx->pos); int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - const int max_entries = client->mount_args->max_readdir; - const int max_bytes = client->mount_args->max_readdir_bytes; - dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); - if (fi->at_end) + dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); + if (fi->flags & CEPH_F_ATEND) return 0; /* always start with . and .. */ - if (filp->f_pos == 0) { + if (ctx->pos == 0) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - fi->dir_release_count = ci->i_release_count; + fi->dir_release_count = atomic_read(&ci->i_release_count); dout("readdir off 0 -> '.'\n"); - if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), - inode->i_ino, inode->i_mode >> 12) < 0) + if (!dir_emit(ctx, ".", 1, + ceph_translate_ino(inode->i_sb, inode->i_ino), + inode->i_mode >> 12)) return 0; - filp->f_pos = 1; + ctx->pos = 1; off = 1; } - if (filp->f_pos == 1) { + if (ctx->pos == 1) { + ino_t ino = parent_ino(file->f_dentry); dout("readdir off 1 -> '..'\n"); - if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), - filp->f_dentry->d_parent->d_inode->i_ino, - inode->i_mode >> 12) < 0) + if (!dir_emit(ctx, "..", 2, + ceph_translate_ino(inode->i_sb, ino), + inode->i_mode >> 12)) return 0; - filp->f_pos = 2; + ctx->pos = 2; off = 2; } /* can we use the dcache? */ - spin_lock(&inode->i_lock); - if ((filp->f_pos == 2 || fi->dentry) && - !ceph_test_opt(client, NOASYNCREADDIR) && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + spin_lock(&ci->i_ceph_lock); + if ((ctx->pos == 2 || fi->dentry) && + !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && + ceph_snap(inode) != CEPH_SNAPDIR && + __ceph_dir_is_complete(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { - err = __dcache_readdir(filp, dirent, filldir); - if (err != -EAGAIN) { - spin_unlock(&inode->i_lock); + u32 shared_gen = ci->i_shared_gen; + spin_unlock(&ci->i_ceph_lock); + err = __dcache_readdir(file, ctx, shared_gen); + if (err != -EAGAIN) return err; - } + frag = fpos_frag(ctx->pos); + off = fpos_off(ctx->pos); + } else { + spin_unlock(&ci->i_ceph_lock); } - spin_unlock(&inode->i_lock); if (fi->dentry) { err = note_last_dentry(fi, fi->dentry->d_name.name, fi->dentry->d_name.len); @@ -299,16 +325,19 @@ more: fi->last_readdir = NULL; } - /* requery frag tree, as the frag topology may have changed */ - frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL); - dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); - req->r_dentry = dget(filp->f_dentry); + err = ceph_alloc_readdir_reply_buffer(req, inode); + if (err) { + ceph_mdsc_put_request(req); + return err; + } + req->r_inode = inode; + ihold(inode); + req->r_dentry = dget(file->f_dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); @@ -316,9 +345,6 @@ more: req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); - req->r_args.readdir.max_entries = cpu_to_le32(max_entries); - req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); - req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); @@ -331,19 +357,32 @@ more: if (!req->r_did_prepopulate) { dout("readdir !did_prepopulate"); - fi->dir_release_count--; /* preclude I_COMPLETE */ + /* preclude from marking dir complete */ + fi->dir_release_count--; } /* note next offset and last dentry name */ + rinfo = &req->r_reply_info; + if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; + off = fi->next_offset; + } + fi->frag = frag; fi->offset = fi->next_offset; fi->last_readdir = req; if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 2; + if (ceph_frag_is_rightmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; } else { - rinfo = &req->r_reply_info; err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], rinfo->dir_dname_len[rinfo->dir_nr-1]); @@ -356,27 +395,32 @@ more: rinfo = &fi->last_readdir->r_reply_info; dout("readdir frag %x num %d off %d chunkoff %d\n", frag, rinfo->dir_nr, off, fi->offset); - while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) { - u64 pos = ceph_make_fpos(frag, off); + + ctx->pos = ceph_make_fpos(frag, off); + while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; + struct ceph_vino vino; + ino_t ino; + dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, pos, + off, off - fi->offset, rinfo->dir_nr, ctx->pos, rinfo->dir_dname_len[off - fi->offset], rinfo->dir_dname[off - fi->offset], in); BUG_ON(!in); ftype = le32_to_cpu(in->mode) >> 12; - if (filldir(dirent, + vino.ino = le64_to_cpu(in->ino); + vino.snap = le64_to_cpu(in->snapid); + ino = ceph_vino_to_ino(vino); + if (!dir_emit(ctx, rinfo->dir_dname[off - fi->offset], rinfo->dir_dname_len[off - fi->offset], - pos, - le64_to_cpu(in->ino), - ftype) < 0) { + ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } off++; - filp->f_pos = pos + 1; + ctx->pos++; } if (fi->last_name) { @@ -389,65 +433,73 @@ more: if (!ceph_frag_is_rightmost(frag)) { frag = ceph_frag_next(frag); off = 0; - filp->f_pos = ceph_make_fpos(frag, off); + ctx->pos = ceph_make_fpos(frag, off); dout("readdir next frag is %x\n", frag); goto more; } - fi->at_end = 1; + fi->flags |= CEPH_F_ATEND; /* * if dir_release_count still matches the dir, no dentries * were released during the whole readdir, and we should have * the complete dir contents in our cache. */ - spin_lock(&inode->i_lock); - if (ci->i_release_count == fi->dir_release_count) { + spin_lock(&ci->i_ceph_lock); + if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { dout(" marking %p complete\n", inode); - ci->i_ceph_flags |= CEPH_I_COMPLETE; - ci->i_max_offset = filp->f_pos; + __ceph_dir_set_complete(ci, fi->dir_release_count); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); - dout("readdir %p filp %p done.\n", inode, filp); + dout("readdir %p file %p done.\n", inode, file); return 0; } -static void reset_readdir(struct ceph_file_info *fi) +static void reset_readdir(struct ceph_file_info *fi, unsigned frag) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); fi->last_readdir = NULL; } kfree(fi->last_name); - fi->next_offset = 2; /* compensate for . and .. */ + fi->last_name = NULL; + if (ceph_frag_is_leftmost(frag)) + fi->next_offset = 2; /* compensate for . and .. */ + else + fi->next_offset = 0; if (fi->dentry) { dput(fi->dentry); fi->dentry = NULL; } - fi->at_end = 0; + fi->flags &= ~CEPH_F_ATEND; } -static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) +static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = offset; + loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; mutex_lock(&inode->i_mutex); - switch (origin) { + retval = -EINVAL; + switch (whence) { case SEEK_END: offset += inode->i_size + 2; /* FIXME */ break; case SEEK_CUR: offset += file->f_pos; + case SEEK_SET: + break; + default: + goto out; } - retval = -EINVAL; - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + + if (offset >= 0) { if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; - fi->at_end = 0; + fi->flags &= ~CEPH_F_ATEND; } retval = offset; @@ -456,42 +508,35 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) * seek to new frag, or seek prior to current chunk. */ if (offset == 0 || - fpos_frag(offset) != fpos_frag(old_offset) || + fpos_frag(offset) != fi->frag || fpos_off(offset) < fi->offset) { dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi); + reset_readdir(fi, fpos_frag(offset)); } /* bump dir_release_count if we did a forward seek */ - if (offset > old_offset) + if (fpos_cmp(offset, old_offset) > 0) fi->dir_release_count--; } +out: mutex_unlock(&inode->i_mutex); return retval; } /* - * Process result of a lookup/open request. - * - * Mainly, make sure we return the final req->r_dentry (if it already - * existed) in place of the original VFS-provided dentry when they - * differ. - * - * Gracefully handle the case where the MDS replies with -ENOENT and - * no trace (which it may do, at its discretion, e.g., if it doesn't - * care to issue a lease on the negative dentry). + * Handle lookups for the hidden .snap directory. */ -struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, - struct dentry *dentry, int err) +int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = dentry->d_parent->d_inode; + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ /* .snap dir? */ if (err == -ENOENT && - ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */ + ceph_snap(parent) == CEPH_NOSNAP && strcmp(dentry->d_name.name, - client->mount_args->snapdir_name) == 0) { + fsc->mount_options->snapdir_name) == 0) { struct inode *inode = ceph_get_snapdir(parent); dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", dentry, dentry->d_name.len, dentry->d_name.name, inode); @@ -499,7 +544,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, d_add(dentry, inode); err = 0; } + return err; +} +/* + * Figure out final result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ if (err == -ENOENT) { /* no trace? */ err = 0; @@ -534,10 +595,10 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) * the MDS so that it gets our 'caps wanted' value in a single op. */ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int op; int err; @@ -552,35 +613,26 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, if (err < 0) return ERR_PTR(err); - /* open (but not create!) intent? */ - if (nd && - (nd->flags & LOOKUP_OPEN) && - (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */ - !(nd->intent.open.flags & O_CREAT)) { - int mode = nd->intent.open.create_mode & ~current->fs->umask; - return ceph_lookup_open(dir, dentry, nd, mode, 1); - } - /* can we conclude ENOENT locally? */ if (dentry->d_inode == NULL) { struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); - spin_lock(&dir->i_lock); + spin_lock(&ci->i_ceph_lock); dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); if (strncmp(dentry->d_name.name, - client->mount_args->snapdir_name, + fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && - (ci->i_ceph_flags & CEPH_I_COMPLETE) && + __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { - spin_unlock(&dir->i_lock); + spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); d_add(dentry, NULL); di->lease_shared_gen = ci->i_shared_gen; return NULL; } - spin_unlock(&dir->i_lock); + spin_unlock(&ci->i_ceph_lock); } op = ceph_snap(dir) == CEPH_SNAPDIR ? @@ -594,6 +646,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_locked_dir = dir; err = ceph_mdsc_do_request(mdsc, NULL, req); + err = ceph_handle_snapdir(req, dentry, err); dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ dout("lookup result=%p\n", dentry); @@ -606,7 +659,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, */ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) { - struct dentry *result = ceph_lookup(dir, dentry, NULL); + struct dentry *result = ceph_lookup(dir, dentry, 0); if (result && !IS_ERR(result)) { /* @@ -624,17 +677,17 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) } static int ceph_mknod(struct inode *dir, struct dentry *dentry, - int mode, dev_t rdev) + umode_t mode, dev_t rdev) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - dout("mknod in dir %p dentry %p mode 0%o rdev %d\n", + dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", dir, dentry, mode, rdev); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); if (IS_ERR(req)) { @@ -652,38 +705,25 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); - if (err) + + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } -static int ceph_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) +static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) { - dout("create in dir %p dentry %p name '%.*s'\n", - dir, dentry, dentry->d_name.len, dentry->d_name.name); - - if (ceph_snap(dir) != CEPH_NOSNAP) - return -EROFS; - - if (nd) { - BUG_ON((nd->flags & LOOKUP_OPEN) == 0); - dentry = ceph_lookup_open(dir, dentry, nd, mode, 0); - /* hrm, what should i do here if we get aliased? */ - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - return 0; - } - - /* fall back to mknod */ - return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0); + return ceph_mknod(dir, dentry, mode, 0); } static int ceph_symlink(struct inode *dir, struct dentry *dentry, const char *dest) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -706,15 +746,17 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); - if (err) + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } -static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err = -EROFS; int op; @@ -725,7 +767,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) dout("mksnap dir %p snap '%.*s' dn %p\n", dir, dentry->d_name.len, dentry->d_name.name, dentry); } else if (ceph_snap(dir) == CEPH_NOSNAP) { - dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode); + dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { goto out; @@ -747,7 +789,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); out: - if (err < 0) + if (!err) + ceph_init_acl(dentry, dentry->d_inode, dir); + else d_drop(dentry); return err; } @@ -755,8 +799,8 @@ out: static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -772,15 +816,19 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, } req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */ + req->r_old_dentry = dget(old_dentry); req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + /* release LINK_SHARED on source inode (mds will lock it) */ + req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; err = ceph_mdsc_do_request(mdsc, dir, req); - if (err) + if (err) { d_drop(dentry); - else if (!req->r_reply_info.head->is_dentry) - d_instantiate(dentry, igrab(old_dentry->d_inode)); + } else if (!req->r_reply_info.head->is_dentry) { + ihold(old_dentry->d_inode); + d_instantiate(dentry, old_dentry->d_inode); + } ceph_mdsc_put_request(req); return err; } @@ -796,12 +844,12 @@ static int drop_caps_for_unlink(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (inode->i_nlink == 1) { drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); ci->i_ceph_flags |= CEPH_I_NODELAY; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return drop; } @@ -810,8 +858,8 @@ static int drop_caps_for_unlink(struct inode *inode) */ static int ceph_unlink(struct inode *dir, struct dentry *dentry) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; struct ceph_mds_request *req; int err = -EROFS; @@ -825,7 +873,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) } else if (ceph_snap(dir) == CEPH_NOSNAP) { dout("unlink/rmdir dir %p dn %p inode %p\n", dir, dentry, inode); - op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ? + op = S_ISDIR(dentry->d_inode->i_mode) ? CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; } else goto out; @@ -851,8 +899,8 @@ out: static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -866,9 +914,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + ihold(old_dir); req->r_dentry = dget(new_dentry); req->r_num_caps = 2; req->r_old_dentry = dget(old_dentry); + req->r_old_dentry_dir = old_dir; req->r_locked_dir = new_dir; req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; @@ -886,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ - /* d_move screws up d_subdirs order */ - ceph_i_clear(new_dir, CEPH_I_COMPLETE); - d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + } ceph_mdsc_put_request(req); return err; @@ -927,12 +979,12 @@ static int dentry_lease_is_valid(struct dentry *dentry) spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); - if (di && di->lease_session) { + if (di->lease_session) { s = di->lease_session; - spin_lock(&s->s_cap_lock); + spin_lock(&s->s_gen_ttl_lock); gen = s->s_cap_gen; ttl = s->s_cap_ttl; - spin_unlock(&s->s_cap_lock); + spin_unlock(&s->s_gen_ttl_lock); if (di->lease_gen == gen && time_before(jiffies, dentry->d_time) && @@ -969,10 +1021,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) struct ceph_dentry_info *di = ceph_dentry(dentry); int valid = 0; - spin_lock(&dir->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_shared_gen == di->lease_shared_gen) valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); - spin_unlock(&dir->i_lock); + spin_unlock(&ci->i_ceph_lock); dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", dir, (unsigned)ci->i_shared_gen, dentry, (unsigned)di->lease_shared_gen, valid); @@ -982,67 +1034,64 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) /* * Check if cached dentry can be trusted. */ -static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) +static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) { - struct inode *dir = dentry->d_parent->d_inode; + int valid = 0; + struct inode *dir; + + if (flags & LOOKUP_RCU) + return -ECHILD; dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode, ceph_dentry(dentry)->offset); + dir = ceph_get_dentry_parent_inode(dentry); + /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - goto out_touch; + valid = 1; + } else if (dentry->d_inode && + ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { + valid = 1; + } else if (dentry_lease_is_valid(dentry) || + dir_lease_is_valid(dir, dentry)) { + if (dentry->d_inode) + valid = ceph_is_any_caps(dentry->d_inode); + else + valid = 1; } - if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) - goto out_touch; - if (dentry_lease_is_valid(dentry) || - dir_lease_is_valid(dir, dentry)) - goto out_touch; - - dout("d_revalidate %p invalid\n", dentry); - d_drop(dentry); - return 0; -out_touch: - ceph_dentry_lru_touch(dentry); - return 1; + dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); + if (valid) { + ceph_dentry_lru_touch(dentry); + } else { + ceph_dir_clear_complete(dir); + d_drop(dentry); + } + iput(dir); + return valid; } /* - * When a dentry is released, clear the dir I_COMPLETE if it was part - * of the current dir gen. + * Release our ceph_dentry_info. */ -static void ceph_dentry_release(struct dentry *dentry) +static void ceph_d_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); - struct inode *parent_inode = dentry->d_parent->d_inode; - - if (parent_inode) { - struct ceph_inode_info *ci = ceph_inode(parent_inode); - spin_lock(&parent_inode->i_lock); - if (ci->i_shared_gen == di->lease_shared_gen) { - dout(" clearing %p complete (d_release)\n", - parent_inode); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - } - spin_unlock(&parent_inode->i_lock); - } - if (di) { - ceph_dentry_lru_del(dentry); - if (di->lease_session) - ceph_put_mds_session(di->lease_session); - kmem_cache_free(ceph_dentry_cachep, di); - dentry->d_fsdata = NULL; - } + dout("d_release %p\n", dentry); + ceph_dentry_lru_del(dentry); + if (di->lease_session) + ceph_put_mds_session(di->lease_session); + kmem_cache_free(ceph_dentry_cachep, di); + dentry->d_fsdata = NULL; } static int ceph_snapdir_d_revalidate(struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { /* * Eventually, we'll want to revalidate snapped metadata @@ -1051,7 +1100,30 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, return 1; } +/* + * When the VFS prunes a dentry from the cache, we need to clear the + * complete flag on the parent directory. + * + * Called under dentry->d_lock. + */ +static void ceph_d_prune(struct dentry *dentry) +{ + dout("ceph_d_prune %p\n", dentry); + + /* do we have a valid parent? */ + if (IS_ROOT(dentry)) + return; + /* if we are not hashed, we don't affect dir's completeness */ + if (d_unhashed(dentry)) + return; + + /* + * we hold d_lock, so d_parent is stable, and d_fsdata is never + * cleared until d_release + */ + ceph_dir_clear_complete(dentry->d_parent->d_inode); +} /* * read() on a dir. This weird interface hack only works if mounted @@ -1061,19 +1133,20 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct ceph_file_info *cf = file->private_data; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); int left; + const int bufsize = 1024; - if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) + if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { - cf->dir_info = kmalloc(1024, GFP_NOFS); + cf->dir_info = kmalloc(bufsize, GFP_NOFS); if (!cf->dir_info) return -ENOMEM; cf->dir_info_len = - sprintf(cf->dir_info, + snprintf(cf->dir_info, bufsize, "entries: %20lld\n" " files: %20lld\n" " subdirs: %20lld\n" @@ -1107,9 +1180,10 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, * an fsync() on a dir will wait for any uncommitted directory * operations to commit. */ -static int ceph_dir_fsync(struct file *file, int datasync) +static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct list_head *head = &ci->i_unsafe_dirops; struct ceph_mds_request *req; @@ -1117,6 +1191,11 @@ static int ceph_dir_fsync(struct file *file, int datasync) int ret = 0; dout("dir_fsync %p\n", inode); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + spin_lock(&ci->i_unsafe_lock); if (list_empty(head)) goto out; @@ -1128,6 +1207,7 @@ static int ceph_dir_fsync(struct file *file, int datasync) do { ceph_mdsc_get_request(req); spin_unlock(&ci->i_unsafe_lock); + dout("dir_fsync %p wait on tid %llu (until %llu)\n", inode, req->r_tid, last_tid); if (req->r_timeout) { @@ -1140,9 +1220,9 @@ static int ceph_dir_fsync(struct file *file, int datasync) } else { wait_for_completion(&req->r_safe_completion); } - spin_lock(&ci->i_unsafe_lock); ceph_mdsc_put_request(req); + spin_lock(&ci->i_unsafe_lock); if (ret || list_empty(head)) break; req = list_entry(head->next, @@ -1150,6 +1230,8 @@ static int ceph_dir_fsync(struct file *file, int datasync) } while (req->r_tid < last_tid); out: spin_unlock(&ci->i_unsafe_lock); + mutex_unlock(&inode->i_mutex); + return ret; } @@ -1165,13 +1247,11 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); - if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_add_tail(&di->lru, &mdsc->dentry_lru); - mdsc->num_dentry++; - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_add_tail(&di->lru, &mdsc->dentry_lru); + mdsc->num_dentry++; + spin_unlock(&mdsc->dentry_lru_lock); } void ceph_dentry_lru_touch(struct dentry *dn) @@ -1181,12 +1261,10 @@ void ceph_dentry_lru_touch(struct dentry *dn) dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, dn->d_name.len, dn->d_name.name, di->offset); - if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_move_tail(&di->lru, &mdsc->dentry_lru); - spin_unlock(&mdsc->dentry_lru_lock); - } + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_move_tail(&di->lru, &mdsc->dentry_lru); + spin_unlock(&mdsc->dentry_lru_lock); } void ceph_dentry_lru_del(struct dentry *dn) @@ -1196,18 +1274,35 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); - if (di) { - mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_del_init(&di->lru); - mdsc->num_dentry--; - spin_unlock(&mdsc->dentry_lru_lock); + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_lru_lock); + list_del_init(&di->lru); + mdsc->num_dentry--; + spin_unlock(&mdsc->dentry_lru_lock); +} + +/* + * Return name hash for a given dentry. This is dependent on + * the parent directory's hash function. + */ +unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) +{ + struct ceph_inode_info *dci = ceph_inode(dir); + + switch (dci->i_dir_layout.dl_dir_hash) { + case 0: /* for backward compat */ + case CEPH_STR_HASH_LINUX: + return dn->d_name.hash; + + default: + return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + dn->d_name.name, dn->d_name.len); } } const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, - .readdir = ceph_readdir, + .iterate = ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, @@ -1224,6 +1319,8 @@ const struct inode_operations ceph_dir_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, .mknod = ceph_mknod, .symlink = ceph_symlink, .mkdir = ceph_mkdir, @@ -1232,16 +1329,21 @@ const struct inode_operations ceph_dir_iops = { .rmdir = ceph_unlink, .rename = ceph_rename, .create = ceph_create, + .atomic_open = ceph_atomic_open, }; -struct dentry_operations ceph_dentry_ops = { +const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, - .d_release = ceph_dentry_release, + .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; -struct dentry_operations ceph_snapdir_dentry_ops = { +const struct dentry_operations ceph_snapdir_dentry_ops = { .d_revalidate = ceph_snapdir_d_revalidate, + .d_release = ceph_d_release, }; -struct dentry_operations ceph_snap_dentry_ops = { +const struct dentry_operations ceph_snap_dentry_ops = { + .d_release = ceph_d_release, + .d_prune = ceph_d_prune, }; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 4480cb1c63e..8d7d782f438 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -1,27 +1,11 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/exportfs.h> #include <linux/slab.h> #include <asm/unaligned.h> #include "super.h" - -/* - * NFS export support - * - * NFS re-export of a ceph mount is, at present, only semireliable. - * The basic issue is that the Ceph architectures doesn't lend itself - * well to generating filehandles that will remain valid forever. - * - * So, we do our best. If you're lucky, your inode will be in the - * client's cache. If it's not, and you have a connectable fh, then - * the MDS server may be able to find it for you. Otherwise, you get - * ESTALE. - * - * There are ways to this more reliable, but in the non-connectable fh - * case, we won't every work perfectly, and in the connectable case, - * some changes are needed on the MDS side to work better. - */ +#include "mds_client.h" /* * Basic fh @@ -31,194 +15,236 @@ struct ceph_nfs_fh { } __attribute__ ((packed)); /* - * Larger 'connectable' fh that includes parent ino and name hash. - * Use this whenever possible, as it works more reliably. + * Larger fh that includes parent ino. */ struct ceph_nfs_confh { u64 ino, parent_ino; - u32 parent_name_hash; } __attribute__ ((packed)); -static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, - int connectable) +static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, + struct inode *parent_inode) { + int type; struct ceph_nfs_fh *fh = (void *)rawfh; struct ceph_nfs_confh *cfh = (void *)rawfh; - struct dentry *parent = dentry->d_parent; - struct inode *inode = dentry->d_inode; - int type; + int connected_handle_length = sizeof(*cfh)/4; + int handle_length = sizeof(*fh)/4; /* don't re-export snaps */ if (ceph_snap(inode) != CEPH_NOSNAP) return -EINVAL; - if (*max_len >= sizeof(*cfh)) { - dout("encode_fh %p connectable\n", dentry); - cfh->ino = ceph_ino(dentry->d_inode); - cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = parent->d_name.hash; - *max_len = sizeof(*cfh); - type = 2; - } else if (*max_len > sizeof(*fh)) { - if (connectable) - return -ENOSPC; - dout("encode_fh %p\n", dentry); - fh->ino = ceph_ino(dentry->d_inode); - *max_len = sizeof(*fh); - type = 1; + if (parent_inode && (*max_len < connected_handle_length)) { + *max_len = connected_handle_length; + return FILEID_INVALID; + } else if (*max_len < handle_length) { + *max_len = handle_length; + return FILEID_INVALID; + } + + if (parent_inode) { + dout("encode_fh %llx with parent %llx\n", + ceph_ino(inode), ceph_ino(parent_inode)); + cfh->ino = ceph_ino(inode); + cfh->parent_ino = ceph_ino(parent_inode); + *max_len = connected_handle_length; + type = FILEID_INO32_GEN_PARENT; } else { - return -ENOSPC; + dout("encode_fh %llx\n", ceph_ino(inode)); + fh->ino = ceph_ino(inode); + *max_len = handle_length; + type = FILEID_INO32_GEN; } return type; } -/* - * convert regular fh to dentry - * - * FIXME: we should try harder by querying the mds for the ino. - */ -static struct dentry *__fh_to_dentry(struct super_block *sb, - struct ceph_nfs_fh *fh) +static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; int err; - dout("__fh_to_dentry %llx\n", fh->ino); - vino.ino = fh->ino; + vino.ino = ino; vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(-ESTALE); + if (!inode) { + struct ceph_mds_request *req; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); + + req->r_ino1 = vino; + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ESTALE); + } dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { - pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", - fh->ino, inode); iput(inode); return dentry; } err = ceph_init_dentry(dentry); - if (err < 0) { - iput(inode); + dput(dentry); return ERR_PTR(err); } - dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); + dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry); return dentry; } /* - * convert connectable fh to dentry + * convert regular fh to dentry */ -static struct dentry *__cfh_to_dentry(struct super_block *sb, - struct ceph_nfs_confh *cfh) +static struct dentry *ceph_fh_to_dentry(struct super_block *sb, + struct fid *fid, + int fh_len, int fh_type) { - struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; + struct ceph_nfs_fh *fh = (void *)fid->raw; + + if (fh_type != FILEID_INO32_GEN && + fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*fh) / 4) + return NULL; + + dout("fh_to_dentry %llx\n", fh->ino); + return __fh_to_dentry(sb, fh->ino); +} + +static struct dentry *__get_parent(struct super_block *sb, + struct dentry *child, u64 ino) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; + struct ceph_mds_request *req; struct inode *inode; struct dentry *dentry; - struct ceph_vino vino; int err; - dout("__cfh_to_dentry %llx (%llx/%x)\n", - cfh->ino, cfh->parent_ino, cfh->parent_name_hash); - - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) { - struct ceph_mds_request *req; - - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, - USE_ANY_MDS); - if (IS_ERR(req)) - return ERR_CAST(req); + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, + USE_ANY_MDS); + if (IS_ERR(req)) + return ERR_CAST(req); - req->r_ino1 = vino; - req->r_ino2.ino = cfh->parent_ino; - req->r_ino2.snap = CEPH_NOSNAP; - req->r_path2 = kmalloc(16, GFP_NOFS); - snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); - req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, NULL, req); - ceph_mdsc_put_request(req); - inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(err ? err : -ESTALE); + if (child) { + req->r_inode = child->d_inode; + ihold(child->d_inode); + } else { + req->r_ino1 = (struct ceph_vino) { + .ino = ino, + .snap = CEPH_NOSNAP, + }; } + req->r_num_caps = 1; + err = ceph_mdsc_do_request(mdsc, NULL, req); + inode = req->r_target_inode; + if (inode) + ihold(inode); + ceph_mdsc_put_request(req); + if (!inode) + return ERR_PTR(-ENOENT); dentry = d_obtain_alias(inode); if (IS_ERR(dentry)) { - pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); iput(inode); return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { - iput(inode); + dput(dentry); return ERR_PTR(err); } - dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); + dout("__get_parent ino %llx parent %p ino %llx.%llx\n", + child ? ceph_ino(child->d_inode) : ino, + dentry, ceph_vinop(inode)); return dentry; } -static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) +static struct dentry *ceph_get_parent(struct dentry *child) { - if (fh_type == 1) - return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw); - else - return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw); + /* don't re-export snaps */ + if (ceph_snap(child->d_inode) != CEPH_NOSNAP) + return ERR_PTR(-EINVAL); + + dout("get_parent %p ino %llx.%llx\n", + child, ceph_vinop(child->d_inode)); + return __get_parent(child->d_sb, child, 0); } /* - * get parent, if possible. - * - * FIXME: we could do better by querying the mds to discover the - * parent. + * convert regular fh to parent */ static struct dentry *ceph_fh_to_parent(struct super_block *sb, - struct fid *fid, + struct fid *fid, int fh_len, int fh_type) { struct ceph_nfs_confh *cfh = (void *)fid->raw; - struct ceph_vino vino; - struct inode *inode; struct dentry *dentry; - int err; - if (fh_type == 1) - return ERR_PTR(-ESTALE); + if (fh_type != FILEID_INO32_GEN_PARENT) + return NULL; + if (fh_len < sizeof(*cfh) / 4) + return NULL; - pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, - cfh->parent_name_hash); + dout("fh_to_parent %llx\n", cfh->parent_ino); + dentry = __get_parent(sb, NULL, cfh->ino); + if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT) + dentry = __fh_to_dentry(sb, cfh->parent_ino); + return dentry; +} - vino.ino = cfh->ino; - vino.snap = CEPH_NOSNAP; - inode = ceph_find_inode(sb, vino); - if (!inode) - return ERR_PTR(-ESTALE); +static int ceph_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct ceph_mds_client *mdsc; + struct ceph_mds_request *req; + int err; - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", - cfh->ino, inode); - iput(inode); - return dentry; - } - err = ceph_init_dentry(dentry); - if (err < 0) { - iput(inode); - return ERR_PTR(err); + mdsc = ceph_inode_to_client(child->d_inode)->mdsc; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, + USE_ANY_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + + mutex_lock(&parent->d_inode->i_mutex); + + req->r_inode = child->d_inode; + ihold(child->d_inode); + req->r_ino2 = ceph_vino(parent->d_inode); + req->r_locked_dir = parent->d_inode; + req->r_num_caps = 2; + err = ceph_mdsc_do_request(mdsc, NULL, req); + + mutex_unlock(&parent->d_inode->i_mutex); + + if (!err) { + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + memcpy(name, rinfo->dname, rinfo->dname_len); + name[rinfo->dname_len] = 0; + dout("get_name %p ino %llx.%llx name %s\n", + child, ceph_vinop(child->d_inode), name); + } else { + dout("get_name %p ino %llx.%llx err %d\n", + child, ceph_vinop(child->d_inode), err); } - dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); - return dentry; + + ceph_mdsc_put_request(req); + return err; } const struct export_operations ceph_export_ops = { .encode_fh = ceph_encode_fh, .fh_to_dentry = ceph_fh_to_dentry, .fh_to_parent = ceph_fh_to_parent, + .get_parent = ceph_get_parent, + .get_name = ceph_get_name, }; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6251a1574b9..302085100c2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1,13 +1,18 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> +#include <linux/aio.h> +#include <linux/falloc.h> #include "super.h" #include "mds_client.h" +#include "cache.h" /* * Ceph file operations @@ -38,8 +43,8 @@ static struct ceph_mds_request * prepare_open_request(struct super_block *sb, int flags, int create_mode) { - struct ceph_client *client = ceph_sb_to_client(sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int want_auth = USE_ANY_MDS; int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; @@ -53,7 +58,6 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) req->r_fmode = ceph_flags_to_mode(flags); req->r_args.open.flags = cpu_to_le32(flags); req->r_args.open.mode = cpu_to_le32(create_mode); - req->r_args.open.preferred = cpu_to_le32(-1); out: return req; } @@ -66,9 +70,23 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; switch (inode->i_mode & S_IFMT) { case S_IFREG: + /* First file open request creates the cookie, we want to keep + * this cookie around for the filetime of the inode as not to + * have to worry about fscache register / revoke / operation + * races. + * + * Also, if we know the operation is going to invalidate data + * (non readonly) just nuke the cache right away. + */ + ceph_fscache_register_inode_cookie(mdsc->fsc, ci); + if ((fmode & CEPH_FILE_MODE_WR)) + ceph_fscache_invalidate(inode); case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); @@ -106,9 +124,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) } /* - * If the filp already has private_data, that means the file was - * already opened by intent during lookup, and we do nothing. - * * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the * MDS). We do, however, need to inform the MDS (asynchronously) @@ -117,11 +132,11 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) int ceph_open(struct inode *inode, struct file *file) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode = NULL; int err; int flags, fmode, wanted; @@ -146,18 +161,20 @@ int ceph_open(struct inode *inode, struct file *file) /* trivially open snapdir */ if (ceph_snap(inode) == CEPH_SNAPDIR) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } /* - * No need to block if we have any caps. Update wanted set + * No need to block if we have caps on the auth MDS (for + * write) or any MDS (for read). Update wanted set * asynchronously. */ - spin_lock(&inode->i_lock); - if (__ceph_is_any_real_caps(ci)) { + spin_lock(&ci->i_ceph_lock); + if (__ceph_is_any_real_caps(ci) && + (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci); int issued = __ceph_caps_issued(ci, NULL); @@ -165,7 +182,7 @@ int ceph_open(struct inode *inode, struct file *file) inode, fmode, ceph_cap_string(wanted), ceph_cap_string(issued)); __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); /* adjust wanted? */ if ((issued & wanted) != wanted && @@ -177,10 +194,11 @@ int ceph_open(struct inode *inode, struct file *file) } else if (ceph_snap(inode) != CEPH_NOSNAP && (ci->i_snap_caps & wanted) == wanted) { __ceph_get_fmode(ci, fmode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } - spin_unlock(&inode->i_lock); + + spin_unlock(&ci->i_ceph_lock); dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); req = prepare_open_request(inode->i_sb, flags, 0); @@ -188,9 +206,14 @@ int ceph_open(struct inode *inode, struct file *file) err = PTR_ERR(req); goto out; } - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + if (flags & O_CREAT) + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); @@ -201,36 +224,34 @@ out: /* - * Do a lookup + open with a single request. - * - * If this succeeds, but some subsequent check in the vfs - * may_open() fails, the struct *file gets cleaned up (i.e. - * ceph_release gets called). So fear not! + * Do a lookup + open with a single request. If we get a non-existent + * file or symlink, return 1 so the VFS can retry. */ -/* - * flags - * path_lookup_open -> LOOKUP_OPEN - * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE - */ -struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, - struct nameidata *nd, int mode, - int locked_dir) +int ceph_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened) { - struct ceph_client *client = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; - struct file *file = nd->intent.open.file; - struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); + struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + struct dentry *dn; int err; - int flags = nd->intent.open.flags - 1; /* silly vfs! */ - dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n", - dentry, dentry->d_name.len, dentry->d_name.name, flags, mode); + dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", + dir, dentry, dentry->d_name.len, dentry->d_name.name, + d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); + + if (dentry->d_name.len > NAME_MAX) + return -ENAMETOOLONG; + + err = ceph_init_dentry(dentry); + if (err < 0) + return err; /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) - return ERR_CAST(req); + return PTR_ERR(req); req->r_dentry = dget(dentry); req->r_num_caps = 2; if (flags & O_CREAT) { @@ -238,16 +259,44 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; } req->r_locked_dir = dir; /* caller holds dir->i_mutex */ - err = ceph_mdsc_do_request(mdsc, parent_inode, req); - dentry = ceph_finish_lookup(req, dentry, err); - if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_mdsc_do_request(mdsc, + (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, + req); + if (err) + goto out_err; + + err = ceph_handle_snapdir(req, dentry, err); + if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (!err) - err = ceph_init_file(req->r_dentry->d_inode, file, - req->r_fmode); + + if (d_unhashed(dentry)) { + dn = ceph_finish_lookup(req, dentry, err); + if (IS_ERR(dn)) + err = PTR_ERR(dn); + } else { + /* we were given a hashed negative dentry */ + dn = NULL; + } + if (err) + goto out_err; + if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { + /* make vfs retry on splice, ENOENT, or symlink */ + dout("atomic_open finish_no_open on dn %p\n", dn); + err = finish_no_open(file, dn); + } else { + dout("atomic_open finish_open on dn %p\n", dn); + if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { + ceph_init_acl(dentry, dentry->d_inode, dir); + *opened |= FILE_CREATED; + } + err = finish_open(file, dentry, ceph_open, opened); + } +out_err: + if (!req->r_err && req->r_target_inode) + ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); ceph_mdsc_put_request(req); - dout("ceph_lookup_open result=%p\n", dentry); - return dentry; + dout("atomic_open result=%d\n", err); + return err; } int ceph_release(struct inode *inode, struct file *file) @@ -265,168 +314,11 @@ int ceph_release(struct inode *inode, struct file *file) kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); return 0; } /* - * build a vector of user pages - */ -static struct page **get_direct_page_vector(const char __user *data, - int num_pages, - loff_t off, size_t len) -{ - struct page **pages; - int rc; - - pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); - if (!pages) - return ERR_PTR(-ENOMEM); - - down_read(¤t->mm->mmap_sem); - rc = get_user_pages(current, current->mm, (unsigned long)data, - num_pages, 0, 0, pages, NULL); - up_read(¤t->mm->mmap_sem); - if (rc < 0) - goto fail; - return pages; - -fail: - kfree(pages); - return ERR_PTR(rc); -} - -static void put_page_vector(struct page **pages, int num_pages) -{ - int i; - - for (i = 0; i < num_pages; i++) - put_page(pages[i]); - kfree(pages); -} - -void ceph_release_page_vector(struct page **pages, int num_pages) -{ - int i; - - for (i = 0; i < num_pages; i++) - __free_pages(pages[i], 0); - kfree(pages); -} - -/* - * allocate a vector new pages - */ -struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) -{ - struct page **pages; - int i; - - pages = kmalloc(sizeof(*pages) * num_pages, flags); - if (!pages) - return ERR_PTR(-ENOMEM); - for (i = 0; i < num_pages; i++) { - pages[i] = __page_cache_alloc(flags); - if (pages[i] == NULL) { - ceph_release_page_vector(pages, i); - return ERR_PTR(-ENOMEM); - } - } - return pages; -} - -/* - * copy user data into a page vector - */ -static int copy_user_to_page_vector(struct page **pages, - const char __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_CACHE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, PAGE_CACHE_SIZE-po, left); - bad = copy_from_user(page_address(pages[i]) + po, data, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - po += l - bad; - if (po == PAGE_CACHE_SIZE) { - po = 0; - i++; - } - } - return len; -} - -/* - * copy user data from a page vector into a user pointer - */ -static int copy_page_vector_to_user(struct page **pages, char __user *data, - loff_t off, size_t len) -{ - int i = 0; - int po = off & ~PAGE_CACHE_MASK; - int left = len; - int l, bad; - - while (left > 0) { - l = min_t(int, left, PAGE_CACHE_SIZE-po); - bad = copy_to_user(data, page_address(pages[i]) + po, l); - if (bad == l) - return -EFAULT; - data += l - bad; - left -= l - bad; - if (po) { - po += l - bad; - if (po == PAGE_CACHE_SIZE) - po = 0; - } - i++; - } - return len; -} - -/* - * Zero an extent within a page vector. Offset is relative to the - * start of the first page. - */ -static void zero_page_vector_range(int off, int len, struct page **pages) -{ - int i = off >> PAGE_CACHE_SHIFT; - - off &= ~PAGE_CACHE_MASK; - - dout("zero_page_vector_page %u~%u\n", off, len); - - /* leading partial page? */ - if (off) { - int end = min((int)PAGE_CACHE_SIZE, off + len); - dout("zeroing %d %p head from %d\n", i, pages[i], - (int)off); - zero_user_segment(pages[i], off, end); - len -= (end - off); - i++; - } - while (len >= PAGE_CACHE_SIZE) { - dout("zeroing %d %p len=%d\n", i, pages[i], len); - zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); - len -= PAGE_CACHE_SIZE; - i++; - } - /* trailing partial page? */ - if (len) { - dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); - zero_user_segment(pages[i], 0, len); - } -} - - -/* * Read a range of bytes striped over one or more objects. Iterate over * objects we stripe over. (That's not atomic, but good enough for now.) * @@ -436,13 +328,14 @@ static void zero_page_vector_range(int off, int len, struct page **pages) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof) + int *checkeof, bool o_direct, + unsigned long buf_align) { - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - u64 pos, this_len; - int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ - int left, pages_left; + u64 pos, this_len, left; + int io_align, page_align; + int pages_left; int read; struct page **page_pos; int ret; @@ -456,58 +349,57 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; + io_align = off & ~PAGE_MASK; more: + if (o_direct) + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; this_len = left; - ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), + ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, - page_pos, pages_left); - hit_stripe = this_len < left; - was_short = ret >= 0 && ret < this_len; + page_pos, pages_left, page_align); if (ret == -ENOENT) ret = 0; - dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, + hit_stripe = this_len < left; + was_short = ret >= 0 && ret < this_len; + dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); - if (ret > 0) { - int didpages = - ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT; - - if (read < pos - off) { - dout(" zero gap %llu to %llu\n", off + read, pos); - zero_page_vector_range(page_off + read, - pos - off - read, pages); + if (ret >= 0) { + int didpages; + if (was_short && (pos + ret < inode->i_size)) { + u64 tmp = min(this_len - ret, + inode->i_size - pos - ret); + dout(" zero gap %llu to %llu\n", + pos + ret, pos + ret + tmp); + ceph_zero_page_vector_range(page_align + read + ret, + tmp, pages); + ret += tmp; } + + didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; pos += ret; read = pos - off; left -= ret; page_pos += didpages; pages_left -= didpages; - /* hit stripe? */ - if (left && hit_stripe) + /* hit stripe and need continue*/ + if (left && hit_stripe && pos < inode->i_size) goto more; } - if (was_short) { - /* was original extent fully inside i_size? */ - if (pos + left <= inode->i_size) { - dout("zero tail\n"); - zero_page_vector_range(page_off + read, len - read, - pages); - read = len; - goto out; - } - - /* check i_size */ - *checkeof = 1; + if (read > 0) { + ret = read; + /* did we bounce off eof? */ + if (pos + left > inode->i_size) + *checkeof = 1; } -out: - if (ret >= 0) - ret = read; dout("striped_read returns %d\n", ret); return ret; } @@ -518,213 +410,361 @@ out: * * If the read spans object boundary, just do multiple reads. */ -static ssize_t ceph_sync_read(struct file *file, char __user *data, - unsigned len, loff_t *poff, int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, + int *checkeof) { - struct inode *inode = file->f_dentry->d_inode; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct page **pages; - u64 off = *poff; - int num_pages = calc_pages_for(off, len); - int ret; + u64 off = iocb->ki_pos; + int num_pages, ret; + size_t len = iov_iter_count(i); - dout("sync_read on file %p %llu~%u %s\n", file, off, len, + dout("sync_read on file %p %llu~%u %s\n", file, off, + (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + /* + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, off, + off + len); + if (ret < 0) + return ret; if (file->f_flags & O_DIRECT) { - pages = get_direct_page_vector(data, num_pages, off, len); + while (iov_iter_count(i)) { + size_t start; + ssize_t n; - /* - * flush any page cache pages in this range. this - * will make concurrent normal and O_DIRECT io slow, - * but it will at least behave sensibly when they are - * in sequence. - */ - } else { - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } - if (IS_ERR(pages)) - return PTR_ERR(pages); + n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start); + if (n < 0) + return n; - ret = filemap_write_and_wait(inode->i_mapping); - if (ret < 0) - goto done; + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; - ret = striped_read(inode, off, len, pages, num_pages, checkeof); + ret = striped_read(inode, off, n, + pages, num_pages, checkeof, + 1, start); - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = copy_page_vector_to_user(pages, data, off, ret); - if (ret >= 0) - *poff = off + ret; + ceph_put_page_vector(pages, num_pages, true); -done: - if (file->f_flags & O_DIRECT) - put_page_vector(pages, num_pages); - else + if (ret <= 0) + break; + off += ret; + iov_iter_advance(i, ret); + if (ret < n) + break; + } + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof, 0, 0); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + int copy = min_t(size_t, PAGE_SIZE, left); + l = copy_page_to_iter(pages[k++], 0, copy, i); + off += l; + left -= l; + if (l < copy) + break; + } + } ceph_release_page_vector(pages, num_pages); + } + + if (off > iocb->ki_pos) { + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + } + dout("sync_read result %d\n", ret); return ret; } /* - * Write commit callback, called if we requested both an ACK and - * ONDISK commit reply from the OSD. + * Write commit request unsafe callback, called to tell us when a + * request is unsafe (that is, in flight--has been handed to the + * messenger to send to its target osd). It is called again when + * we've received a response message indicating the request is + * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request + * is completed early (and unsuccessfully) due to a timeout or + * interrupt. + * + * This is used if we requested both an ACK and ONDISK commit reply + * from the OSD. */ -static void sync_write_commit(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_inode); - dout("sync_write_commit %p tid %llu\n", req, req->r_tid); - spin_lock(&ci->i_unsafe_lock); - list_del_init(&req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, + unsafe ? "un" : ""); + if (unsafe) { + ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_item, + &ci->i_unsafe_writes); + spin_unlock(&ci->i_unsafe_lock); + } else { + spin_lock(&ci->i_unsafe_lock); + list_del_init(&req->r_unsafe_item); + spin_unlock(&ci->i_unsafe_lock); + ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); + } } + /* - * Synchronous write, straight from __user pointer or user pages (if - * O_DIRECT). + * Synchronous write, straight from __user pointer or user pages. * * If write spans object boundary, just do multiple writes. (For a * correct atomic write, we should e.g. take write locks on all * objects, rollback on failure, etc.) */ -static ssize_t ceph_sync_write(struct file *file, const char __user *data, - size_t left, loff_t *offset) +static ssize_t +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode = file->f_dentry->d_inode; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_client *client = ceph_inode_to_client(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; int num_pages; - long long unsigned pos; - u64 len; int written = 0; int flags; - int do_sync = 0; int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); - if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP) + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u %s\n", file, *offset, - (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + dout("sync_direct_write on file %p %lld~%u\n", file, pos, + (unsigned)count); - if (file->f_flags & O_APPEND) - pos = i_size_read(inode); - else - pos = *offset; - - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + left) >> PAGE_CACHE_SHIFT); + (pos + count) >> PAGE_CACHE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0) - flags |= CEPH_OSD_FLAG_ACK; - else - do_sync = 1; - /* - * we may need to do multiple writes here if we span an object - * boundary. this isn't atomic, unfortunately. :( - */ -more: - len = left; - req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, - ceph_vino(inode), pos, &len, - CEPH_OSD_OP_WRITE, flags, - ci->i_snap_realm->cached_context, - do_sync, - ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, 2); - if (!req) - return -ENOMEM; - - num_pages = calc_pages_for(pos, len); + while (iov_iter_count(from) > 0) { + u64 len = iov_iter_single_seg_count(from); + size_t start; + ssize_t n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, + 2,/*include a 'startsync' command*/ + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } - if (file->f_flags & O_DIRECT) { - pages = get_direct_page_vector(data, num_pages, pos, len); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - goto out; + n = iov_iter_get_pages_alloc(from, &pages, len, &start); + if (unlikely(n < 0)) { + ret = n; + ceph_osdc_put_request(req); + break; } + num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE; /* * throw out any page cache pages in this range. this * may block. */ - truncate_inode_pages_range(inode->i_mapping, pos, - (pos+len) | (PAGE_CACHE_SIZE-1)); - } else { + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+n) | (PAGE_CACHE_SIZE-1)); + osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, + false, false); + + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + ceph_put_page_vector(pages, num_pages, false); + + ceph_osdc_put_request(req); + if (ret) + break; + pos += n; + written += n; + iov_iter_advance(from, n); + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } + + if (ret != -EOLDSNAPC && written > 0) { + iocb->ki_pos = pos; + ret = written; + } + return ret; +} + + +/* + * Synchronous write, straight from __user pointer or user pages. + * + * If write spans object boundary, just do multiple writes. (For a + * correct atomic write, we should e.g. take write locks on all + * objects, rollback on failure, etc.) + */ +static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_snap_context *snapc; + struct ceph_vino vino; + struct ceph_osd_request *req; + struct page **pages; + u64 len; + int num_pages; + int written = 0; + int flags; + int check_caps = 0; + int ret; + struct timespec mtime = CURRENT_TIME; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); + + if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + return -EROFS; + + dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + if (ret < 0) + return ret; + + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); + + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK; + + while ((len = iov_iter_count(from)) > 0) { + size_t left; + int n; + + snapc = ci->i_snap_realm->cached_context; + vino = ceph_vino(inode); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + vino, pos, &len, 1, + CEPH_OSD_OP_WRITE, flags, snapc, + ci->i_truncate_seq, + ci->i_truncate_size, + false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + + /* + * write from beginning of first page, + * regardless of io alignment + */ + num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; } - ret = copy_user_to_page_vector(pages, data, pos, len); + + left = len; + for (n = 0; n < num_pages; n++) { + size_t plen = min_t(size_t, left, PAGE_SIZE); + ret = copy_page_from_iter(pages[n], 0, plen, from); + if (ret != plen) { + ret = -EFAULT; + break; + } + left -= ret; + } + if (ret < 0) { ceph_release_page_vector(pages, num_pages); goto out; } - if ((file->f_flags & O_SYNC) == 0) { - /* get a second commit callback */ - req->r_safe_callback = sync_write_commit; - req->r_own_pages = 1; - } - } - req->r_pages = pages; - req->r_num_pages = num_pages; - req->r_inode = inode; + /* get a second commit callback */ + req->r_unsafe_callback = ceph_sync_write_unsafe; + req->r_inode = inode; - ret = ceph_osdc_start_request(&client->osdc, req, false); - if (!ret) { - if (req->r_safe_callback) { - /* - * Add to inode unsafe list only after we - * start_request so that a tid has been assigned. - */ - spin_lock(&ci->i_unsafe_lock); - list_add(&ci->i_unsafe_writes, &req->r_unsafe_item); - spin_unlock(&ci->i_unsafe_lock); - ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); - } - ret = ceph_osdc_wait_request(&client->osdc, req); - } + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, + false, true); - if (file->f_flags & O_DIRECT) - put_page_vector(pages, num_pages); - else if (file->f_flags & O_SYNC) - ceph_release_page_vector(pages, num_pages); + /* BUG_ON(vino.snap != CEPH_NOSNAP); */ + ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); out: - ceph_osdc_put_request(req); - if (ret == 0) { - pos += len; - written += len; - left -= len; - if (left) - goto more; + ceph_osdc_put_request(req); + if (ret == 0) { + pos += len; + written += len; + + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } else + break; + } + if (ret != -EOLDSNAPC && written > 0) { ret = written; - *offset = pos; - if (pos > i_size_read(inode)) - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, - NULL); + iocb->ki_pos = pos; } return ret; } @@ -736,57 +776,69 @@ out: * * Hmm, the sync read case isn't actually async... should it be? */ -static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *filp = iocb->ki_filp; - loff_t *ppos = &iocb->ki_pos; - size_t len = iov->iov_len; - struct inode *inode = filp->f_dentry->d_inode; + struct ceph_file_info *fi = filp->private_data; + size_t len = iocb->ki_nbytes; + struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - void *base = iov->iov_base; ssize_t ret; - int got = 0; + int want, got = 0; int checkeof = 0, read = 0; - dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", - inode, ceph_vinop(inode), pos, (unsigned)len, inode); again: - __ceph_do_pending_vmtruncate(inode); - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, - &got, -1); + dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_CACHE; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); if (ret < 0) - goto out; - dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)len, - ceph_cap_string(got)); + return ret; - if ((got & CEPH_CAP_FILE_CACHE) == 0 || + if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) + (fi->flags & CEPH_F_SYNC)) { + + dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); + /* hmm, this isn't really async... */ - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); - else - ret = generic_file_aio_read(iocb, iov, nr_segs, pos); + ret = ceph_sync_read(iocb, to, &checkeof); + } else { + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", + inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, + ceph_cap_string(got)); -out: + ret = generic_file_read_iter(iocb, to); + } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + int statret = ceph_do_getattr(inode, + CEPH_STAT_CAP_SIZE); /* hit EOF or hole? */ - if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, reading more\n"); + if (statret == 0 && iocb->ki_pos < inode->i_size && + ret < len) { + dout("sync_read hit hole, ppos %lld < size %lld" + ", reading more\n", iocb->ki_pos, + inode->i_size); + + iov_iter_advance(to, ret); read += ret; - base += ret; len -= ret; checkeof = 0; goto again; } } + if (ret >= 0) ret += read; @@ -803,90 +855,150 @@ out: * * If we are near ENOSPC, write synchronously. */ -static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_dentry->d_inode; + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; - loff_t endoff = pos + iov->iov_len; - int got = 0; - int ret, err; + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; + ssize_t count = iov_iter_count(from), written = 0; + int err, want, got; + loff_t pos = iocb->ki_pos; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; + mutex_lock(&inode->i_mutex); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = file->f_mapping->backing_dev_info; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + iov_iter_truncate(from, count); + + err = file_remove_suid(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + retry_snap: - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) - return -ENOSPC; - __ceph_do_pending_vmtruncate(inode); - dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - inode->i_size); - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, - &got, endoff); - if (ret < 0) + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + err = -ENOSPC; goto out; + } - dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, - ceph_cap_string(got)); + dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", + inode, ceph_vinop(inode), pos, count, inode->i_size); + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + got = 0; + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); + if (err < 0) + goto out; - if ((got & CEPH_CAP_FILE_BUFFER) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { - ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, - &iocb->ki_pos); - } else { - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - - if ((ret >= 0 || ret == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); - if (err < 0) - ret = err; + dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", + inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); + + if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || + (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + struct iov_iter data; + mutex_unlock(&inode->i_mutex); + /* we might need to revert back to that point */ + data = *from; + if (file->f_flags & O_DIRECT) + written = ceph_sync_direct_write(iocb, &data); + else + written = ceph_sync_write(iocb, &data); + if (written == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u" + "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), + pos, (unsigned)count); + mutex_lock(&inode->i_mutex); + goto retry_snap; } + if (written > 0) + iov_iter_advance(from, written); + } else { + loff_t old_size = inode->i_size; + /* + * No need to acquire the i_truncate_mutex. Because + * the MDS revokes Fwb caps before sending truncate + * message to us. We can't get Fwb cap while there + * are pending vmtruncate. So write and vmtruncate + * can not run at the same time + */ + written = generic_perform_write(file, from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; + if (inode->i_size > old_size) + ceph_fscache_update_objectsize(inode); + mutex_unlock(&inode->i_mutex); } - if (ret >= 0) { - spin_lock(&inode->i_lock); - __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&inode->i_lock); + + if (written >= 0) { + int dirty; + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); } -out: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, + inode, ceph_vinop(inode), pos, (unsigned)count, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); - if (ret == -EOLDSNAPC) { - dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", - inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); - goto retry_snap; + if (written >= 0 && + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { + err = vfs_fsync_range(file, pos, pos + written - 1, 1); + if (err < 0) + written = err; } - return ret; + goto out_unlocked; + +out: + mutex_unlock(&inode->i_mutex); +out_unlocked: + current->backing_dev_info = NULL; + return written ? written : err; } /* * llseek. be sure to verify file size on SEEK_END. */ -static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) +static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; int ret; mutex_lock(&inode->i_mutex); - __ceph_do_pending_vmtruncate(inode); - switch (origin) { - case SEEK_END: + + if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); if (ret < 0) { offset = ret; goto out; } + } + + switch (whence) { + case SEEK_END: offset += inode->i_size; break; case SEEK_CUR: @@ -902,37 +1014,239 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int origin) } offset += file->f_pos; break; + case SEEK_DATA: + if (offset >= inode->i_size) { + ret = -ENXIO; + goto out; + } + break; + case SEEK_HOLE: + if (offset >= inode->i_size) { + ret = -ENXIO; + goto out; + } + offset = inode->i_size; + break; + } + + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + +out: + mutex_unlock(&inode->i_mutex); + return offset; +} + +static inline void ceph_zero_partial_page( + struct inode *inode, loff_t offset, unsigned size) +{ + struct page *page; + pgoff_t index = offset >> PAGE_CACHE_SHIFT; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + wait_on_page_writeback(page); + zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size); + unlock_page(page); + page_cache_release(page); + } +} + +static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, + loff_t length) +{ + loff_t nearly = round_up(offset, PAGE_CACHE_SIZE); + if (offset < nearly) { + loff_t size = nearly - offset; + if (length < size) + size = length; + ceph_zero_partial_page(inode, offset, size); + offset += size; + length -= size; + } + if (length >= PAGE_CACHE_SIZE) { + loff_t size = round_down(length, PAGE_CACHE_SIZE); + truncate_pagecache_range(inode, offset, offset + size - 1); + offset += size; + length -= size; } + if (length) + ceph_zero_partial_page(inode, offset, length); +} - if (offset < 0 || offset > inode->i_sb->s_maxbytes) { - offset = -EINVAL; +static int ceph_zero_partial_object(struct inode *inode, + loff_t offset, loff_t *length) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + int ret = 0; + loff_t zero = 0; + int op; + + if (!length) { + op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; + length = &zero; + } else { + op = CEPH_OSD_OP_ZERO; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), + offset, length, + 1, op, + CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ONDISK, + NULL, 0, 0, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); goto out; } - /* Special lock needed here? */ - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; + ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, + &inode->i_mtime); + + ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!ret) { + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + if (ret == -ENOENT) + ret = 0; } + ceph_osdc_put_request(req); out: + return ret; +} + +static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) +{ + int ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + s32 stripe_unit = ceph_file_layout_su(ci->i_layout); + s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + s32 object_size = ceph_file_layout_object_size(ci->i_layout); + u64 object_set_size = object_size * stripe_count; + u64 nearly, t; + + /* round offset up to next period boundary */ + nearly = offset + object_set_size - 1; + t = nearly; + nearly -= do_div(t, object_set_size); + + while (length && offset < nearly) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + while (length >= object_set_size) { + int i; + loff_t pos = offset; + for (i = 0; i < stripe_count; ++i) { + ret = ceph_zero_partial_object(inode, pos, NULL); + if (ret < 0) + return ret; + pos += stripe_unit; + } + offset += object_set_size; + length -= object_set_size; + } + while (length) { + loff_t size = length; + ret = ceph_zero_partial_object(inode, offset, &size); + if (ret < 0) + return ret; + offset += size; + length -= size; + } + return ret; +} + +static long ceph_fallocate(struct file *file, int mode, + loff_t offset, loff_t length) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = + &ceph_inode_to_client(inode)->client->osdc; + int want, got = 0; + int dirty; + int ret = 0; + loff_t endoff = 0; + loff_t size; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + if (ceph_snap(inode) != CEPH_NOSNAP) { + ret = -EROFS; + goto unlock; + } + + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { + ret = -ENOSPC; + goto unlock; + } + + size = i_size_read(inode); + if (!(mode & FALLOC_FL_KEEP_SIZE)) + endoff = offset + length; + + if (fi->fmode & CEPH_FILE_MODE_LAZY) + want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; + else + want = CEPH_CAP_FILE_BUFFER; + + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); + if (ret < 0) + goto unlock; + + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (offset < size) + ceph_zero_pagecache_range(inode, offset, length); + ret = ceph_zero_objects(inode, offset, length); + } else if (endoff > size) { + truncate_pagecache_range(inode, size, -1); + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, NULL); + } + + if (!ret) { + spin_lock(&ci->i_ceph_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + } + + ceph_put_cap_refs(ci, got); +unlock: mutex_unlock(&inode->i_mutex); - return offset; + return ret; } const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, .llseek = ceph_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = ceph_aio_read, - .aio_write = ceph_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = ceph_read_iter, + .write_iter = ceph_write_iter, .mmap = ceph_mmap, .fsync = ceph_fsync, + .lock = ceph_lock, + .flock = ceph_flock, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, + .fallocate = ceph_fallocate, }; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8f9b9fe8ef9..04c89c266ce 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1,8 +1,7 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/module.h> #include <linux/fs.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/uaccess.h> @@ -10,10 +9,13 @@ #include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> -#include <linux/pagevec.h> +#include <linux/posix_acl.h> +#include <linux/random.h> #include "super.h" -#include "decode.h" +#include "mds_client.h" +#include "cache.h" +#include <linux/ceph/decode.h> /* * Ceph inode operations @@ -36,6 +38,13 @@ static void ceph_vmtruncate_work(struct work_struct *work); /* * find or create an inode, given the ceph ino number */ +static int ceph_set_ino_cb(struct inode *inode, void *data) +{ + ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; + inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); + return 0; +} + struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) { struct inode *inode; @@ -88,6 +97,8 @@ const struct inode_operations ceph_file_iops = { .getxattr = ceph_getxattr, .listxattr = ceph_listxattr, .removexattr = ceph_removexattr, + .get_acl = ceph_get_acl, + .set_acl = ceph_set_acl, }; @@ -169,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f) * specified, copy the frag delegation info to the caller if * it is present. */ -u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, - struct ceph_inode_frag *pfrag, - int *found) +static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) { u32 t = ceph_frag_make(0, 0); struct ceph_inode_frag *frag; @@ -181,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, if (found) *found = 0; - mutex_lock(&ci->i_fragtree_mutex); while (1) { WARN_ON(!ceph_frag_contains_value(t, v)); frag = __ceph_find_frag(ci, t); @@ -210,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, } dout("choose_frag(%x) = %x\n", v, t); - mutex_unlock(&ci->i_fragtree_mutex); return t; } +u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, int *found) +{ + u32 ret; + mutex_lock(&ci->i_fragtree_mutex); + ret = __ceph_choose_frag(ci, v, pfrag, found); + mutex_unlock(&ci->i_fragtree_mutex); + return ret; +} + /* * Process dirfrag (delegation) info from the mds. Include leaf * fragment in tree ONLY if ndist > 0. Otherwise, only @@ -227,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode, u32 id = le32_to_cpu(dirinfo->frag); int mds = le32_to_cpu(dirinfo->auth); int ndist = le32_to_cpu(dirinfo->ndist); + int diri_auth = -1; int i; int err = 0; + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + diri_auth = ci->i_auth_cap->mds; + spin_unlock(&ci->i_ceph_lock); + mutex_lock(&ci->i_fragtree_mutex); - if (ndist == 0) { + if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. */ frag = __ceph_find_frag(ci, id); if (!frag) @@ -276,6 +300,75 @@ out: return err; } +static int ceph_fill_fragtree(struct inode *inode, + struct ceph_frag_tree_head *fragtree, + struct ceph_mds_reply_dirfrag *dirinfo) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_inode_frag *frag; + struct rb_node *rb_node; + int i; + u32 id, nsplits; + bool update = false; + + mutex_lock(&ci->i_fragtree_mutex); + nsplits = le32_to_cpu(fragtree->nsplits); + if (nsplits) { + i = prandom_u32() % nsplits; + id = le32_to_cpu(fragtree->splits[i].frag); + if (!__ceph_find_frag(ci, id)) + update = true; + } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { + rb_node = rb_first(&ci->i_fragtree); + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) + update = true; + } + if (!update && dirinfo) { + id = le32_to_cpu(dirinfo->frag); + if (id != __ceph_choose_frag(ci, id, NULL, NULL)) + update = true; + } + if (!update) + goto out_unlock; + + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); + rb_node = rb_first(&ci->i_fragtree); + for (i = 0; i < nsplits; i++) { + id = le32_to_cpu(fragtree->splits[i].frag); + frag = NULL; + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + if (ceph_frag_compare(frag->frag, id) >= 0) { + if (frag->frag != id) + frag = NULL; + else + rb_node = rb_next(rb_node); + break; + } + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + frag = NULL; + } + if (!frag) { + frag = __get_or_create_frag(ci, id); + if (IS_ERR(frag)) + continue; + } + frag->split_by = le32_to_cpu(fragtree->splits[i].by); + dout(" frag %x split by %d\n", frag->frag, frag->split_by); + } + while (rb_node) { + frag = rb_entry(rb_node, struct ceph_inode_frag, node); + rb_node = rb_next(rb_node); + rb_erase(&frag->node, &ci->i_fragtree); + kfree(frag); + } +out_unlock: + mutex_unlock(&ci->i_fragtree_mutex); + return 0; +} /* * initialize a newly allocated inode. @@ -291,12 +384,17 @@ struct inode *ceph_alloc_inode(struct super_block *sb) dout("alloc_inode %p\n", &ci->vfs_inode); + spin_lock_init(&ci->i_ceph_lock); + ci->i_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; - ci->i_release_count = 0; + atomic_set(&ci->i_release_count, 1); + atomic_set(&ci->i_complete_count, 0); ci->i_symlink = NULL; + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); @@ -323,9 +421,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; INIT_LIST_HEAD(&ci->i_cap_delay_list); - ci->i_cap_exporting_mds = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_issued = 0; INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; @@ -333,6 +428,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) for (i = 0; i < CEPH_FILE_MODE_NUM; i++) ci->i_nr_by_mode[i] = 0; + mutex_init(&ci->i_truncate_mutex); ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; @@ -346,6 +442,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_rd_ref = 0; ci->i_rdcache_ref = 0; ci->i_wr_ref = 0; + ci->i_wb_ref = 0; ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; ci->i_shared_gen = 0; @@ -365,9 +462,19 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work); + ceph_fscache_inode_init(ci); + return &ci->vfs_inode; } +static void ceph_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ceph_inode_info *ci = ceph_inode(inode); + + kmem_cache_free(ceph_inode_cachep, ci); +} + void ceph_destroy_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -376,15 +483,17 @@ void ceph_destroy_inode(struct inode *inode) dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + ceph_fscache_unregister_inode_cookie(ci); + ceph_queue_caps_release(inode); /* * we may still have a snap_realm reference if there are stray - * caps in i_cap_exporting_issued or i_snap_caps. + * caps in i_snap_caps. */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", realm); @@ -407,9 +516,18 @@ void ceph_destroy_inode(struct inode *inode) if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); - kmem_cache_free(ceph_inode_cachep, ci); + call_rcu(&inode->i_rcu, ceph_i_callback); } +int ceph_drop_inode(struct inode *inode) +{ + /* + * Positve dentry and corresponding inode are always accompanied + * in MDS reply. So no need to keep inode in the cache after + * dropping all its aliases. + */ + return 1; +} /* * Helpers to fill in size, ctime, mtime, and atime. We have to be @@ -435,15 +553,20 @@ int ceph_fill_file_size(struct inode *inode, int issued, dout("truncate_seq %u -> %u\n", ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; + + /* the MDS should have revoked these caps */ + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | + CEPH_CAP_FILE_WR | + CEPH_CAP_FILE_LAZYIO)); /* * If we hold relevant caps, or in the case where we're * not the only client referencing this file and we * don't hold those caps, then we need to check whether * the file is either opened or mmaped */ - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| - CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| - CEPH_CAP_FILE_EXCL)) || + if ((issued & (CEPH_CAP_FILE_CACHE| + CEPH_CAP_FILE_BUFFER)) || mapping_mapped(inode->i_mapping) || __ceph_caps_file_wanted(ci)) { ci->i_truncate_pending++; @@ -457,6 +580,10 @@ int ceph_fill_file_size(struct inode *inode, int issued, truncate_size); ci->i_truncate_size = truncate_size; } + + if (queue_trunc) + ceph_fscache_invalidate(inode); + return queue_trunc; } @@ -469,7 +596,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, if (issued & (CEPH_CAP_FILE_EXCL| CEPH_CAP_FILE_WR| - CEPH_CAP_FILE_BUFFER)) { + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { if (timespec_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, @@ -509,7 +638,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, warn = 1; } } else { - /* we have no write caps; whatever the MDS says is true */ + /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { inode->i_ctime = *ctime; inode->i_mtime = *mtime; @@ -535,20 +664,26 @@ static int fill_inode(struct inode *inode, unsigned long ttl_from, int cap_fmode, struct ceph_cap_reservation *caps_reservation) { + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); - int i; - int issued, implemented; + int issued = 0, implemented, new_issued; struct timespec mtime, atime, ctime; - u32 nsplits; struct ceph_buffer *xattr_blob = NULL; + struct ceph_cap *new_cap = NULL; int err = 0; - int queue_trunc = 0; + bool wake = false; + bool queue_trunc = false; + bool new_version = false; dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); + /* prealloc new cap struct */ + if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP) + new_cap = ceph_get_cap(mdsc, caps_reservation); + /* * prealloc xattr data, if it looks like we'll need it. only * if len > 4 (meaning there are actually xattrs; the first 4 @@ -561,52 +696,73 @@ static int fill_inode(struct inode *inode, iinfo->xattr_len); } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); /* * provided version will be odd if inode value is projected, - * even if stable. skip the update if we have a newer info - * (e.g., due to inode info racing form multiple MDSs), or if - * we are getting projected (unstable) inode info. + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update */ - if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) > le64_to_cpu(info->version)) - goto no_change; + if (ci->i_version == 0 || + ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + le64_to_cpu(info->version) > (ci->i_version & ~1))) + new_version = true; issued = __ceph_caps_issued(ci, &implemented); issued |= implemented | __ceph_caps_dirty(ci); + new_issued = ~issued & le32_to_cpu(info->cap.caps); /* update inode */ ci->i_version = le64_to_cpu(info->version); inode->i_version++; inode->i_rdev = le32_to_cpu(info->rdev); + inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && + (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = le32_to_cpu(info->mode); - inode->i_uid = le32_to_cpu(info->uid); - inode->i_gid = le32_to_cpu(info->gid); + inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); + inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, - inode->i_uid, inode->i_gid); + from_kuid(&init_user_ns, inode->i_uid), + from_kgid(&init_user_ns, inode->i_gid)); } - if ((issued & CEPH_CAP_LINK_EXCL) == 0) - inode->i_nlink = le32_to_cpu(info->nlink); - - /* be careful with mtime, atime, size */ - ceph_decode_timespec(&atime, &info->atime); - ceph_decode_timespec(&mtime, &info->mtime); - ceph_decode_timespec(&ctime, &info->ctime); - queue_trunc = ceph_fill_file_size(inode, issued, - le32_to_cpu(info->truncate_seq), - le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); - ceph_fill_file_time(inode, issued, - le32_to_cpu(info->time_warp_seq), - &ctime, &mtime, &atime); - - ci->i_max_size = le64_to_cpu(info->max_size); - ci->i_layout = info->layout; - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && + (issued & CEPH_CAP_LINK_EXCL) == 0) + set_nlink(inode, le32_to_cpu(info->nlink)); + + if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { + /* be careful with mtime, atime, size */ + ceph_decode_timespec(&atime, &info->atime); + ceph_decode_timespec(&mtime, &info->mtime); + ceph_decode_timespec(&ctime, &info->ctime); + ceph_fill_file_time(inode, issued, + le32_to_cpu(info->time_warp_seq), + &ctime, &mtime, &atime); + } + + if (new_version || + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + ci->i_layout = info->layout; + queue_trunc = ceph_fill_file_size(inode, issued, + le32_to_cpu(info->truncate_seq), + le64_to_cpu(info->truncate_size), + le64_to_cpu(info->size)); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + } /* xattrs */ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ @@ -619,6 +775,7 @@ static int fill_inode(struct inode *inode, memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + ceph_forget_all_cached_acls(inode); xattr_blob = NULL; } @@ -641,20 +798,21 @@ static int fill_inode(struct inode *inode, case S_IFLNK: inode->i_op = &ceph_symlink_iops; if (!ci->i_symlink) { - int symlen = iinfo->symlink_len; + u32 symlen = iinfo->symlink_len; char *sym; - BUG_ON(symlen != inode->i_size); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); + + err = -EINVAL; + if (WARN_ON(symlen != inode->i_size)) + goto out; err = -ENOMEM; - sym = kmalloc(symlen+1, GFP_NOFS); + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); if (!sym) goto out; - memcpy(sym, iinfo->symlink, symlen); - sym[symlen] = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) ci->i_symlink = sym; else @@ -665,53 +823,30 @@ static int fill_inode(struct inode *inode, inode->i_op = &ceph_dir_iops; inode->i_fop = &ceph_dir_fops; + ci->i_dir_layout = iinfo->dir_layout; + ci->i_files = le64_to_cpu(info->files); ci->i_subdirs = le64_to_cpu(info->subdirs); ci->i_rbytes = le64_to_cpu(info->rbytes); ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); ceph_decode_timespec(&ci->i_rctime, &info->rctime); - - /* set dir completion flag? */ - if (ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { - dout(" marking %p complete (empty)\n", inode); - ci->i_ceph_flags |= CEPH_I_COMPLETE; - ci->i_max_offset = 2; - } - - /* it may be better to set st_size in getattr instead? */ - if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) - inode->i_size = ci->i_rbytes; break; default: pr_err("fill_inode %llx.%llx BAD mode 0%o\n", ceph_vinop(inode), inode->i_mode); } -no_change: - spin_unlock(&inode->i_lock); - - /* queue truncate if we saw i_size decrease */ - if (queue_trunc) - ceph_queue_vmtruncate(inode); - - /* populate frag tree */ - /* FIXME: move me up, if/when version reflects fragtree changes */ - nsplits = le32_to_cpu(info->fragtree.nsplits); - mutex_lock(&ci->i_fragtree_mutex); - for (i = 0; i < nsplits; i++) { - u32 id = le32_to_cpu(info->fragtree.splits[i].frag); - struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); - - if (IS_ERR(frag)) - continue; - frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); - dout(" frag %x split by %d\n", frag->frag, frag->split_by); + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + ceph_snap(inode) == CEPH_NOSNAP && + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + dout(" marking %p complete (empty)\n", inode); + __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); } - mutex_unlock(&ci->i_fragtree_mutex); /* were we issued a capability? */ if (info->cap.caps) { @@ -724,30 +859,41 @@ no_change: le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), le64_to_cpu(info->cap.realm), - info->cap.flags, - caps_reservation); + info->cap.flags, &new_cap); + wake = true; } else { - spin_lock(&inode->i_lock); dout(" %p got snap_caps %s\n", inode, ceph_cap_string(le32_to_cpu(info->cap.caps))); ci->i_snap_caps |= le32_to_cpu(info->cap.caps); if (cap_fmode >= 0) __ceph_get_fmode(ci, cap_fmode); - spin_unlock(&inode->i_lock); } } else if (cap_fmode >= 0) { - pr_warning("mds issued no caps on %llx.%llx\n", + pr_warn("mds issued no caps on %llx.%llx\n", ceph_vinop(inode)); __ceph_get_fmode(ci, cap_fmode); } + spin_unlock(&ci->i_ceph_lock); + + if (wake) + wake_up_all(&ci->i_cap_wq); + + /* queue truncate if we saw i_size decrease */ + if (queue_trunc) + ceph_queue_vmtruncate(inode); + + /* populate frag tree */ + if (S_ISDIR(inode->i_mode)) + ceph_fill_fragtree(inode, &info->fragtree, dirinfo); /* update delegation info? */ if (dirinfo) ceph_fill_dirfrag(inode, dirinfo); err = 0; - out: + if (new_cap) + ceph_put_cap(mdsc, new_cap); if (xattr_blob) ceph_buffer_put(xattr_blob); return err; @@ -772,14 +918,14 @@ static void update_dentry_lease(struct dentry *dentry, return; spin_lock(&dentry->d_lock); - dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n", - dentry, le16_to_cpu(lease->mask), duration, ttl); + dout("update_dentry_lease %p duration %lu ms ttl %lu\n", + dentry, duration, ttl); /* make lease_rdcache_gen match directory */ dir = dentry->d_parent->d_inode; di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; - if (lease->mask == 0) + if (duration == 0) goto out_unlock; if (di->lease_gen == session->s_cap_gen && @@ -804,37 +950,6 @@ out_unlock: } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dn->d_parent->d_inode; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - di = ceph_dentry(dn); - - spin_lock(&inode->i_lock); - if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { - spin_unlock(&inode->i_lock); - return; - } - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&inode->i_lock); - - spin_lock(&dcache_lock); - spin_lock(&dn->d_lock); - list_move(&dn->d_u.d_child, &dir->d_subdirs); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dcache_lock); -} - -/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -863,8 +978,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } else if (realdn) { dout("dn %p (%d) spliced with %p (%d) " "inode %p ino %llx.%llx\n", - dn, atomic_read(&dn->d_count), - realdn, atomic_read(&realdn->d_count), + dn, d_count(dn), + realdn, d_count(realdn), realdn->d_inode, ceph_vinop(realdn->d_inode)); dput(dn); dn = realdn; @@ -875,7 +990,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - ceph_set_dentry_offset(dn); out: return dn; } @@ -896,10 +1010,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; - struct ceph_client *client = ceph_sb_to_client(sb); - int i = 0; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; dout("fill_trace %p is_dentry %d is_target %d\n", req, @@ -949,11 +1061,87 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, if (rinfo->head->is_dentry) { struct inode *dir = req->r_locked_dir; - err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, - session, req->r_request_started, -1, - &req->r_caps_reservation); - if (err < 0) - return err; + if (dir) { + err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, + session, req->r_request_started, -1, + &req->r_caps_reservation); + if (err < 0) + goto done; + } else { + WARN_ON_ONCE(1); + } + + if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { + struct qstr dname; + struct dentry *dn, *parent; + + BUG_ON(!rinfo->head->is_target); + BUG_ON(req->r_dentry); + + parent = d_find_any_alias(dir); + BUG_ON(!parent); + + dname.name = rinfo->dname; + dname.len = rinfo->dname_len; + dname.hash = full_name_hash(dname.name, dname.len); + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +retry_lookup: + dn = d_lookup(parent, &dname); + dout("d_lookup on parent=%p name=%.*s got %p\n", + parent, dname.len, dname.name, dn); + + if (!dn) { + dn = d_alloc(parent, &dname); + dout("d_alloc %p '%.*s' = %p\n", parent, + dname.len, dname.name, dn); + if (dn == NULL) { + dput(parent); + err = -ENOMEM; + goto done; + } + err = ceph_init_dentry(dn); + if (err < 0) { + dput(dn); + dput(parent); + goto done; + } + } else if (dn->d_inode && + (ceph_ino(dn->d_inode) != vino.ino || + ceph_snap(dn->d_inode) != vino.snap)) { + dout(" dn %p points to wrong inode %p\n", + dn, dn->d_inode); + d_delete(dn); + dput(dn); + goto retry_lookup; + } + + req->r_dentry = dn; + dput(parent); + } + } + + if (rinfo->head->is_target) { + vino.ino = le64_to_cpu(rinfo->targeti.in->ino); + vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); + + in = ceph_get_inode(sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto done; + } + req->r_target_inode = in; + + err = fill_inode(in, &rinfo->targeti, NULL, + session, req->r_request_started, + (!req->r_aborted && rinfo->head->result == 0) ? + req->r_fmode : -1, + &req->r_caps_reservation); + if (err < 0) { + pr_err("fill_inode badness %p %llx.%llx\n", + in, ceph_vinop(in)); + goto done; + } } /* @@ -961,8 +1149,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, * will have trouble splicing in the virtual snapdir later */ if (rinfo->head->is_dentry && !req->r_aborted && + req->r_locked_dir && (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, - client->mount_args->snapdir_name, + fsc->mount_options->snapdir_name, req->r_dentry->d_name.len))) { /* * lookup link rename : null -> possibly existing inode @@ -988,14 +1177,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, /* do we have a dn lease? */ have_lease = have_dir_cap || - (le16_to_cpu(rinfo->dlease->mask) & - CEPH_LOCK_DN); - + le32_to_cpu(rinfo->dlease->duration_ms); if (!have_lease) dout("fill_trace no dentry lease or dir cap\n"); /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, @@ -1004,9 +1194,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dout("fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); - /* d_move screws up d_subdirs order */ - ceph_i_clear(dir, CEPH_I_COMPLETE); - d_move(req->r_old_dentry, dn); dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, @@ -1018,15 +1205,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* take overwritten dentry's readdir offset */ - dout("dn %p gets %p offset %lld (old offset %lld)\n", - req->r_old_dentry, dn, ceph_dentry(dn)->offset, + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(dir); + ceph_dir_clear_complete(olddir); + + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); - ceph_dentry(req->r_old_dentry)->offset = - ceph_dentry(dn)->offset; dn = req->r_old_dentry; /* use old_dentry */ - in = dn->d_inode; } /* null dentry? */ @@ -1048,106 +1234,87 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, } /* attach proper inode */ - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); if (!dn->d_inode) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_trace bad get_inode " - "%llx.%llx\n", vino.ino, vino.snap); - err = PTR_ERR(in); - d_delete(dn); - goto done; - } + ceph_dir_clear_complete(dir); + ihold(in); dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - igrab(in); - } else if (ceph_ino(in) == vino.ino && - ceph_snap(in) == vino.snap) { - igrab(in); - } else { + } else if (dn->d_inode && dn->d_inode != in) { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, in, ceph_ino(in), ceph_snap(in), - vino.ino, vino.snap); + dn, dn->d_inode, ceph_vinop(dn->d_inode), + ceph_vinop(in)); have_lease = false; - in = NULL; } if (have_lease) update_dentry_lease(dn, rinfo->dlease, session, req->r_request_started); dout(" final dn %p\n", dn); - i++; - } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || - req->r_op == CEPH_MDS_OP_MKSNAP) { + } else if (!req->r_aborted && + (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || + req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_locked_dir; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); - BUG_ON(!req->r_locked_dir); - BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - pr_err("fill_inode get_inode badness %llx.%llx\n", - vino.ino, vino.snap); - err = PTR_ERR(in); - d_delete(dn); - goto done; - } + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); + ceph_dir_clear_complete(dir); + ihold(in); dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; } req->r_dentry = dn; /* may have spliced */ - igrab(in); - rinfo->head->is_dentry = 1; /* fool notrace handlers */ } +done: + dout("fill_trace done err=%d\n", err); + return err; +} - if (rinfo->head->is_target) { - vino.ino = le64_to_cpu(rinfo->targeti.in->ino); - vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); +/* + * Prepopulate our cache with readdir results, leases, etc. + */ +static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, + struct ceph_mds_session *session) +{ + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + int i, err = 0; - if (in == NULL || ceph_ino(in) != vino.ino || - ceph_snap(in) != vino.snap) { - in = ceph_get_inode(sb, vino); - if (IS_ERR(in)) { - err = PTR_ERR(in); - goto done; - } - } - req->r_target_inode = in; + for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_vino vino; + struct inode *in; + int rc; - err = fill_inode(in, - &rinfo->targeti, NULL, - session, req->r_request_started, - (le32_to_cpu(rinfo->head->result) == 0) ? - req->r_fmode : -1, - &req->r_caps_reservation); - if (err < 0) { - pr_err("fill_inode badness %p %llx.%llx\n", - in, ceph_vinop(in)); - goto done; + vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); + vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + + in = ceph_get_inode(req->r_dentry->d_sb, vino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + dout("new_inode badness got %d\n", err); + continue; + } + rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation); + if (rc < 0) { + pr_err("fill_inode badness on %p got %d\n", in, rc); + err = rc; + continue; } } -done: - dout("fill_trace done err=%d\n", err); return err; } -/* - * Prepopulate our cache with readdir results, leases, etc. - */ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { @@ -1156,11 +1323,26 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct qstr dname; struct dentry *dn; struct inode *in; - int err = 0, i; + int err = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - u64 frag = le32_to_cpu(rhead->args.readdir.frag); struct ceph_dentry_info *di; + u64 r_readdir_offset = req->r_readdir_offset; + u32 frag = le32_to_cpu(rhead->args.readdir.frag); + + if (rinfo->dir_dir && + le32_to_cpu(rinfo->dir_dir->frag) != frag) { + dout("readdir_prepopulate got new frag %x -> %x\n", + frag, le32_to_cpu(rinfo->dir_dir->frag)); + frag = le32_to_cpu(rinfo->dir_dir->frag); + if (ceph_frag_is_leftmost(frag)) + r_readdir_offset = 2; + else + r_readdir_offset = 0; + } + + if (req->r_aborted) + return readdir_prepopulate_inodes_only(req, session); if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { snapdir = ceph_get_snapdir(parent->d_inode); @@ -1174,6 +1356,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); } + /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_vino vino; @@ -1198,9 +1381,12 @@ retry_lookup: err = -ENOMEM; goto out; } - err = ceph_init_dentry(dn); - if (err < 0) + ret = ceph_init_dentry(dn); + if (ret < 0) { + dput(dn); + err = ret; goto out; + } } else if (dn->d_inode && (ceph_ino(dn->d_inode) != vino.ino || ceph_snap(dn->d_inode) != vino.snap)) { @@ -1211,48 +1397,58 @@ retry_lookup: goto retry_lookup; } else { /* reorder parent's d_subdirs */ - spin_lock(&dcache_lock); - spin_lock(&dn->d_lock); + spin_lock(&parent->d_lock); + spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); list_move(&dn->d_u.d_child, &parent->d_subdirs); spin_unlock(&dn->d_lock); - spin_unlock(&dcache_lock); + spin_unlock(&parent->d_lock); } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); - /* inode */ if (dn->d_inode) { in = dn->d_inode; } else { in = ceph_get_inode(parent->d_sb, vino); - if (in == NULL) { + if (IS_ERR(in)) { dout("new_inode badness\n"); - d_delete(dn); + d_drop(dn); dput(dn); - err = -ENOMEM; + err = PTR_ERR(in); goto out; } - dn = splice_dentry(dn, in, NULL); - if (IS_ERR(dn)) - dn = NULL; } if (fill_inode(in, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); + if (!dn->d_inode) + iput(in); + d_drop(dn); goto next_item; } - if (dn) - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, - req->r_request_started); + + if (!dn->d_inode) { + dn = splice_dentry(dn, in, NULL); + if (IS_ERR(dn)) { + err = PTR_ERR(dn); + dn = NULL; + goto next_item; + } + } + + di = dn->d_fsdata; + di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + + update_dentry_lease(dn, rinfo->dir_dlease[i], + req->r_session, + req->r_request_started); next_item: if (dn) dput(dn); } - req->r_did_prepopulate = true; + if (err == 0) + req->r_did_prepopulate = true; out: if (snapdir) { @@ -1268,7 +1464,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) struct ceph_inode_info *ci = ceph_inode(inode); int ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); inode->i_size = size; inode->i_blocks = (size + (1 << 9) - 1) >> 9; @@ -1278,7 +1474,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) (ci->i_reported_size << 1) < ci->i_max_size) ret = 1; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1288,12 +1484,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) */ void ceph_queue_writeback(struct inode *inode) { + ihold(inode); if (queue_work(ceph_inode_to_client(inode)->wb_wq, &ceph_inode(inode)->i_wb_work)) { dout("ceph_queue_writeback %p\n", inode); - igrab(inode); } else { dout("ceph_queue_writeback %p failed\n", inode); + iput(inode); } } @@ -1313,55 +1510,13 @@ static void ceph_writeback_work(struct work_struct *work) */ void ceph_queue_invalidate(struct inode *inode) { + ihold(inode); if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, &ceph_inode(inode)->i_pg_inv_work)) { dout("ceph_queue_invalidate %p\n", inode); - igrab(inode); } else { dout("ceph_queue_invalidate %p failed\n", inode); - } -} - -/* - * invalidate any pages that are not dirty or under writeback. this - * includes pages that are clean and mapped. - */ -static void ceph_invalidate_nondirty_pages(struct address_space *mapping) -{ - struct pagevec pvec; - pgoff_t next = 0; - int i; - - pagevec_init(&pvec, 0); - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - pgoff_t index; - int skip_page = - (PageDirty(page) || PageWriteback(page)); - - if (!skip_page) - skip_page = !trylock_page(page); - - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. - */ - index = page->index; - if (index > next) - next = index; - next++; - - if (skip_page) - continue; - - generic_error_remove_page(mapping, page); - unlock_page(page); - } - pagevec_release(&pvec); - cond_resched(); + iput(inode); } } @@ -1377,44 +1532,47 @@ static void ceph_invalidate_work(struct work_struct *work) u32 orig_gen; int check = 0; - spin_lock(&inode->i_lock); + mutex_lock(&ci->i_truncate_mutex); + spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); - if (ci->i_rdcache_gen == 0 || - ci->i_rdcache_revoking != ci->i_rdcache_gen) { - BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen); - /* nevermind! */ - ci->i_rdcache_revoking = 0; - spin_unlock(&inode->i_lock); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); goto out; } orig_gen = ci->i_rdcache_gen; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); - ceph_invalidate_nondirty_pages(inode->i_mapping); + truncate_pagecache(inode, 0); - spin_lock(&inode->i_lock); - if (orig_gen == ci->i_rdcache_gen) { + spin_lock(&ci->i_ceph_lock); + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { dout("invalidate_pages %p gen %d successful\n", inode, ci->i_rdcache_gen); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + ci->i_rdcache_revoking--; check = 1; } else { - dout("invalidate_pages %p gen %d raced, gen now %d\n", - inode, orig_gen, ci->i_rdcache_gen); + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); + if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE)) + check = 1; } - spin_unlock(&inode->i_lock); - + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); +out: if (check) ceph_check_caps(ci, 0, NULL); -out: iput(inode); } /* - * called by trunc_wq; take i_mutex ourselves + * called by trunc_wq; * * We also truncate in a separate thread as well. */ @@ -1425,9 +1583,7 @@ static void ceph_vmtruncate_work(struct work_struct *work) struct inode *inode = &ci->vfs_inode; dout("vmtruncate_work %p\n", inode); - mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - mutex_unlock(&inode->i_mutex); iput(inode); } @@ -1439,19 +1595,19 @@ void ceph_queue_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + ihold(inode); + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); - igrab(inode); } else { dout("ceph_queue_vmtruncate %p failed, pending=%d\n", inode, ci->i_truncate_pending); + iput(inode); } } /* - * called with i_mutex held. - * * Make sure any pending truncation is applied before doing anything * that may depend on it. */ @@ -1459,13 +1615,15 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); u64 to; - int wrbuffer_refs, wake = 0; + int wrbuffer_refs, finish = 0; + mutex_lock(&ci->i_truncate_mutex); retry: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { dout("__do_pending_vmtruncate %p none pending\n", inode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); + mutex_unlock(&ci->i_truncate_mutex); return; } @@ -1476,32 +1634,39 @@ retry: if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { dout("__do_pending_vmtruncate %p flushing snaps first\n", inode); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; } + /* there should be no reader or writer */ + WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); + to = ci->i_truncate_size; wrbuffer_refs = ci->i_wrbuffer_ref; dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, ci->i_truncate_pending, to); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); - truncate_inode_pages(inode->i_mapping, to); + truncate_pagecache(inode, to); - spin_lock(&inode->i_lock); - ci->i_truncate_pending--; - if (ci->i_truncate_pending == 0) - wake = 1; - spin_unlock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); + if (to == ci->i_truncate_size) { + ci->i_truncate_pending = 0; + finish = 1; + } + spin_unlock(&ci->i_ceph_lock); + if (!finish) + goto retry; + + mutex_unlock(&ci->i_truncate_mutex); if (wrbuffer_refs == 0) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); - if (wake) - wake_up(&ci->i_cap_wq); -} + wake_up_all(&ci->i_cap_wq); +} /* * symlinks @@ -1516,6 +1681,12 @@ static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) static const struct inode_operations ceph_symlink_iops = { .readlink = generic_readlink, .follow_link = ceph_sym_follow_link, + .setattr = ceph_setattr, + .getattr = ceph_getattr, + .setxattr = ceph_setxattr, + .getxattr = ceph_getxattr, + .listxattr = ceph_listxattr, + .removexattr = ceph_removexattr, }; /* @@ -1525,20 +1696,18 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; + int inode_dirty_flags = 0; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - __ceph_do_pending_vmtruncate(inode); - err = inode_change_ok(inode, attr); if (err != 0) return err; @@ -1548,32 +1717,36 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (IS_ERR(req)) return PTR_ERR(req); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); if (ia_valid & ATTR_UID) { dout("setattr %p uid %d -> %d\n", inode, - inode->i_uid, attr->ia_uid); + from_kuid(&init_user_ns, inode->i_uid), + from_kuid(&init_user_ns, attr->ia_uid)); if (issued & CEPH_CAP_AUTH_EXCL) { inode->i_uid = attr->ia_uid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - attr->ia_uid != inode->i_uid) { - req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid); + !uid_eq(attr->ia_uid, inode->i_uid)) { + req->r_args.setattr.uid = cpu_to_le32( + from_kuid(&init_user_ns, attr->ia_uid)); mask |= CEPH_SETATTR_UID; release |= CEPH_CAP_AUTH_SHARED; } } if (ia_valid & ATTR_GID) { dout("setattr %p gid %d -> %d\n", inode, - inode->i_gid, attr->ia_gid); + from_kgid(&init_user_ns, inode->i_gid), + from_kgid(&init_user_ns, attr->ia_gid)); if (issued & CEPH_CAP_AUTH_EXCL) { inode->i_gid = attr->ia_gid; dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || - attr->ia_gid != inode->i_gid) { - req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid); + !gid_eq(attr->ia_gid, inode->i_gid)) { + req->r_args.setattr.gid = cpu_to_le32( + from_kgid(&init_user_ns, attr->ia_gid)); mask |= CEPH_SETATTR_GID; release |= CEPH_CAP_AUTH_SHARED; } @@ -1586,6 +1759,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dirtied |= CEPH_CAP_AUTH_EXCL; } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || attr->ia_mode != inode->i_mode) { + inode->i_mode = attr->ia_mode; req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode); mask |= CEPH_SETATTR_MODE; release |= CEPH_CAP_AUTH_SHARED; @@ -1691,28 +1865,40 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dout("setattr %p ATTR_FILE ... hrm!\n", inode); if (dirtied) { - __ceph_mark_dirty_caps(ci, dirtied); + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); inode->i_ctime = CURRENT_TIME; } release &= issued; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); + + if (inode_dirty_flags) + __mark_inode_dirty(inode, inode_dirty_flags); + + if (ia_valid & ATTR_MODE) { + err = posix_acl_chmod(inode, attr->ia_mode); + if (err) + goto out_put; + } if (mask) { - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, parent_inode, req); + err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - __ceph_do_pending_vmtruncate(inode); + if (mask & CEPH_SETATTR_SIZE) + __ceph_do_pending_vmtruncate(inode); return err; out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); +out_put: ceph_mdsc_put_request(req); return err; } @@ -1723,8 +1909,8 @@ out: */ int ceph_do_getattr(struct inode *inode, int mask) { - struct ceph_client *client = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; int err; @@ -1733,14 +1919,15 @@ int ceph_do_getattr(struct inode *inode, int mask) return 0; } - dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_num_caps = 1; req->r_args.getattr.mask = cpu_to_le32(mask); err = ceph_mdsc_do_request(mdsc, NULL, req); @@ -1756,10 +1943,15 @@ int ceph_do_getattr(struct inode *inode, int mask) */ int ceph_permission(struct inode *inode, int mask) { - int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); + int err; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); if (!err) - err = generic_permission(inode, mask, NULL); + err = generic_permission(inode, mask); return err; } @@ -1777,13 +1969,17 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); if (!err) { generic_fillattr(inode, stat); - stat->ino = inode->i_ino; + stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); if (ceph_snap(inode) != CEPH_NOSNAP) stat->dev = ceph_snap(inode); else stat->dev = 0; if (S_ISDIR(inode->i_mode)) { - stat->size = ci->i_rbytes; + if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), + RBYTES)) + stat->size = ci->i_rbytes; + else + stat->size = ci->i_files + ci->i_subdirs; stat->blocks = 0; stat->blksize = 65536; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index d085f07756b..a822a6e5829 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -1,8 +1,9 @@ +#include <linux/ceph/ceph_debug.h> #include <linux/in.h> -#include "ioctl.h" #include "super.h" -#include "ceph_debug.h" +#include "mds_client.h" +#include "ioctl.h" /* @@ -14,18 +15,17 @@ */ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) { - struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); + struct ceph_inode_info *ci = ceph_inode(file_inode(file)); struct ceph_ioctl_layout l; int err; - err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); if (!err) { l.stripe_unit = ceph_file_layout_su(ci->i_layout); l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); l.object_size = ceph_file_layout_object_size(ci->i_layout); l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); - l.preferred_osd = - (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); + l.preferred_osd = (s32)-1; if (copy_to_user(arg, &l, sizeof(l))) return -EFAULT; } @@ -33,45 +33,84 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) return err; } +static long __validate_layout(struct ceph_mds_client *mdsc, + struct ceph_ioctl_layout *l) +{ + int i, err; + + /* validate striping parameters */ + if ((l->object_size & ~PAGE_MASK) || + (l->stripe_unit & ~PAGE_MASK) || + (l->stripe_unit != 0 && + ((unsigned)l->object_size % (unsigned)l->stripe_unit))) + return -EINVAL; + + /* make sure it's a valid data pool */ + mutex_lock(&mdsc->mutex); + err = -EINVAL; + for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) + if (mdsc->mdsmap->m_data_pg_pools[i] == l->data_pool) { + err = 0; + break; + } + mutex_unlock(&mdsc->mutex); + if (err) + return err; + + return 0; +} + static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { - struct inode *inode = file->f_dentry->d_inode; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct inode *inode = file_inode(file); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; - int err, i; + struct ceph_inode_info *ci = ceph_inode(file_inode(file)); + struct ceph_ioctl_layout nl; + int err; - /* copy and validate */ if (copy_from_user(&l, arg, sizeof(l))) return -EFAULT; - if ((l.object_size & ~PAGE_MASK) || - (l.stripe_unit & ~PAGE_MASK) || - !l.stripe_unit || - (l.object_size && - (unsigned)l.object_size % (unsigned)l.stripe_unit)) - return -EINVAL; + /* validate changed params against current layout */ + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); + if (err) + return err; - /* make sure it's a valid data pool */ - if (l.data_pool > 0) { - mutex_lock(&mdsc->mutex); - err = -EINVAL; - for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++) - if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) { - err = 0; - break; - } - mutex_unlock(&mdsc->mutex); - if (err) - return err; - } + memset(&nl, 0, sizeof(nl)); + if (l.stripe_count) + nl.stripe_count = l.stripe_count; + else + nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); + if (l.stripe_unit) + nl.stripe_unit = l.stripe_unit; + else + nl.stripe_unit = ceph_file_layout_su(ci->i_layout); + if (l.object_size) + nl.object_size = l.object_size; + else + nl.object_size = ceph_file_layout_object_size(ci->i_layout); + if (l.data_pool) + nl.data_pool = l.data_pool; + else + nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout); + + /* this is obsolete, and always -1 */ + nl.preferred_osd = le64_to_cpu(-1); + + err = __validate_layout(mdsc, &nl); + if (err) + return err; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -81,10 +120,53 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) req->r_args.setlayout.layout.fl_object_size = cpu_to_le32(l.object_size); req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); - req->r_args.setlayout.layout.fl_pg_preferred = - cpu_to_le32(l.preferred_osd); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); + return err; +} + +/* + * Set a layout policy on a directory inode. All items in the tree + * rooted at this inode will inherit this layout on creation, + * (It doesn't apply retroactively ) + * unless a subdirectory has its own layout policy. + */ +static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) +{ + struct inode *inode = file_inode(file); + struct ceph_mds_request *req; + struct ceph_ioctl_layout l; + int err; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + + /* copy and validate */ + if (copy_from_user(&l, arg, sizeof(l))) + return -EFAULT; + + err = __validate_layout(mdsc, &l); + if (err) + return err; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT, + USE_AUTH_MDS); + + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + + req->r_args.setlayout.layout.fl_stripe_unit = + cpu_to_le32(l.stripe_unit); + req->r_args.setlayout.layout.fl_stripe_count = + cpu_to_le32(l.stripe_count); + req->r_args.setlayout.layout.fl_object_size = + cpu_to_le32(l.object_size); + req->r_args.setlayout.layout.fl_pg_pool = + cpu_to_le32(l.data_pool); + + err = ceph_mdsc_do_request(mdsc, inode, req); ceph_mdsc_put_request(req); return err; } @@ -96,21 +178,29 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) { struct ceph_ioctl_dataloc dl; - struct inode *inode = file->f_dentry->d_inode; + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = + &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + struct ceph_object_id oid; u64 len = 1, olen; u64 tmp; - struct ceph_object_layout ol; struct ceph_pg pgid; + int r; /* copy and validate */ if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; down_read(&osdc->map_sem); - ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, - &dl.object_no, &dl.object_offset, &olen); + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, + &dl.object_no, &dl.object_offset, + &olen); + if (r < 0) { + up_read(&osdc->map_sem); + return -EIO; + } dl.file_offset -= dl.object_offset; dl.object_size = ceph_file_layout_object_size(ci->i_layout); dl.block_size = ceph_file_layout_su(ci->i_layout); @@ -121,10 +211,16 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, - osdc->osdmap); - pgid = ol.ol_pgid; + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + ceph_oid_set_name(&oid, dl.object_name); + + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + if (r < 0) { + up_read(&osdc->map_sem); + return r; + } + dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = @@ -143,6 +239,35 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return 0; } +static long ceph_ioctl_lazyio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + + if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { + spin_lock(&ci->i_ceph_lock); + ci->i_nr_by_mode[fi->fmode]--; + fi->fmode |= CEPH_FILE_MODE_LAZY; + ci->i_nr_by_mode[fi->fmode]++; + spin_unlock(&ci->i_ceph_lock); + dout("ioctl_layzio: file %p marked lazy\n", file); + + ceph_check_caps(ci, 0, NULL); + } else { + dout("ioctl_layzio: file %p already lazy\n", file); + } + return 0; +} + +static long ceph_ioctl_syncio(struct file *file) +{ + struct ceph_file_info *fi = file->private_data; + + fi->flags |= CEPH_F_SYNC; + return 0; +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); @@ -153,8 +278,18 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_SET_LAYOUT: return ceph_ioctl_set_layout(file, (void __user *)arg); + case CEPH_IOC_SET_LAYOUT_POLICY: + return ceph_ioctl_set_layout_policy(file, (void __user *)arg); + case CEPH_IOC_GET_DATALOC: return ceph_ioctl_get_dataloc(file, (void __user *)arg); + + case CEPH_IOC_LAZYIO: + return ceph_ioctl_lazyio(file); + + case CEPH_IOC_SYNCIO: + return ceph_ioctl_syncio(file); } + return -ENOTTY; } diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index 25e4f1a9d05..c77028afb1e 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -6,10 +6,36 @@ #define CEPH_IOCTL_MAGIC 0x97 -/* just use u64 to align sanely on all archs */ +/* + * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy + * CEPH_IOC_SET_LAYOUT - set file layout + * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy + * + * The file layout specifies how file data is striped over objects in + * the distributed object store, which object pool they belong to (if + * it differs from the default), and an optional 'preferred osd' to + * store them on. + * + * Files get a new layout based on the policy set on the containing + * directory or one of its ancestors. The GET_LAYOUT ioctl will let + * you examine the layout for a file or the policy on a directory. + * + * SET_LAYOUT will let you set a layout on a newly created file. This + * only works immediately after the file is created and before any + * data is written to it. + * + * SET_LAYOUT_POLICY will let you set a layout policy (default layout) + * on a directory that will apply to any new files created in that + * directory (or any child directory that doesn't specify a layout of + * its own). + */ + +/* use u64 to align sanely on all archs */ struct ceph_ioctl_layout { __u64 stripe_unit, stripe_count, object_size; __u64 data_pool; + + /* obsolete. new values ignored, always return -1 */ __s64 preferred_osd; }; @@ -17,8 +43,12 @@ struct ceph_ioctl_layout { struct ceph_ioctl_layout) #define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ struct ceph_ioctl_layout) +#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \ + struct ceph_ioctl_layout) /* + * CEPH_IOC_GET_DATALOC - get location of file data in the cluster + * * Extract identity, address of the OSD and object storing a given * file offset. */ @@ -37,4 +67,34 @@ struct ceph_ioctl_dataloc { #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ struct ceph_ioctl_dataloc) +/* + * CEPH_IOC_LAZYIO - relax consistency + * + * Normally Ceph switches to synchronous IO when multiple clients have + * the file open (and or more for write). Reads and writes bypass the + * page cache and go directly to the OSD. Setting this flag on a file + * descriptor will allow buffered IO for this file in cases where the + * application knows it won't interfere with other nodes (or doesn't + * care). + */ +#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) + +/* + * CEPH_IOC_SYNCIO - force synchronous IO + * + * This ioctl sets a file flag that forces the synchronous IO that + * bypasses the page cache, even if it is not necessary. This is + * essentially the opposite behavior of IOC_LAZYIO. This forces the + * same read/write path as a file opened by multiple clients when one + * or more of those clients is opened for write. + * + * Note that this type of sync IO takes a different path than a file + * opened with O_SYNC/D_SYNC (writes hit the page cache and are + * immediately flushed on page boundaries). It is very similar to + * O_DIRECT (writes bypass the page cache) excep that O_DIRECT writes + * are not copied (user page must remain stable) and O_DIRECT writes + * have alignment restrictions (on the buffer and file offset). + */ +#define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) + #endif diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c new file mode 100644 index 00000000000..fbc39c47bac --- /dev/null +++ b/fs/ceph/locks.c @@ -0,0 +1,332 @@ +#include <linux/ceph/ceph_debug.h> + +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/random.h> + +#include "super.h" +#include "mds_client.h" +#include <linux/ceph/pagelist.h> + +static u64 lock_secret; + +static inline u64 secure_addr(void *addr) +{ + u64 v = lock_secret ^ (u64)(unsigned long)addr; + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. (old code uses + * both 'owner' and 'pid') + */ + v |= (1ULL << 63); + return v; +} + +void __init ceph_flock_init(void) +{ + get_random_bytes(&lock_secret, sizeof(lock_secret)); +} + +/** + * Implement fcntl and flock locking functions. + */ +static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, + int cmd, u8 wait, struct file_lock *fl) +{ + struct inode *inode = file_inode(file); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_request *req; + int err; + u64 length = 0; + u64 owner; + + req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); + if (IS_ERR(req)) + return PTR_ERR(req); + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + + /* mds requires start and length rather than start and end */ + if (LLONG_MAX == fl->fl_end) + length = 0; + else + length = fl->fl_end - fl->fl_start + 1; + + owner = secure_addr(fl->fl_owner); + + dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " + "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, + (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, + wait, fl->fl_type); + + req->r_args.filelock_change.rule = lock_type; + req->r_args.filelock_change.type = cmd; + req->r_args.filelock_change.owner = cpu_to_le64(owner); + req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); + req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); + req->r_args.filelock_change.length = cpu_to_le64(length); + req->r_args.filelock_change.wait = wait; + + err = ceph_mdsc_do_request(mdsc, inode, req); + + if (operation == CEPH_MDS_OP_GETFILELOCK) { + fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_RDLCK; + else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_WRLCK; + else + fl->fl_type = F_UNLCK; + + fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); + length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + + le64_to_cpu(req->r_reply_info.filelock_reply->length); + if (length >= 1) + fl->fl_end = length -1; + else + fl->fl_end = 0; + + } + ceph_mdsc_put_request(req); + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " + "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type, err); + return err; +} + +/** + * Attempt to set an fcntl lock. + * For now, this just goes away to the server. Later it may be more awesome. + */ +int ceph_lock(struct file *file, int cmd, struct file_lock *fl) +{ + u8 lock_cmd; + int err; + u8 wait = 0; + u16 op = CEPH_MDS_OP_SETFILELOCK; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_lock, fl_owner: %p", fl->fl_owner); + + /* set wait bit as appropriate, then make command as Ceph expects it*/ + if (IS_GETLK(cmd)) + op = CEPH_MDS_OP_GETFILELOCK; + else if (IS_SETLKW(cmd)) + wait = 1; + + if (F_RDLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); + if (!err) { + if (op != CEPH_MDS_OP_GETFILELOCK) { + dout("mds locked, locking locally"); + err = posix_lock_file(file, fl, NULL); + if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + /* undo! This should only happen if + * the kernel detects local + * deadlock. */ + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on posix_lock_file, undid lock", + err); + } + } + + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); + } + return err; +} + +int ceph_flock(struct file *file, int cmd, struct file_lock *fl) +{ + u8 lock_cmd; + int err; + u8 wait = 0; + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + /* No mandatory locks */ + if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + dout("ceph_flock, fl_file: %p", fl->fl_file); + + if (IS_SETLKW(cmd)) + wait = 1; + + if (F_RDLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->fl_type) + lock_cmd = CEPH_LOCK_EXCL; + else + lock_cmd = CEPH_LOCK_UNLOCK; + + err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, + file, lock_cmd, wait, fl); + if (!err) { + err = flock_lock_file_wait(file, fl); + if (err) { + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on flock_lock_file_wait, undid lock", err); + } + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); + } + return err; +} + +/** + * Must be called with lock_flocks() already held. Fills in the passed + * counter variables, so you can prepare pagelist metadata before calling + * ceph_encode_locks. + */ +void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) +{ + struct file_lock *lock; + + *fcntl_count = 0; + *flock_count = 0; + + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_POSIX) + ++(*fcntl_count); + else if (lock->fl_flags & FL_FLOCK) + ++(*flock_count); + } + dout("counted %d flock locks and %d fcntl locks", + *flock_count, *fcntl_count); +} + +/** + * Encode the flock and fcntl locks for the given inode into the ceph_filelock + * array. Must be called with inode->i_lock already held. + * If we encounter more of a specific lock type than expected, return -ENOSPC. + */ +int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, int num_flock_locks) +{ + struct file_lock *lock; + int err = 0; + int seen_fcntl = 0; + int seen_flock = 0; + int l = 0; + + dout("encoding %d flock and %d fcntl locks", num_flock_locks, + num_fcntl_locks); + + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_POSIX) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; + } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; + } + } + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { + if (lock->fl_flags & FL_FLOCK) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; + } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; + } + } +fail: + return err; +} + +/** + * Copy the encoded flock and fcntl locks into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Returns zero on success. + */ +int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + int err = 0; + __le32 nlocks; + + nlocks = cpu_to_le32(num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + + nlocks = cpu_to_le32(num_flock_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, + &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); +out_fail: + return err; +} + +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +int lock_to_ceph_filelock(struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + int err = 0; + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d", lock->fl_type); + err = -EINVAL; + } + + return err; +} diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 416c08d315d..92a2548278f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1,16 +1,22 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> +#include <linux/fs.h> #include <linux/wait.h> #include <linux/slab.h> +#include <linux/gfp.h> #include <linux/sched.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> -#include "mds_client.h" -#include "mon_client.h" #include "super.h" -#include "messenger.h" -#include "decode.h" -#include "auth.h" -#include "pagelist.h" +#include "mds_client.h" + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/pagelist.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> /* * A cluster of MDS (metadata server) daemons is responsible for @@ -37,6 +43,12 @@ * are no longer valid. */ +struct ceph_reconnect_state { + int nr_caps; + struct ceph_pagelist *pagelist; + bool flock; +}; + static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); @@ -51,7 +63,8 @@ static const struct ceph_connection_operations mds_con_ops; * parse individual inode info */ static int parse_reply_info_in(void **p, void *end, - struct ceph_mds_reply_info_in *info) + struct ceph_mds_reply_info_in *info, + u64 features) { int err = -EIO; @@ -65,6 +78,12 @@ static int parse_reply_info_in(void **p, void *end, info->symlink = *p; *p += info->symlink_len; + if (features & CEPH_FEATURE_DIRLAYOUTHASH) + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); + else + memset(&info->dir_layout, 0, sizeof(info->dir_layout)); + ceph_decode_32_safe(p, end, info->xattr_len, bad); ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; @@ -79,12 +98,13 @@ bad: * target inode. */ static int parse_reply_info_trace(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + u64 features) { int err; if (info->head->is_dentry) { - err = parse_reply_info_in(p, end, &info->diri); + err = parse_reply_info_in(p, end, &info->diri, features); if (err < 0) goto out_bad; @@ -105,7 +125,7 @@ static int parse_reply_info_trace(void **p, void *end, } if (info->head->is_target) { - err = parse_reply_info_in(p, end, &info->targeti); + err = parse_reply_info_in(p, end, &info->targeti, features); if (err < 0) goto out_bad; } @@ -125,7 +145,8 @@ out_bad: * parse readdir results */ static int parse_reply_info_dir(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + u64 features) { u32 num, i = 0; int err; @@ -145,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end, if (num == 0) goto done; - /* alloc large array */ - info->dir_nr = num; - info->dir_in = kcalloc(num, sizeof(*info->dir_in) + - sizeof(*info->dir_dname) + - sizeof(*info->dir_dname_len) + - sizeof(*info->dir_dlease), - GFP_NOFS); - if (info->dir_in == NULL) { - err = -ENOMEM; - goto out_bad; - } + BUG_ON(!info->dir_in); info->dir_dname = (void *)(info->dir_in + num); info->dir_dname_len = (void *)(info->dir_dname + num); info->dir_dlease = (void *)(info->dir_dname_len + num); + if ((unsigned long)(info->dir_dlease + num) > + (unsigned long)info->dir_in + info->dir_buf_size) { + pr_err("dir contents are larger than expected\n"); + WARN_ON(1); + goto bad; + } + info->dir_nr = num; while (num) { /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); @@ -173,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end, *p += sizeof(struct ceph_mds_reply_lease); /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i]); + err = parse_reply_info_in(p, end, &info->dir_in[i], features); if (err < 0) goto out_bad; i++; @@ -193,10 +211,74 @@ out_bad: } /* + * parse fcntl F_GETLK results + */ +static int parse_reply_info_filelock(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + if (*p + sizeof(*info->filelock_reply) > end) + goto bad; + + info->filelock_reply = *p; + *p += sizeof(*info->filelock_reply); + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* + * parse create results + */ +static int parse_reply_info_create(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { + if (*p == end) { + info->has_create_ino = false; + } else { + info->has_create_ino = true; + info->ino = ceph_decode_64(p); + } + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* + * parse extra results + */ +static int parse_reply_info_extra(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + return parse_reply_info_filelock(p, end, info, features); + else if (info->head->op == CEPH_MDS_OP_READDIR || + info->head->op == CEPH_MDS_OP_LSSNAP) + return parse_reply_info_dir(p, end, info, features); + else if (info->head->op == CEPH_MDS_OP_CREATE) + return parse_reply_info_create(p, end, info, features); + else + return -EIO; +} + +/* * parse entire mds reply */ static int parse_reply_info(struct ceph_msg *msg, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + u64 features) { void *p, *end; u32 len; @@ -209,15 +291,17 @@ static int parse_reply_info(struct ceph_msg *msg, /* trace */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_trace(&p, p+len, info); + ceph_decode_need(&p, end, len, bad); + err = parse_reply_info_trace(&p, p+len, info, features); if (err < 0) goto out_bad; } - /* dir content */ + /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_dir(&p, p+len, info); + ceph_decode_need(&p, end, len, bad); + err = parse_reply_info_extra(&p, p+len, info, features); if (err < 0) goto out_bad; } @@ -241,7 +325,9 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - kfree(info->dir_in); + if (!info->dir_in) + return; + free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); } @@ -279,9 +365,10 @@ void ceph_put_mds_session(struct ceph_mds_session *s) dout("mdsc put_session %p %d -> %d\n", s, atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); if (atomic_dec_and_test(&s->s_ref)) { - if (s->s_authorizer) - s->s_mdsc->client->monc.auth->ops->destroy_authorizer( - s->s_mdsc->client->monc.auth, s->s_authorizer); + if (s->s_auth.authorizer) + ceph_auth_destroy_authorizer( + s->s_mdsc->fsc->client->monc.auth, + s->s_auth.authorizer); kfree(s); } } @@ -328,6 +415,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; + if (mds >= mdsc->mdsmap->m_max_mds) + return ERR_PTR(-EINVAL); + s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) return ERR_PTR(-ENOMEM); @@ -338,15 +428,13 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_seq = 0; mutex_init(&s->s_mutex); - ceph_con_init(mdsc->client->msgr, &s->s_con); - s->s_con.private = s; - s->s_con.ops = &mds_con_ops; - s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; - s->s_con.peer_name.num = cpu_to_le64(mds); + ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); - spin_lock_init(&s->s_cap_lock); + spin_lock_init(&s->s_gen_ttl_lock); s->s_cap_gen = 0; - s->s_cap_ttl = 0; + s->s_cap_ttl = jiffies - 1; + + spin_lock_init(&s->s_cap_lock); s->s_renew_requested = 0; s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); @@ -356,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); s->s_num_cap_releases = 0; + s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); INIT_LIST_HEAD(&s->s_cap_releases_done); @@ -382,7 +471,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, mdsc->sessions[mds] = s; atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ - ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); return s; @@ -422,34 +512,38 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); + destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_reply) { + if (req->r_reply) ceph_msg_put(req->r_reply); - destroy_reply_info(&req->r_reply_info); - } if (req->r_inode) { - ceph_put_cap_refs(ceph_inode(req->r_inode), - CEPH_CAP_PIN); + ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); iput(req->r_inode); } if (req->r_locked_dir) - ceph_put_cap_refs(ceph_inode(req->r_locked_dir), - CEPH_CAP_PIN); + ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); if (req->r_target_inode) iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); - if (req->r_old_dentry) { - ceph_put_cap_refs( - ceph_inode(req->r_old_dentry->d_parent->d_inode), - CEPH_CAP_PIN); + if (req->r_old_dentry) dput(req->r_old_dentry); + if (req->r_old_dentry_dir) { + /* + * track (and drop pins for) r_old_dentry_dir + * separately, since r_old_dentry's d_parent may have + * changed between the dir mutex being dropped and + * this request being freed. + */ + ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); + iput(req->r_old_dentry_dir); } kfree(req->r_path1); kfree(req->r_path2); put_request_session(req); - ceph_unreserve_caps(&req->r_caps_reservation); + ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); kfree(req); } @@ -512,14 +606,19 @@ static void __register_request(struct ceph_mds_client *mdsc, { req->r_tid = ++mdsc->last_tid; if (req->r_num_caps) - ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); + ceph_reserve_caps(mdsc, &req->r_caps_reservation, + req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); __insert_request(mdsc, req); + req->r_uid = current_fsuid(); + req->r_gid = current_fsgid(); + if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); + ihold(dir); spin_lock(&ci->i_unsafe_lock); req->r_unsafe_dir = dir; list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); @@ -540,8 +639,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc, spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); spin_unlock(&ci->i_unsafe_lock); + + iput(req->r_unsafe_dir); + req->r_unsafe_dir = NULL; } + complete_all(&req->r_safe_completion); + ceph_mdsc_put_request(req); } @@ -553,6 +657,19 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ +static struct dentry *get_nonsnap_parent(struct dentry *dentry) +{ + /* + * we don't need to worry about protecting the d_parent access + * here because we never renaming inside the snapped namespace + * except to resplice to another snapdir, and either the old or new + * result is a valid result. + */ + while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + dentry = dentry->d_parent; + return dentry; +} + static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { @@ -583,14 +700,31 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_inode) { inode = req->r_inode; } else if (req->r_dentry) { - if (req->r_dentry->d_inode) { + /* ignore race with rename; old or new d_parent is okay */ + struct dentry *parent = req->r_dentry->d_parent; + struct inode *dir = parent->d_inode; + + if (dir->i_sb != mdsc->fsc->sb) { + /* not this fs! */ inode = req->r_dentry->d_inode; + } else if (ceph_snap(dir) != CEPH_NOSNAP) { + /* direct snapped/virtual snapdir requests + * based on parent dir inode */ + struct dentry *dn = get_nonsnap_parent(parent); + inode = dn->d_inode; + dout("__choose_mds using nonsnap parent %p\n", inode); } else { - inode = req->r_dentry->d_parent->d_inode; - hash = req->r_dentry->d_name.hash; - is_hash = true; + /* dentry target */ + inode = req->r_dentry->d_inode; + if (!inode || mode == USE_AUTH_MDS) { + /* dir + name */ + inode = dir; + hash = ceph_dentry_hash(dir, req->r_dentry); + is_hash = true; + } } } + dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, (int)hash, mode); if (!inode) @@ -613,9 +747,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc, dout("choose_mds %p %llx.%llx " "frag %u mds%d (%d/%d)\n", inode, ceph_vinop(inode), - frag.frag, frag.mds, + frag.frag, mds, (int)r, frag.ndist); - return mds; + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; } /* since this file/dir wasn't known to be @@ -628,26 +764,28 @@ static int __choose_mds(struct ceph_mds_client *mdsc, dout("choose_mds %p %llx.%llx " "frag %u mds%d (auth)\n", inode, ceph_vinop(inode), frag.frag, mds); - return mds; + if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= + CEPH_MDS_STATE_ACTIVE) + return mds; } } } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap = NULL; if (mode == USE_AUTH_MDS) cap = ci->i_auth_cap; if (!cap && !RB_EMPTY_ROOT(&ci->i_caps)) cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); if (!cap) { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); goto random; } mds = cap->session->s_mds; dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", inode, ceph_vinop(inode), mds, cap == ci->i_auth_cap ? "auth " : "", cap); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return mds; random: @@ -665,7 +803,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) struct ceph_msg *msg; struct ceph_mds_session_head *h; - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, + false); if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); return NULL; @@ -704,6 +843,72 @@ static int __open_session(struct ceph_mds_client *mdsc, } /* + * open sessions for any export targets for the given mds + * + * called under mdsc->mutex + */ +static struct ceph_mds_session * +__open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + session = __ceph_lookup_mds_session(mdsc, target); + if (!session) { + session = register_session(mdsc, target); + if (IS_ERR(session)) + return session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || + session->s_state == CEPH_MDS_SESSION_CLOSING) + __open_session(mdsc, session); + + return session; +} + +struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target) +{ + struct ceph_mds_session *session; + + dout("open_export_target_session to mds%d\n", target); + + mutex_lock(&mdsc->mutex); + session = __open_export_target_session(mdsc, target); + mutex_unlock(&mdsc->mutex); + + return session; +} + +static void __open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_mds_info *mi; + struct ceph_mds_session *ts; + int i, mds = session->s_mds; + + if (mds >= mdsc->mdsmap->m_max_mds) + return; + + mi = &mdsc->mdsmap->m_info[mds]; + dout("open_export_target_sessions for mds%d (%d targets)\n", + session->s_mds, mi->num_export_targets); + + for (i = 0; i < mi->num_export_targets; i++) { + ts = __open_export_target_session(mdsc, mi->export_targets[i]); + if (!IS_ERR(ts)) + ceph_put_mds_session(ts); + } +} + +void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&mdsc->mutex); + __open_export_target_sessions(mdsc, session); + mutex_unlock(&mdsc->mutex); +} + +/* * session caps */ @@ -764,7 +969,7 @@ static int iterate_session_caps(struct ceph_mds_session *session, last_inode = NULL; } if (old_cap) { - ceph_put_cap(old_cap); + ceph_put_cap(session->s_mdsc, old_cap); old_cap = NULL; } @@ -793,7 +998,7 @@ out: if (last_inode) iput(last_inode); if (old_cap) - ceph_put_cap(old_cap); + ceph_put_cap(session->s_mdsc, old_cap); return ret; } @@ -806,11 +1011,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); - spin_lock(&inode->i_lock); - __ceph_remove_cap(cap); + spin_lock(&ci->i_ceph_lock); + __ceph_remove_cap(cap, false); if (!__ceph_is_any_real_caps(ci)) { struct ceph_mds_client *mdsc = - &ceph_sb_to_client(inode->i_sb)->mdsc; + ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) { @@ -839,7 +1044,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, } spin_unlock(&mdsc->cap_dirty_lock); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); while (drop--) iput(inode); return 0; @@ -852,6 +1057,37 @@ static void remove_session_caps(struct ceph_mds_session *session) { dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, NULL); + + spin_lock(&session->s_cap_lock); + if (session->s_nr_caps > 0) { + struct super_block *sb = session->s_mdsc->fsc->sb; + struct inode *inode; + struct ceph_cap *cap, *prev = NULL; + struct ceph_vino vino; + /* + * iterate_session_caps() skips inodes that are being + * deleted, we need to wait until deletions are complete. + * __wait_on_freeing_inode() is designed for the job, + * but it is not exported, so use lookup inode function + * to access it. + */ + while (!list_empty(&session->s_caps)) { + cap = list_entry(session->s_caps.next, + struct ceph_cap, session_caps); + if (cap == prev) + break; + prev = cap; + vino = cap->ci->i_vino; + spin_unlock(&session->s_cap_lock); + + inode = ceph_find_inode(sb, vino); + iput(inode); + + spin_lock(&session->s_cap_lock); + } + } + spin_unlock(&session->s_cap_lock); + BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); cleanup_cap_releases(session); @@ -868,12 +1104,12 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up(&ci->i_cap_wq); + wake_up_all(&ci->i_cap_wq); if (arg) { - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } return 0; } @@ -922,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, return 0; } +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, u64 seq) +{ + struct ceph_msg *msg; + + dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", + session->s_mds, session_state_name(session->s_state), seq); + msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + + /* * Note new cap ttl, and any transition from stale -> not stale (fresh?). * @@ -934,8 +1185,7 @@ static void renewed_caps(struct ceph_mds_client *mdsc, int wake = 0; spin_lock(&session->s_cap_lock); - was_stale = is_renew && (session->s_cap_ttl == 0 || - time_after_eq(jiffies, session->s_cap_ttl)); + was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl); session->s_cap_ttl = session->s_renew_requested + mdsc->mdsmap->m_session_timeout*HZ; @@ -1001,31 +1251,36 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_mds_session *session = arg; struct ceph_inode_info *ci = ceph_inode(inode); - int used, oissued, mine; + int used, wanted, oissued, mine; if (session->s_trim_caps <= 0) return -1; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); mine = cap->issued | cap->implemented; used = __ceph_caps_used(ci); + wanted = __ceph_caps_file_wanted(ci); oissued = __ceph_caps_issued_other(ci, cap); - dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n", + dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n", inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), - ceph_cap_string(used)); - if (ci->i_dirty_caps) - goto out; /* dirty caps */ - if ((used & ~oissued) & mine) + ceph_cap_string(used), ceph_cap_string(wanted)); + if (cap == ci->i_auth_cap) { + if (ci->i_dirty_caps | ci->i_flushing_caps) + goto out; + if ((used | wanted) & CEPH_CAP_ANY_WR) + goto out; + } + if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ - __ceph_remove_cap(cap); + __ceph_remove_cap(cap, true); } else { /* try to drop referring dentries */ - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); d_prune_aliases(inode); dout("trim_caps_cb %p cap %p pruned, count now %d\n", inode, cap, atomic_read(&inode->i_count)); @@ -1033,7 +1288,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return 0; } @@ -1056,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc, trim_caps - session->s_trim_caps); session->s_trim_caps = 0; } + + ceph_add_cap_releases(mdsc, session); + ceph_send_cap_releases(mdsc, session); return 0; } @@ -1067,15 +1325,16 @@ static int trim_caps(struct ceph_mds_client *mdsc, * Called under s_mutex. */ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - int extra) + struct ceph_mds_session *session) { - struct ceph_msg *msg; + struct ceph_msg *msg, *partial = NULL; struct ceph_mds_cap_release *head; int err = -ENOMEM; + int extra = mdsc->fsc->mount_options->cap_release_safety; + int num; - if (extra < 0) - extra = mdsc->client->mount_args->cap_release_safety; + dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, + extra); spin_lock(&session->s_cap_lock); @@ -1084,13 +1343,18 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg, list_head); head = msg->front.iov_base; - extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); + num = le32_to_cpu(head->num); + if (num) { + dout(" partial %p with (%d/%d)\n", msg, num, + (int)CEPH_CAPS_PER_RELEASE); + extra += CEPH_CAPS_PER_RELEASE - num; + partial = msg; + } } - while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS); + GFP_NOFS, false); if (!msg) goto out_unlocked; dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1103,19 +1367,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc, session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; } - if (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, - list_head); - head = msg->front.iov_base; - if (head->num) { - dout(" queueing non-full %p (%d)\n", msg, - le32_to_cpu(head->num)); - list_move_tail(&msg->list_head, - &session->s_cap_releases_done); - session->s_num_cap_releases -= - CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); - } + if (partial) { + head = partial->front.iov_base; + num = le32_to_cpu(head->num); + dout(" queueing partial %p with %d/%d\n", partial, num, + (int)CEPH_CAPS_PER_RELEASE); + list_move_tail(&partial->list_head, + &session->s_cap_releases_done); + session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; } err = 0; spin_unlock(&session->s_cap_lock); @@ -1150,7 +1409,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) i_flushing_item); struct inode *inode = &ci->vfs_inode; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_cap_flush_seq <= want_flush_seq) { dout("check_cap_flush still flushing %p " "seq %lld <= %lld to mds%d\n", inode, @@ -1158,7 +1417,7 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) session->s_mds); ret = 0; } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); @@ -1204,16 +1463,19 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, unsigned num; dout("discard_cap_releases mds%d\n", session->s_mds); - spin_lock(&session->s_cap_lock); - /* zero out the in-progress message */ - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); - head->num = cpu_to_le32(0); - session->s_num_cap_releases += num; + if (!list_empty(&session->s_cap_releases)) { + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", + session->s_mds, msg, num); + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + session->s_num_cap_releases += num; + } /* requeue completed messages */ while (!list_empty(&session->s_cap_releases_done)) { @@ -1230,14 +1492,49 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc, msg->front.iov_len = sizeof(*head); list_add(&msg->list_head, &session->s_cap_releases); } - - spin_unlock(&session->s_cap_lock); } /* * requests */ +int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; + size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + + sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + int order, num_entries; + + spin_lock(&ci->i_ceph_lock); + num_entries = ci->i_files + ci->i_subdirs; + spin_unlock(&ci->i_ceph_lock); + num_entries = max(num_entries, 1); + num_entries = min(num_entries, opt->max_readdir); + + order = get_order(size * num_entries); + while (order >= 0) { + rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, + order); + if (rinfo->dir_in) + break; + order--; + } + if (!rinfo->dir_in) + return -ENOMEM; + + num_entries = (PAGE_SIZE << order) / size; + num_entries = min(num_entries, opt->max_readdir); + + rinfo->dir_buf_size = PAGE_SIZE << order; + req->r_num_caps = num_entries + 1; + req->r_args.readdir.max_entries = cpu_to_le32(num_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes); + return 0; +} + /* * Create an mds request. */ @@ -1250,6 +1547,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) return ERR_PTR(-ENOMEM); mutex_init(&req->r_fill_mutex); + req->r_mdsc = mdsc; req->r_started = jiffies; req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); @@ -1260,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); + req->r_stamp = CURRENT_TIME; + req->r_op = op; req->r_direct_mode = mode; return req; @@ -1303,12 +1603,15 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, struct dentry *temp; char *path; int len, pos; + unsigned seq; if (dentry == NULL) return ERR_PTR(-EINVAL); retry: len = 0; + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); for (temp = dentry; !IS_ROOT(temp);) { struct inode *inode = temp->d_inode; if (inode && ceph_snap(inode) == CEPH_SNAPDIR) @@ -1319,11 +1622,8 @@ retry: else len += 1 + temp->d_name.len; temp = temp->d_parent; - if (temp == NULL) { - pr_err("build_path corrupt dentry %p\n", dentry); - return ERR_PTR(-EINVAL); - } } + rcu_read_unlock(); if (len) len--; /* no leading '/' */ @@ -1332,32 +1632,35 @@ retry: return ERR_PTR(-ENOMEM); pos = len; path[pos] = 0; /* trailing null */ + rcu_read_lock(); for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) { - struct inode *inode = temp->d_inode; + struct inode *inode; + spin_lock(&temp->d_lock); + inode = temp->d_inode; if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { dout("build_path path+%d: %p SNAPDIR\n", pos, temp); } else if (stop_on_nosnap && inode && ceph_snap(inode) == CEPH_NOSNAP) { + spin_unlock(&temp->d_lock); break; } else { pos -= temp->d_name.len; - if (pos < 0) + if (pos < 0) { + spin_unlock(&temp->d_lock); break; + } strncpy(path + pos, temp->d_name.name, temp->d_name.len); } + spin_unlock(&temp->d_lock); if (pos) path[--pos] = '/'; temp = temp->d_parent; - if (temp == NULL) { - pr_err("build_path corrupt dentry\n"); - kfree(path); - return ERR_PTR(-EINVAL); - } } - if (pos != 0) { + rcu_read_unlock(); + if (pos != 0 || read_seqretry(&rename_lock, seq)) { pr_err("build_path did not end path lookup where " "expected, namelen is %d, pos is %d\n", len, pos); /* presumably this is only possible if racing with a @@ -1371,7 +1674,7 @@ retry: *base = ceph_ino(temp->d_inode); *plen = len; dout("build_path on %p %d built %llx '%.*s'\n", - dentry, atomic_read(&dentry->d_count), *base, len, path); + dentry, d_count(dentry), *base, len, path); return path; } @@ -1436,10 +1739,10 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); - } else if (rpath) { + } else if (rpath || rino) { *ino = rino; *ppath = rpath; - *pathlen = strlen(rpath); + *pathlen = rpath ? strlen(rpath) : 0; dout(" path %.*s\n", *pathlen, rpath); } @@ -1482,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, } len = sizeof(*head) + - pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + + sizeof(struct timespec); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * @@ -1493,12 +1797,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); goto out_free2; } + msg->hdr.version = 2; msg->hdr.tid = cpu_to_le64(req->r_tid); head = msg->front.iov_base; @@ -1507,8 +1812,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(current_fsuid()); - head->caller_gid = cpu_to_le32(current_fsgid()); + head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); + head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); @@ -1535,12 +1840,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); head->num_releases = cpu_to_le16(releases); + /* time stamp */ + ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - msg->pages = req->r_pages; - msg->nr_pages = req->r_num_pages; + if (req->r_data_len) { + /* outbound data set only by ceph_sync_setxattr() */ + BUG_ON(!req->r_pages); + ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0); + } + msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); @@ -1564,7 +1876,7 @@ static void complete_request(struct ceph_mds_client *mdsc, if (req->r_callback) req->r_callback(mdsc, req); else - complete(&req->r_completion); + complete_all(&req->r_completion); } /* @@ -1578,8 +1890,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, struct ceph_msg *msg; int flags = 0; - req->r_mds = mds; req->r_attempts++; + if (req->r_inode) { + struct ceph_cap *cap = + ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds); + + if (cap) + req->r_sent_on_mseq = cap->mseq; + else + req->r_sent_on_mseq = -1; + } dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); @@ -1646,8 +1966,11 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = -EAGAIN; - if (req->r_err || req->r_got_result) + if (req->r_err || req->r_got_result) { + if (req->r_aborted) + __unregister_request(mdsc, req); goto out; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { @@ -1656,6 +1979,8 @@ static int __do_request(struct ceph_mds_client *mdsc, goto finish; } + put_request_session(req); + mds = __choose_mds(mdsc, req); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { @@ -1673,6 +1998,8 @@ static int __do_request(struct ceph_mds_client *mdsc, goto finish; } } + req->r_session = get_session(session); + dout("do_request mds%d session %p state %s\n", mds, session, session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && @@ -1685,7 +2012,6 @@ static int __do_request(struct ceph_mds_client *mdsc, } /* send request */ - req->r_session = get_session(session); req->r_resend_mds = -1; /* forget any previous mds hint */ if (req->r_request_started == 0) /* note request start time */ @@ -1714,10 +2040,16 @@ finish: static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head) { - struct ceph_mds_request *req, *nreq; + struct ceph_mds_request *req; + LIST_HEAD(tmp_list); - list_for_each_entry_safe(req, nreq, head, r_wait) { + list_splice_init(head, &tmp_list); + + while (!list_empty(&tmp_list)) { + req = list_entry(tmp_list.next, + struct ceph_mds_request, r_wait); list_del_init(&req->r_wait); + dout(" wake request %p tid %llu\n", req, req->r_tid); __do_request(mdsc, req); } } @@ -1739,7 +2071,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) if (req->r_session && req->r_session->s_mds == mds) { dout(" kicking tid %llu\n", req->r_tid); - put_request_session(req); __do_request(mdsc, req); } } @@ -1772,10 +2103,9 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); if (req->r_locked_dir) ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); - if (req->r_old_dentry) - ceph_get_cap_refs( - ceph_inode(req->r_old_dentry->d_parent->d_inode), - CEPH_CAP_PIN); + if (req->r_old_dentry_dir) + ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), + CEPH_CAP_PIN); /* issue */ mutex_lock(&mdsc->mutex); @@ -1833,20 +2163,16 @@ out: } /* - * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * Invalidate dir's completeness, dentry lease state on an aborted MDS * namespace request. */ void ceph_invalidate_dir_request(struct ceph_mds_request *req) { struct inode *inode = req->r_locked_dir; - struct ceph_inode_info *ci = ceph_inode(inode); - dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); - spin_lock(&inode->i_lock); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - spin_unlock(&inode->i_lock); + dout("invalidate_dir_request %p (complete, lease(s))\n", inode); + ceph_dir_clear_complete(inode); if (req->r_dentry) ceph_invalidate_dentry_lease(req->r_dentry); if (req->r_old_dentry) @@ -1899,13 +2225,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* dup? */ if ((req->r_got_unsafe && !head->safe) || (req->r_got_safe && head->safe)) { - pr_warning("got a dup %s reply on %llu from mds%d\n", + pr_warn("got a dup %s reply on %llu from mds%d\n", head->safe ? "safe" : "unsafe", tid, mds); mutex_unlock(&mdsc->mutex); goto out; } if (req->r_got_safe && !head->safe) { - pr_warning("got unsafe after safe on %llu from mds%d\n", + pr_warn("got unsafe after safe on %llu from mds%d\n", tid, mds); mutex_unlock(&mdsc->mutex); goto out; @@ -1914,25 +2240,36 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) result = le32_to_cpu(head->result); /* - * Tolerate 2 consecutive ESTALEs from the same mds. - * FIXME: we should be looking at the cap migrate_seq. + * Handle an ESTALE + * if we're not talking to the authority, send to them + * if the authority has changed while we weren't looking, + * send to new authority + * Otherwise we just have to return an ESTALE */ if (result == -ESTALE) { - req->r_direct_mode = USE_AUTH_MDS; - req->r_num_stale++; - if (req->r_num_stale <= 2) { + dout("got ESTALE on request %llu", req->r_tid); + if (req->r_direct_mode != USE_AUTH_MDS) { + dout("not using auth, setting for that now"); + req->r_direct_mode = USE_AUTH_MDS; __do_request(mdsc, req); mutex_unlock(&mdsc->mutex); goto out; + } else { + int mds = __choose_mds(mdsc, req); + if (mds >= 0 && mds != req->r_session->s_mds) { + dout("but auth changed, so resending"); + __do_request(mdsc, req); + mutex_unlock(&mdsc->mutex); + goto out; + } } - } else { - req->r_num_stale = 0; + dout("have to return ESTALE on request %llu", req->r_tid); } + if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); - complete(&req->r_safe_completion); if (req->r_got_unsafe) { /* @@ -1947,7 +2284,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* last unsafe request during umount? */ if (mdsc->stopping && !__get_oldest_req(mdsc)) - complete(&mdsc->safe_umount_waiters); + complete_all(&mdsc->safe_umount_waiters); mutex_unlock(&mdsc->mutex); goto out; } @@ -1958,12 +2295,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) dout("handle_reply tid %lld result %d\n", tid, result); rinfo = &req->r_reply_info; - err = parse_reply_info(msg, rinfo); + err = parse_reply_info(msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); if (err < 0) { - pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); + pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); ceph_msg_dump(msg); goto out_err; } @@ -1981,11 +2318,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* insert trace into our cache */ mutex_lock(&req->r_fill_mutex); - err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); + err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { - if (result == 0 && rinfo->dir_nr) + if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || + req->r_op == CEPH_MDS_OP_LSSNAP)) ceph_readdir_prepopulate(req, req->r_session); - ceph_unreserve_caps(&req->r_caps_reservation); + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } mutex_unlock(&req->r_fill_mutex); @@ -2005,7 +2343,7 @@ out_err: } mutex_unlock(&mdsc->mutex); - ceph_add_cap_releases(mdsc, req->r_session, -1); + ceph_add_cap_releases(mdsc, req->r_session); mutex_unlock(&session->s_mutex); /* kick calling process */ @@ -2126,17 +2464,17 @@ static void handle_session(struct ceph_mds_session *session, pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ - complete(&mdsc->session_close_waiters); + wake_up_all(&mdsc->session_close_wq); kick_requests(mdsc, mds); break; case CEPH_SESSION_STALE: pr_info("mds%d caps went stale, renewing\n", session->s_mds); - spin_lock(&session->s_cap_lock); + spin_lock(&session->s_gen_ttl_lock); session->s_cap_gen++; - session->s_cap_ttl = 0; - spin_unlock(&session->s_cap_lock); + session->s_cap_ttl = jiffies - 1; + spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; @@ -2144,6 +2482,10 @@ static void handle_session(struct ceph_mds_session *session, trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); break; + case CEPH_SESSION_FLUSHMSG: + send_flushmsg_ack(mdsc, session, seq); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); @@ -2193,9 +2535,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { - struct ceph_mds_cap_reconnect rec; + union { + struct ceph_mds_cap_reconnect v2; + struct ceph_mds_cap_reconnect_v1 v1; + } rec; + size_t reclen; struct ceph_inode_info *ci; - struct ceph_pagelist *pagelist = arg; + struct ceph_reconnect_state *recon_state = arg; + struct ceph_pagelist *pagelist = recon_state->pagelist; char *path; int pathlen, err; u64 pathbase; @@ -2215,7 +2562,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); if (IS_ERR(path)) { err = PTR_ERR(path); - BUG_ON(err); + goto out_dput; } } else { path = NULL; @@ -2223,25 +2570,80 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, } err = ceph_pagelist_encode_string(pagelist, path, pathlen); if (err) - goto out; + goto out_free; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ - rec.cap_id = cpu_to_le64(cap->cap_id); - rec.pathbase = cpu_to_le64(pathbase); - rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); - rec.issued = cpu_to_le32(cap->issued); - rec.size = cpu_to_le64(inode->i_size); - ceph_encode_timespec(&rec.mtime, &inode->i_mtime); - ceph_encode_timespec(&rec.atime, &inode->i_atime); - rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - spin_unlock(&inode->i_lock); - - err = ceph_pagelist_append(pagelist, &rec, sizeof(rec)); + cap->mseq = 0; /* and migrate_seq */ + cap->cap_gen = cap->session->s_cap_gen; + + if (recon_state->flock) { + rec.v2.cap_id = cpu_to_le64(cap->cap_id); + rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v2.issued = cpu_to_le32(cap->issued); + rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v2.pathbase = cpu_to_le64(pathbase); + rec.v2.flock_len = 0; + reclen = sizeof(rec.v2); + } else { + rec.v1.cap_id = cpu_to_le64(cap->cap_id); + rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); + rec.v1.issued = cpu_to_le32(cap->issued); + rec.v1.size = cpu_to_le64(inode->i_size); + ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime); + ceph_encode_timespec(&rec.v1.atime, &inode->i_atime); + rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); + rec.v1.pathbase = cpu_to_le64(pathbase); + reclen = sizeof(rec.v1); + } + spin_unlock(&ci->i_ceph_lock); + + if (recon_state->flock) { + int num_fcntl_locks, num_flock_locks; + struct ceph_filelock *flocks; + +encode_again: + spin_lock(&inode->i_lock); + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + spin_unlock(&inode->i_lock); + flocks = kmalloc((num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + spin_lock(&inode->i_lock); + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + spin_unlock(&inode->i_lock); + if (err) { + kfree(flocks); + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + /* + * number of encoded locks is stable, so copy to pagelist + */ + rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, + num_flock_locks); + kfree(flocks); + } else { + err = ceph_pagelist_append(pagelist, &rec, reclen); + } -out: + recon_state->nr_caps++; +out_free: kfree(path); +out_dput: dput(dentry); return err; } @@ -2266,7 +2668,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct rb_node *p; int mds = session->s_mds; int err = -ENOMEM; + int s_nr_caps; struct ceph_pagelist *pagelist; + struct ceph_reconnect_state recon_state; pr_info("mds%d reconnect start\n", mds); @@ -2275,7 +2679,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail_nopagelist; ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); if (!reply) goto fail_nomsg; @@ -2283,7 +2687,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; + ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, + CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); /* replay unsafe requests */ @@ -2294,17 +2700,38 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, dout("session %p state %s\n", session, session_state_name(session->s_state)); + spin_lock(&session->s_gen_ttl_lock); + session->s_cap_gen++; + spin_unlock(&session->s_gen_ttl_lock); + + spin_lock(&session->s_cap_lock); + /* + * notify __ceph_remove_cap() that we are composing cap reconnect. + * If a cap get released before being added to the cap reconnect, + * __ceph_remove_cap() should skip queuing cap release. + */ + session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ discard_cap_releases(mdsc, session); + spin_unlock(&session->s_cap_lock); /* traverse this session's caps */ - err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); + s_nr_caps = session->s_nr_caps; + err = ceph_pagelist_encode_32(pagelist, s_nr_caps); if (err) goto fail; - err = iterate_session_caps(session, encode_caps_cb, pagelist); + + recon_state.nr_caps = 0; + recon_state.pagelist = pagelist; + recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; + err = iterate_session_caps(session, encode_caps_cb, &recon_state); if (err < 0) goto fail; + spin_lock(&session->s_cap_lock); + session->s_cap_reconnect = 0; + spin_unlock(&session->s_cap_lock); + /* * snaprealms. we provide mds with the ino, seq (version), and * parent for all of our realms. If the mds has any newer info, @@ -2325,9 +2752,20 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, goto fail; } - reply->pagelist = pagelist; + if (recon_state.flock) + reply->hdr.version = cpu_to_le16(2); + + /* raced with cap release? */ + if (s_nr_caps != recon_state.nr_caps) { + struct page *page = list_first_entry(&pagelist->head, + struct page, lru); + __le32 *addr = kmap_atomic(page); + *addr = cpu_to_le32(recon_state.nr_caps); + kunmap_atomic(addr); + } + reply->hdr.data_len = cpu_to_le32(pagelist->length); - reply->nr_pages = calc_pages_for(0, pagelist->length); + ceph_msg_data_add_pagelist(reply, pagelist); ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -2376,12 +2814,15 @@ static void check_new_map(struct ceph_mds_client *mdsc, oldstate = ceph_mdsmap_get_state(oldmap, i); newstate = ceph_mdsmap_get_state(newmap, i); - dout("check_new_map mds%d state %s -> %s (session %s)\n", + dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n", i, ceph_mds_state_name(oldstate), + ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", ceph_mds_state_name(newstate), + ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", session_state_name(s->s_state)); - if (memcmp(ceph_mdsmap_get_addr(oldmap, i), + if (i >= newmap->m_max_mds || + memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { if (s->s_state == CEPH_MDS_SESSION_OPENING) { @@ -2428,6 +2869,21 @@ static void check_new_map(struct ceph_mds_client *mdsc, wake_up_session_caps(s, 1); } } + + for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + s = mdsc->sessions[i]; + if (!s) + continue; + if (!ceph_mdsmap_is_laggy(newmap, i)) + continue; + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG || + s->s_state == CEPH_MDS_SESSION_CLOSING) { + dout(" connecting to export targets of laggy mds%d\n", + i); + __open_export_target_sessions(mdsc, s); + } + } } @@ -2451,16 +2907,14 @@ static void handle_lease(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; struct inode *inode; - struct ceph_inode_info *ci; struct dentry *parent, *dentry; struct ceph_dentry_info *di; int mds = session->s_mds; struct ceph_mds_lease *h = msg->front.iov_base; u32 seq; struct ceph_vino vino; - int mask; struct qstr dname; int release = 0; @@ -2471,7 +2925,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, goto bad; vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; - mask = le16_to_cpu(h->mask); seq = le32_to_cpu(h->seq); dname.name = (void *)h + sizeof(*h) + sizeof(u32); dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); @@ -2483,14 +2936,13 @@ static void handle_lease(struct ceph_mds_client *mdsc, /* lookup inode */ inode = ceph_find_inode(sb, vino); - dout("handle_lease %s, mask %d, ino %llx %p %.*s\n", - ceph_lease_op_name(h->action), mask, vino.ino, inode, + dout("handle_lease %s, ino %llx %p %.*s\n", + ceph_lease_op_name(h->action), vino.ino, inode, dname.len, dname.name); if (inode == NULL) { dout("handle_lease no inode %llx\n", vino.ino); goto release; } - ci = ceph_inode(inode); /* dentry */ parent = d_find_alias(inode); @@ -2509,7 +2961,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, di = ceph_dentry(dentry); switch (h->action) { case CEPH_MDS_LEASE_REVOKE: - if (di && di->lease_session == session) { + if (di->lease_session == session) { if (ceph_seq_cmp(di->lease_seq, seq) > 0) h->seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); @@ -2518,7 +2970,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, break; case CEPH_MDS_LEASE_RENEW: - if (di && di->lease_session == session && + if (di->lease_session == session && di->lease_gen == session->s_cap_gen && di->lease_renew_from && di->lease_renew_after == 0) { @@ -2570,12 +3022,11 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, dnamelen = dentry->d_name.len; len += dnamelen; - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); if (!msg) return; lease = msg->front.iov_base; lease->action = action; - lease->mask = cpu_to_le16(1); lease->ino = cpu_to_le64(ceph_vino(inode).ino); lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); lease->seq = cpu_to_le32(seq); @@ -2597,7 +3048,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, * Pass @inode always, @dentry is optional. */ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dentry, int mask) + struct dentry *dentry) { struct ceph_dentry_info *di; struct ceph_mds_session *session; @@ -2605,7 +3056,6 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, BUG_ON(inode == NULL); BUG_ON(dentry == NULL); - BUG_ON(mask == 0); /* is dentry lease valid? */ spin_lock(&dentry->d_lock); @@ -2615,8 +3065,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, di->lease_gen != di->lease_session->s_cap_gen || !time_before(jiffies, dentry->d_time)) { dout("lease_release inode %p dentry %p -- " - "no lease on %d\n", - inode, dentry, mask); + "no lease\n", + inode, dentry); spin_unlock(&dentry->d_lock); return; } @@ -2627,8 +3077,8 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, __ceph_mdsc_drop_dentry_lease(dentry); spin_unlock(&dentry->d_lock); - dout("lease_release inode %p dentry %p mask %d to mds%d\n", - inode, dentry, mask, session->s_mds); + dout("lease_release inode %p dentry %p to mds%d\n", + inode, dentry, session->s_mds); ceph_mdsc_lease_send_msg(session, inode, dentry, CEPH_MDS_LEASE_RELEASE, seq); ceph_put_mds_session(session); @@ -2715,7 +3165,7 @@ static void delayed_work(struct work_struct *work) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); - ceph_add_cap_releases(mdsc, s, -1); + ceph_add_cap_releases(mdsc, s); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(mdsc, s); @@ -2729,17 +3179,25 @@ static void delayed_work(struct work_struct *work) schedule_delayed(mdsc); } +int ceph_mdsc_init(struct ceph_fs_client *fsc) -int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) { - mdsc->client = client; + struct ceph_mds_client *mdsc; + + mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS); + if (!mdsc) + return -ENOMEM; + mdsc->fsc = fsc; + fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) + if (mdsc->mdsmap == NULL) { + kfree(mdsc); return -ENOMEM; + } init_completion(&mdsc->safe_umount_waiters); - init_completion(&mdsc->session_close_waiters); + init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->sessions = NULL; mdsc->max_sessions = 0; @@ -2758,12 +3216,16 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) spin_lock_init(&mdsc->snap_flush_lock); mdsc->cap_flush_seq = 0; INIT_LIST_HEAD(&mdsc->cap_dirty); + INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); spin_lock_init(&mdsc->dentry_lru_lock); INIT_LIST_HEAD(&mdsc->dentry_lru); + ceph_caps_init(mdsc); + ceph_adjust_min_caps(mdsc, fsc->min_caps); + return 0; } @@ -2774,7 +3236,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) static void wait_requests(struct ceph_mds_client *mdsc) { struct ceph_mds_request *req; - struct ceph_client *client = mdsc->client; + struct ceph_fs_client *fsc = mdsc->fsc; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { @@ -2782,7 +3244,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, - client->mount_args->mount_timeout * HZ); + fsc->client->options->mount_timeout * HZ); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); @@ -2865,7 +3327,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); @@ -2881,6 +3343,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); } +/* + * true if all sessions are closed, or we force unmount + */ +static bool done_closing_sessions(struct ceph_mds_client *mdsc) +{ + int i, n = 0; + + if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) + return true; + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) + if (mdsc->sessions[i]) + n++; + mutex_unlock(&mdsc->mutex); + return n == 0; +} /* * called after sb is ro. @@ -2889,45 +3368,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { struct ceph_mds_session *session; int i; - int n; - struct ceph_client *client = mdsc->client; - unsigned long started, timeout = client->mount_args->mount_timeout * HZ; + struct ceph_fs_client *fsc = mdsc->fsc; + unsigned long timeout = fsc->client->options->mount_timeout * HZ; dout("close_sessions\n"); - mutex_lock(&mdsc->mutex); - /* close sessions */ - started = jiffies; - while (time_before(jiffies, started + timeout)) { - dout("closing sessions\n"); - n = 0; - for (i = 0; i < mdsc->max_sessions; i++) { - session = __ceph_lookup_mds_session(mdsc, i); - if (!session) - continue; - mutex_unlock(&mdsc->mutex); - mutex_lock(&session->s_mutex); - __close_session(mdsc, session); - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - mutex_lock(&mdsc->mutex); - n++; - } - if (n == 0) - break; - - if (client->mount_state == CEPH_MOUNT_SHUTDOWN) - break; - - dout("waiting for sessions to close\n"); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; mutex_unlock(&mdsc->mutex); - wait_for_completion_timeout(&mdsc->session_close_waiters, - timeout); + mutex_lock(&session->s_mutex); + __close_session(mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); } + mutex_unlock(&mdsc->mutex); + + dout("waiting for sessions to close\n"); + wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), + timeout); /* tear down remaining sessions */ + mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { if (mdsc->sessions[i]) { session = get_session(mdsc->sessions[i]); @@ -2940,9 +3406,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_lock(&mdsc->mutex); } } - WARN_ON(!list_empty(&mdsc->cap_delay_list)); - mutex_unlock(&mdsc->mutex); ceph_cleanup_empty_realms(mdsc); @@ -2952,13 +3416,29 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) dout("stopped\n"); } -void ceph_mdsc_stop(struct ceph_mds_client *mdsc) +static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) { dout("stop\n"); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ if (mdsc->mdsmap) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); + ceph_caps_finalize(mdsc); +} + +void ceph_mdsc_destroy(struct ceph_fs_client *fsc) +{ + struct ceph_mds_client *mdsc = fsc->mdsc; + + dout("mdsc_destroy %p\n", mdsc); + ceph_mdsc_stop(mdsc); + + /* flush out any connection work with references to us */ + ceph_msgr_flush(); + + fsc->mdsc = NULL; + kfree(mdsc); + dout("mdsc_destroy %p done\n", mdsc); } @@ -2977,14 +3457,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); - if (ceph_check_fsid(mdsc->client, &fsid) < 0) + if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0) return; epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); dout("handle_map epoch %u len %d\n", epoch, (int)maplen); /* do we need it? */ - ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); + ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); mutex_lock(&mdsc->mutex); if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { dout("handle_map epoch %u <= our %u\n", @@ -3008,7 +3488,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) } else { mdsc->mdsmap = newmap; /* first mds map */ } - mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; + mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; __wake_requests(mdsc, &mdsc->waiting_for_map); @@ -3039,8 +3519,8 @@ static void con_put(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; + dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); ceph_put_mds_session(s); - dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref)); } /* @@ -3052,7 +3532,7 @@ static void peer_reset(struct ceph_connection *con) struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - pr_warning("mds%d closed our session\n", s->s_mds); + pr_warn("mds%d closed our session\n", s->s_mds); send_mds_reconnect(mdsc, s); } @@ -3103,39 +3583,37 @@ out: /* * authentication */ -static int get_authorizer(struct ceph_connection *con, - void **buf, int *len, int *proto, - void **reply_buf, int *reply_len, int force_new) + +/* + * Note: returned pointer is the address of a structure that's + * managed separately. Caller must *not* attempt to free it. + */ +static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, + int *proto, int force_new) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; - int ret = 0; - - if (force_new && s->s_authorizer) { - ac->ops->destroy_authorizer(ac, s->s_authorizer); - s->s_authorizer = NULL; - } - if (s->s_authorizer == NULL) { - if (ac->ops->create_authorizer) { - ret = ac->ops->create_authorizer( - ac, CEPH_ENTITY_TYPE_MDS, - &s->s_authorizer, - &s->s_authorizer_buf, - &s->s_authorizer_buf_len, - &s->s_authorizer_reply_buf, - &s->s_authorizer_reply_buf_len); - if (ret) - return ret; - } + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(ac, auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, + auth); + if (ret) + return ERR_PTR(ret); } - *proto = ac->protocol; - *buf = s->s_authorizer_buf; - *len = s->s_authorizer_buf_len; - *reply_buf = s->s_authorizer_reply_buf; - *reply_len = s->s_authorizer_reply_buf_len; - return 0; + + return auth; } @@ -3143,21 +3621,41 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; - struct ceph_auth_client *ac = mdsc->client->monc.auth; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - if (ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); - return ceph_monc_validate_auth(&mdsc->client->monc); + return ceph_monc_validate_auth(&mdsc->fsc->client->monc); +} + +static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) +{ + struct ceph_msg *msg; + int type = (int) le16_to_cpu(hdr->type); + int front_len = (int) le32_to_cpu(hdr->front_len); + + if (con->in_msg) + return con->in_msg; + + *skip = 0; + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); + if (!msg) { + pr_err("unable to allocate msg type %d len %d\n", + type, front_len); + return NULL; + } + + return msg; } static const struct ceph_connection_operations mds_con_ops = { @@ -3168,9 +3666,7 @@ static const struct ceph_connection_operations mds_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, + .alloc_msg = mds_alloc_msg, }; - - - /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 952410c60d0..e00737cf523 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -8,9 +8,10 @@ #include <linux/rbtree.h> #include <linux/spinlock.h> -#include "types.h" -#include "messenger.h" -#include "mdsmap.h" +#include <linux/ceph/types.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/mdsmap.h> +#include <linux/ceph/auth.h> /* * Some lock dependencies: @@ -20,13 +21,13 @@ * * mdsc->snap_rwsem * - * inode->i_lock + * ci->i_ceph_lock * mdsc->snap_flush_lock * mdsc->cap_delay_lock * */ -struct ceph_client; +struct ceph_fs_client; struct ceph_cap; /* @@ -35,6 +36,7 @@ struct ceph_cap; */ struct ceph_mds_reply_info_in { struct ceph_mds_reply_inode *in; + struct ceph_dir_layout dir_layout; u32 symlink_len; char *symlink; u32 xattr_len; @@ -42,26 +44,44 @@ struct ceph_mds_reply_info_in { }; /* - * parsed info about an mds reply, including information about the - * target inode and/or its parent directory and dentry, and directory - * contents (for readdir results). + * parsed info about an mds reply, including information about + * either: 1) the target inode and/or its parent directory and dentry, + * and directory contents (for readdir results), or + * 2) the file range lock info (for fcntl F_GETLK results). */ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_head *head; + /* trace */ struct ceph_mds_reply_info_in diri, targeti; struct ceph_mds_reply_dirfrag *dirfrag; char *dname; u32 dname_len; struct ceph_mds_reply_lease *dlease; - struct ceph_mds_reply_dirfrag *dir_dir; - int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; - u8 dir_complete, dir_end; + /* extra */ + union { + /* for fcntl F_GETLK results */ + struct ceph_filelock *filelock_reply; + + /* for readdir results */ + struct { + struct ceph_mds_reply_dirfrag *dir_dir; + size_t dir_buf_size; + int dir_nr; + char **dir_dname; + u32 *dir_dname_len; + struct ceph_mds_reply_lease **dir_dlease; + struct ceph_mds_reply_info_in *dir_in; + u8 dir_complete, dir_end; + }; + + /* for create results */ + struct { + bool has_create_ino; + u64 ino; + }; + }; /* encoded blob describing snapshot contexts for certain operations (e.g., open) */ @@ -101,17 +121,19 @@ struct ceph_mds_session { struct ceph_connection s_con; - struct ceph_authorizer *s_authorizer; - void *s_authorizer_buf, *s_authorizer_reply_buf; - size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; + struct ceph_auth_handshake s_auth; - /* protected by s_cap_lock */ - spinlock_t s_cap_lock; + /* protected by s_gen_ttl_lock */ + spinlock_t s_gen_ttl_lock; u32 s_cap_gen; /* inc each time we get mds stale msg */ unsigned long s_cap_ttl; /* when session caps expire */ + + /* protected by s_cap_lock */ + spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ int s_nr_caps, s_trim_caps; int s_num_cap_releases; + int s_cap_reconnect; struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; @@ -151,14 +173,15 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request { u64 r_tid; /* transaction id */ struct rb_node r_node; + struct ceph_mds_client *r_mdsc; int r_op; /* mds op code */ - int r_mds; /* operation on what? */ struct inode *r_inode; /* arg1 */ struct dentry *r_dentry; /* arg1 */ struct dentry *r_old_dentry; /* arg2: rename from or link from */ + struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; @@ -169,6 +192,9 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ + kuid_t r_uid; + kgid_t r_gid; + struct timespec r_stamp; /* for choosing which mds to send this request to */ int r_direct_mode; @@ -207,8 +233,8 @@ struct ceph_mds_request { int r_attempts; /* resend attempts */ int r_num_fwd; /* number of forward attempts */ - int r_num_stale; int r_resend_mds; /* mds to resend to next, if any*/ + u32 r_sent_on_mseq; /* cap mseq request was sent at*/ struct kref r_kref; struct list_head r_wait; @@ -229,11 +255,12 @@ struct ceph_mds_request { * mds client state */ struct ceph_mds_client { - struct ceph_client *client; + struct ceph_fs_client *fsc; struct mutex mutex; /* all nested structures */ struct ceph_mdsmap *mdsmap; - struct completion safe_umount_waiters, session_close_waiters; + struct completion safe_umount_waiters; + wait_queue_head_t session_close_wq; struct list_head waiting_for_map; struct ceph_mds_session **sessions; /* NULL for mds if no session */ @@ -263,14 +290,31 @@ struct ceph_mds_client { u64 cap_flush_seq; struct list_head cap_dirty; /* inodes with dirty caps */ + struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; -#ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_file; -#endif - + /* + * Cap reservations + * + * Maintain a global pool of preallocated struct ceph_caps, referenced + * by struct ceph_caps_reservations. This ensures that we preallocate + * memory needed to successfully process an MDS response. (If an MDS + * sends us cap information and we fail to process it, we will have + * problems due to the client and MDS being out of sync.) + * + * Reservations are 'owned' by a ceph_cap_reservation context. + */ + spinlock_t caps_list_lock; + struct list_head caps_list; /* unused (reserved or + unreserved) */ + int caps_total_count; /* total caps allocated */ + int caps_use_count; /* in use */ + int caps_reserve_count; /* unused, reserved */ + int caps_avail_count; /* unused, unreserved */ + int caps_min_count; /* keep at least this many + (unreserved) */ spinlock_t dentry_lru_lock; struct list_head dentry_lru; int num_dentry; @@ -293,19 +337,19 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s); extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, struct ceph_msg *msg, int mds); -extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, - struct ceph_client *client); +extern int ceph_mdsc_init(struct ceph_fs_client *fsc); extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); -extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); +extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, - struct dentry *dn, int mask); + struct dentry *dn); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); - +extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, + struct inode *dir); extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, @@ -324,8 +368,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) } extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - int extra); + struct ceph_mds_session *session); extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); @@ -343,4 +386,9 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg); +extern struct ceph_mds_session * +ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); +extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); + #endif diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index c4c498e6dfe..261531e55e9 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -1,4 +1,4 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/bug.h> #include <linux/err.h> @@ -6,9 +6,9 @@ #include <linux/slab.h> #include <linux/types.h> -#include "mdsmap.h" -#include "messenger.h" -#include "decode.h" +#include <linux/ceph/mdsmap.h> +#include <linux/ceph/messenger.h> +#include <linux/ceph/decode.h> #include "super.h" @@ -20,7 +20,10 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) { int n = 0; int i; - char r; + + /* special case for one mds */ + if (1 == m->m_max_mds && m->m_info[0].state > 0) + return 0; /* count */ for (i = 0; i < m->m_max_mds; i++) @@ -30,8 +33,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) return -1; /* pick */ - get_random_bytes(&r, 1); - n = r % n; + n = prandom_u32() % n; i = 0; for (i = 0; n > 0; i++, n--) while (m->m_info[i].state <= 0) @@ -59,6 +61,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) return ERR_PTR(-ENOMEM); ceph_decode_16_safe(p, end, version, bad); + if (version > 3) { + pr_warn("got mdsmap version %d > 3, failing", version); + goto bad; + } ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); m->m_epoch = ceph_decode_32(p); @@ -85,6 +91,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) struct ceph_entity_addr addr; u32 num_export_targets; void *pexport_targets = NULL; + struct ceph_timespec laggy_since; + struct ceph_mds_info *info; ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); global_id = ceph_decode_64(p); @@ -103,7 +111,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) state_seq = ceph_decode_64(p); ceph_decode_copy(p, &addr, sizeof(addr)); ceph_decode_addr(&addr); - *p += sizeof(struct ceph_timespec); + ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; @@ -116,36 +124,43 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) } dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", - i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), + i+1, n, global_id, mds, inc, + ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds >= 0 && mds < m->m_max_mds && state > 0) { - m->m_info[mds].global_id = global_id; - m->m_info[mds].state = state; - m->m_info[mds].addr = addr; - m->m_info[mds].num_export_targets = num_export_targets; - if (num_export_targets) { - m->m_info[mds].export_targets = - kcalloc(num_export_targets, sizeof(u32), - GFP_NOFS); - for (j = 0; j < num_export_targets; j++) - m->m_info[mds].export_targets[j] = - ceph_decode_32(&pexport_targets); - } else { - m->m_info[mds].export_targets = NULL; - } + + if (mds < 0 || mds >= m->m_max_mds || state <= 0) + continue; + + info = &m->m_info[mds]; + info->global_id = global_id; + info->state = state; + info->addr = addr; + info->laggy = (laggy_since.tv_sec != 0 || + laggy_since.tv_nsec != 0); + info->num_export_targets = num_export_targets; + if (num_export_targets) { + info->export_targets = kcalloc(num_export_targets, + sizeof(u32), GFP_NOFS); + if (info->export_targets == NULL) + goto badmem; + for (j = 0; j < num_export_targets; j++) + info->export_targets[j] = + ceph_decode_32(&pexport_targets); + } else { + info->export_targets = NULL; } } /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); m->m_num_data_pg_pools = n; - m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); + m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); if (!m->m_data_pg_pools) goto badmem; - ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); + ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); for (i = 0; i < n; i++) - m->m_data_pg_pools[i] = ceph_decode_32(p); - m->m_cas_pg_pool = ceph_decode_32(p); + m->m_data_pg_pools[i] = ceph_decode_64(p); + m->m_cas_pg_pool = ceph_decode_64(p); /* ok, we don't care about the rest. */ dout("mdsmap_decode success epoch %u\n", m->m_epoch); @@ -159,7 +174,7 @@ bad: DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); ceph_mdsmap_destroy(m); - return ERR_PTR(-EINVAL); + return ERR_PTR(err); } void ceph_mdsmap_destroy(struct ceph_mdsmap *m) diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h deleted file mode 100644 index eacc131aa5c..00000000000 --- a/fs/ceph/mdsmap.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _FS_CEPH_MDSMAP_H -#define _FS_CEPH_MDSMAP_H - -#include "types.h" - -/* - * mds map - describe servers in the mds cluster. - * - * we limit fields to those the client actually xcares about - */ -struct ceph_mds_info { - u64 global_id; - struct ceph_entity_addr addr; - s32 state; - int num_export_targets; - u32 *export_targets; -}; - -struct ceph_mdsmap { - u32 m_epoch, m_client_epoch, m_last_failure; - u32 m_root; - u32 m_session_timeout; /* seconds */ - u32 m_session_autoclose; /* seconds */ - u64 m_max_file_size; - u32 m_max_mds; /* size of m_addr, m_state arrays */ - struct ceph_mds_info *m_info; - - /* which object pools file data can be stored in */ - int m_num_data_pg_pools; - u32 *m_data_pg_pools; - u32 m_cas_pg_pool; -}; - -static inline struct ceph_entity_addr * -ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) -{ - if (w >= m->m_max_mds) - return NULL; - return &m->m_info[w].addr; -} - -static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) -{ - BUG_ON(w < 0); - if (w >= m->m_max_mds) - return CEPH_MDS_STATE_DNE; - return m->m_info[w].state; -} - -extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); -extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); -extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); - -#endif diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c deleted file mode 100644 index 15167b2daa5..00000000000 --- a/fs/ceph/messenger.c +++ /dev/null @@ -1,2276 +0,0 @@ -#include "ceph_debug.h" - -#include <linux/crc32c.h> -#include <linux/ctype.h> -#include <linux/highmem.h> -#include <linux/inet.h> -#include <linux/kthread.h> -#include <linux/net.h> -#include <linux/slab.h> -#include <linux/socket.h> -#include <linux/string.h> -#include <net/tcp.h> - -#include "super.h" -#include "messenger.h" -#include "decode.h" -#include "pagelist.h" - -/* - * Ceph uses the messenger to exchange ceph_msg messages with other - * hosts in the system. The messenger provides ordered and reliable - * delivery. We tolerate TCP disconnects by reconnecting (with - * exponential backoff) in the case of a fault (disconnection, bad - * crc, protocol error). Acks allow sent messages to be discarded by - * the sender. - */ - -/* static tag bytes (protocol control messages) */ -static char tag_msg = CEPH_MSGR_TAG_MSG; -static char tag_ack = CEPH_MSGR_TAG_ACK; -static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; - -#ifdef CONFIG_LOCKDEP -static struct lock_class_key socket_class; -#endif - - -static void queue_con(struct ceph_connection *con); -static void con_work(struct work_struct *); -static void ceph_fault(struct ceph_connection *con); - -/* - * nicely render a sockaddr as a string. - */ -#define MAX_ADDR_STR 20 -#define MAX_ADDR_STR_LEN 60 -static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; -static DEFINE_SPINLOCK(addr_str_lock); -static int last_addr_str; - -const char *pr_addr(const struct sockaddr_storage *ss) -{ - int i; - char *s; - struct sockaddr_in *in4 = (void *)ss; - struct sockaddr_in6 *in6 = (void *)ss; - - spin_lock(&addr_str_lock); - i = last_addr_str++; - if (last_addr_str == MAX_ADDR_STR) - last_addr_str = 0; - spin_unlock(&addr_str_lock); - s = addr_str[i]; - - switch (ss->ss_family) { - case AF_INET: - snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, - (unsigned int)ntohs(in4->sin_port)); - break; - - case AF_INET6: - snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, - (unsigned int)ntohs(in6->sin6_port)); - break; - - default: - sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family); - } - - return s; -} - -static void encode_my_addr(struct ceph_messenger *msgr) -{ - memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); - ceph_encode_addr(&msgr->my_enc_addr); -} - -/* - * work queue for all reading and writing to/from the socket. - */ -struct workqueue_struct *ceph_msgr_wq; - -int __init ceph_msgr_init(void) -{ - ceph_msgr_wq = create_workqueue("ceph-msgr"); - if (IS_ERR(ceph_msgr_wq)) { - int ret = PTR_ERR(ceph_msgr_wq); - pr_err("msgr_init failed to create workqueue: %d\n", ret); - ceph_msgr_wq = NULL; - return ret; - } - return 0; -} - -void ceph_msgr_exit(void) -{ - destroy_workqueue(ceph_msgr_wq); -} - -void ceph_msgr_flush() -{ - flush_workqueue(ceph_msgr_wq); -} - - -/* - * socket callback functions - */ - -/* data available on socket, or listen socket received a connect */ -static void ceph_data_ready(struct sock *sk, int count_unused) -{ - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; - if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("ceph_data_ready on %p state = %lu, queueing work\n", - con, con->state); - queue_con(con); - } -} - -/* socket has buffer space for writing */ -static void ceph_write_space(struct sock *sk) -{ - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; - - /* only queue to workqueue if there is data we want to write. */ - if (test_bit(WRITE_PENDING, &con->state)) { - dout("ceph_write_space %p queueing write work\n", con); - queue_con(con); - } else { - dout("ceph_write_space %p nothing to write\n", con); - } - - /* since we have our own write_space, clear the SOCK_NOSPACE flag */ - clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); -} - -/* socket's state has changed */ -static void ceph_state_change(struct sock *sk) -{ - struct ceph_connection *con = - (struct ceph_connection *)sk->sk_user_data; - - dout("ceph_state_change %p state = %lu sk_state = %u\n", - con, con->state, sk->sk_state); - - if (test_bit(CLOSED, &con->state)) - return; - - switch (sk->sk_state) { - case TCP_CLOSE: - dout("ceph_state_change TCP_CLOSE\n"); - case TCP_CLOSE_WAIT: - dout("ceph_state_change TCP_CLOSE_WAIT\n"); - if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) { - if (test_bit(CONNECTING, &con->state)) - con->error_msg = "connection failed"; - else - con->error_msg = "socket closed"; - queue_con(con); - } - break; - case TCP_ESTABLISHED: - dout("ceph_state_change TCP_ESTABLISHED\n"); - queue_con(con); - break; - } -} - -/* - * set up socket callbacks - */ -static void set_sock_callbacks(struct socket *sock, - struct ceph_connection *con) -{ - struct sock *sk = sock->sk; - sk->sk_user_data = (void *)con; - sk->sk_data_ready = ceph_data_ready; - sk->sk_write_space = ceph_write_space; - sk->sk_state_change = ceph_state_change; -} - - -/* - * socket helpers - */ - -/* - * initiate connection to a remote socket. - */ -static struct socket *ceph_tcp_connect(struct ceph_connection *con) -{ - struct sockaddr_storage *paddr = &con->peer_addr.in_addr; - struct socket *sock; - int ret; - - BUG_ON(con->sock); - ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); - if (ret) - return ERR_PTR(ret); - con->sock = sock; - sock->sk->sk_allocation = GFP_NOFS; - -#ifdef CONFIG_LOCKDEP - lockdep_set_class(&sock->sk->sk_lock, &socket_class); -#endif - - set_sock_callbacks(sock, con); - - dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); - - ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), - O_NONBLOCK); - if (ret == -EINPROGRESS) { - dout("connect %s EINPROGRESS sk_state = %u\n", - pr_addr(&con->peer_addr.in_addr), - sock->sk->sk_state); - ret = 0; - } - if (ret < 0) { - pr_err("connect %s error %d\n", - pr_addr(&con->peer_addr.in_addr), ret); - sock_release(sock); - con->sock = NULL; - con->error_msg = "connect error"; - } - - if (ret < 0) - return ERR_PTR(ret); - return sock; -} - -static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) -{ - struct kvec iov = {buf, len}; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - - return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); -} - -/* - * write something. @more is true if caller will be sending more data - * shortly. - */ -static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, - size_t kvlen, size_t len, int more) -{ - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - - if (more) - msg.msg_flags |= MSG_MORE; - else - msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - - return kernel_sendmsg(sock, &msg, iov, kvlen, len); -} - - -/* - * Shutdown/close the socket for the given connection. - */ -static int con_close_socket(struct ceph_connection *con) -{ - int rc; - - dout("con_close_socket on %p sock %p\n", con, con->sock); - if (!con->sock) - return 0; - set_bit(SOCK_CLOSED, &con->state); - rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); - sock_release(con->sock); - con->sock = NULL; - clear_bit(SOCK_CLOSED, &con->state); - return rc; -} - -/* - * Reset a connection. Discard all incoming and outgoing messages - * and clear *_seq state. - */ -static void ceph_msg_remove(struct ceph_msg *msg) -{ - list_del_init(&msg->list_head); - ceph_msg_put(msg); -} -static void ceph_msg_remove_list(struct list_head *head) -{ - while (!list_empty(head)) { - struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, - list_head); - ceph_msg_remove(msg); - } -} - -static void reset_connection(struct ceph_connection *con) -{ - /* reset connection, out_queue, msg_ and connect_seq */ - /* discard existing out_queue and msg_seq */ - ceph_msg_remove_list(&con->out_queue); - ceph_msg_remove_list(&con->out_sent); - - if (con->in_msg) { - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - - con->connect_seq = 0; - con->out_seq = 0; - if (con->out_msg) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } - con->out_keepalive_pending = false; - con->in_seq = 0; - con->in_seq_acked = 0; -} - -/* - * mark a peer down. drop any open connections. - */ -void ceph_con_close(struct ceph_connection *con) -{ - dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); - set_bit(CLOSED, &con->state); /* in case there's queued work */ - clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ - clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ - clear_bit(KEEPALIVE_PENDING, &con->state); - clear_bit(WRITE_PENDING, &con->state); - mutex_lock(&con->mutex); - reset_connection(con); - con->peer_global_seq = 0; - cancel_delayed_work(&con->work); - mutex_unlock(&con->mutex); - queue_con(con); -} - -/* - * Reopen a closed connection, with a new peer address. - */ -void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) -{ - dout("con_open %p %s\n", con, pr_addr(&addr->in_addr)); - set_bit(OPENING, &con->state); - clear_bit(CLOSED, &con->state); - memcpy(&con->peer_addr, addr, sizeof(*addr)); - con->delay = 0; /* reset backoff memory */ - queue_con(con); -} - -/* - * return true if this connection ever successfully opened - */ -bool ceph_con_opened(struct ceph_connection *con) -{ - return con->connect_seq > 0; -} - -/* - * generic get/put - */ -struct ceph_connection *ceph_con_get(struct ceph_connection *con) -{ - dout("con_get %p nref = %d -> %d\n", con, - atomic_read(&con->nref), atomic_read(&con->nref) + 1); - if (atomic_inc_not_zero(&con->nref)) - return con; - return NULL; -} - -void ceph_con_put(struct ceph_connection *con) -{ - dout("con_put %p nref = %d -> %d\n", con, - atomic_read(&con->nref), atomic_read(&con->nref) - 1); - BUG_ON(atomic_read(&con->nref) == 0); - if (atomic_dec_and_test(&con->nref)) { - BUG_ON(con->sock); - kfree(con); - } -} - -/* - * initialize a new connection. - */ -void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con) -{ - dout("con_init %p\n", con); - memset(con, 0, sizeof(*con)); - atomic_set(&con->nref, 1); - con->msgr = msgr; - mutex_init(&con->mutex); - INIT_LIST_HEAD(&con->out_queue); - INIT_LIST_HEAD(&con->out_sent); - INIT_DELAYED_WORK(&con->work, con_work); -} - - -/* - * We maintain a global counter to order connection attempts. Get - * a unique seq greater than @gt. - */ -static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) -{ - u32 ret; - - spin_lock(&msgr->global_seq_lock); - if (msgr->global_seq < gt) - msgr->global_seq = gt; - ret = ++msgr->global_seq; - spin_unlock(&msgr->global_seq_lock); - return ret; -} - - -/* - * Prepare footer for currently outgoing message, and finish things - * off. Assumes out_kvec* are already valid.. we just add on to the end. - */ -static void prepare_write_message_footer(struct ceph_connection *con, int v) -{ - struct ceph_msg *m = con->out_msg; - - dout("prepare_write_message_footer %p\n", con); - con->out_kvec_is_msg = true; - con->out_kvec[v].iov_base = &m->footer; - con->out_kvec[v].iov_len = sizeof(m->footer); - con->out_kvec_bytes += sizeof(m->footer); - con->out_kvec_left++; - con->out_more = m->more_to_follow; - con->out_msg_done = true; -} - -/* - * Prepare headers for the next outgoing message. - */ -static void prepare_write_message(struct ceph_connection *con) -{ - struct ceph_msg *m; - int v = 0; - - con->out_kvec_bytes = 0; - con->out_kvec_is_msg = true; - con->out_msg_done = false; - - /* Sneak an ack in there first? If we can get it into the same - * TCP packet that's a good thing. */ - if (con->in_seq > con->in_seq_acked) { - con->in_seq_acked = con->in_seq; - con->out_kvec[v].iov_base = &tag_ack; - con->out_kvec[v++].iov_len = 1; - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con->out_kvec[v].iov_base = &con->out_temp_ack; - con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack); - con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); - } - - m = list_first_entry(&con->out_queue, - struct ceph_msg, list_head); - con->out_msg = m; - if (test_bit(LOSSYTX, &con->state)) { - list_del_init(&m->list_head); - } else { - /* put message on sent list */ - ceph_msg_get(m); - list_move_tail(&m->list_head, &con->out_sent); - } - - /* - * only assign outgoing seq # if we haven't sent this message - * yet. if it is requeued, resend with it's original seq. - */ - if (m->needs_out_seq) { - m->hdr.seq = cpu_to_le64(++con->out_seq); - m->needs_out_seq = false; - } - - dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", - m, con->out_seq, le16_to_cpu(m->hdr.type), - le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - le32_to_cpu(m->hdr.data_len), - m->nr_pages); - BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); - - /* tag + hdr + front + middle */ - con->out_kvec[v].iov_base = &tag_msg; - con->out_kvec[v++].iov_len = 1; - con->out_kvec[v].iov_base = &m->hdr; - con->out_kvec[v++].iov_len = sizeof(m->hdr); - con->out_kvec[v++] = m->front; - if (m->middle) - con->out_kvec[v++] = m->middle->vec; - con->out_kvec_left = v; - con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len + - (m->middle ? m->middle->vec.iov_len : 0); - con->out_kvec_cur = con->out_kvec; - - /* fill in crc (except data pages), footer */ - con->out_msg->hdr.crc = - cpu_to_le32(crc32c(0, (void *)&m->hdr, - sizeof(m->hdr) - sizeof(m->hdr.crc))); - con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; - con->out_msg->footer.front_crc = - cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len)); - if (m->middle) - con->out_msg->footer.middle_crc = - cpu_to_le32(crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len)); - else - con->out_msg->footer.middle_crc = 0; - con->out_msg->footer.data_crc = 0; - dout("prepare_write_message front_crc %u data_crc %u\n", - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - - /* is there a data payload? */ - if (le32_to_cpu(m->hdr.data_len) > 0) { - /* initialize page iterator */ - con->out_msg_pos.page = 0; - con->out_msg_pos.page_pos = - le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; - con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = 0; - con->out_more = 1; /* data + footer will follow */ - } else { - /* no, queue up footer too and be done */ - prepare_write_message_footer(con, v); - } - - set_bit(WRITE_PENDING, &con->state); -} - -/* - * Prepare an ack. - */ -static void prepare_write_ack(struct ceph_connection *con) -{ - dout("prepare_write_ack %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con->out_kvec[0].iov_base = &tag_ack; - con->out_kvec[0].iov_len = 1; - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con->out_kvec[1].iov_base = &con->out_temp_ack; - con->out_kvec[1].iov_len = sizeof(con->out_temp_ack); - con->out_kvec_left = 2; - con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack); - con->out_kvec_cur = con->out_kvec; - con->out_more = 1; /* more will follow.. eventually.. */ - set_bit(WRITE_PENDING, &con->state); -} - -/* - * Prepare to write keepalive byte. - */ -static void prepare_write_keepalive(struct ceph_connection *con) -{ - dout("prepare_write_keepalive %p\n", con); - con->out_kvec[0].iov_base = &tag_keepalive; - con->out_kvec[0].iov_len = 1; - con->out_kvec_left = 1; - con->out_kvec_bytes = 1; - con->out_kvec_cur = con->out_kvec; - set_bit(WRITE_PENDING, &con->state); -} - -/* - * Connection negotiation. - */ - -static void prepare_connect_authorizer(struct ceph_connection *con) -{ - void *auth_buf; - int auth_len = 0; - int auth_protocol = 0; - - mutex_unlock(&con->mutex); - if (con->ops->get_authorizer) - con->ops->get_authorizer(con, &auth_buf, &auth_len, - &auth_protocol, &con->auth_reply_buf, - &con->auth_reply_buf_len, - con->auth_retry); - mutex_lock(&con->mutex); - - con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol); - con->out_connect.authorizer_len = cpu_to_le32(auth_len); - - con->out_kvec[con->out_kvec_left].iov_base = auth_buf; - con->out_kvec[con->out_kvec_left].iov_len = auth_len; - con->out_kvec_left++; - con->out_kvec_bytes += auth_len; -} - -/* - * We connected to a peer and are saying hello. - */ -static void prepare_write_banner(struct ceph_messenger *msgr, - struct ceph_connection *con) -{ - int len = strlen(CEPH_BANNER); - - con->out_kvec[0].iov_base = CEPH_BANNER; - con->out_kvec[0].iov_len = len; - con->out_kvec[1].iov_base = &msgr->my_enc_addr; - con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr); - con->out_kvec_left = 2; - con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr); - con->out_kvec_cur = con->out_kvec; - con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); -} - -static void prepare_write_connect(struct ceph_messenger *msgr, - struct ceph_connection *con, - int after_banner) -{ - unsigned global_seq = get_global_seq(con->msgr, 0); - int proto; - - switch (con->peer_name.type) { - case CEPH_ENTITY_TYPE_MON: - proto = CEPH_MONC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_OSD: - proto = CEPH_OSDC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_MDS: - proto = CEPH_MDSC_PROTOCOL; - break; - default: - BUG(); - } - - dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, - con->connect_seq, global_seq, proto); - - con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT); - con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = cpu_to_le32(global_seq); - con->out_connect.protocol_version = cpu_to_le32(proto); - con->out_connect.flags = 0; - - if (!after_banner) { - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - } - con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect; - con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect); - con->out_kvec_left++; - con->out_kvec_bytes += sizeof(con->out_connect); - con->out_kvec_cur = con->out_kvec; - con->out_more = 0; - set_bit(WRITE_PENDING, &con->state); - - prepare_connect_authorizer(con); -} - - -/* - * write as much of pending kvecs to the socket as we can. - * 1 -> done - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_kvec(struct ceph_connection *con) -{ - int ret; - - dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); - while (con->out_kvec_bytes > 0) { - ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, - con->out_kvec_left, con->out_kvec_bytes, - con->out_more); - if (ret <= 0) - goto out; - con->out_kvec_bytes -= ret; - if (con->out_kvec_bytes == 0) - break; /* done */ - while (ret > 0) { - if (ret >= con->out_kvec_cur->iov_len) { - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; - } else { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; - ret = 0; - break; - } - } - } - con->out_kvec_left = 0; - con->out_kvec_is_msg = false; - ret = 1; -out: - dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, - con->out_kvec_bytes, con->out_kvec_left, ret); - return ret; /* done! */ -} - -/* - * Write as much message data payload as we can. If we finish, queue - * up the footer. - * 1 -> done, footer is now queued in out_kvec[]. - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_msg_pages(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->out_msg; - unsigned data_len = le32_to_cpu(msg->hdr.data_len); - size_t len; - int crc = con->msgr->nocrc; - int ret; - - dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, - con->out_msg_pos.page_pos); - - while (con->out_msg_pos.page < con->out_msg->nr_pages) { - struct page *page = NULL; - void *kaddr = NULL; - - /* - * if we are calculating the data crc (the default), we need - * to map the page. if our pages[] has been revoked, use the - * zero page. - */ - if (msg->pages) { - page = msg->pages[con->out_msg_pos.page]; - if (crc) - kaddr = kmap(page); - } else if (msg->pagelist) { - page = list_first_entry(&msg->pagelist->head, - struct page, lru); - if (crc) - kaddr = kmap(page); - } else { - page = con->msgr->zero_page; - if (crc) - kaddr = page_address(con->msgr->zero_page); - } - len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos), - (int)(data_len - con->out_msg_pos.data_pos)); - if (crc && !con->out_msg_pos.did_page_crc) { - void *base = kaddr + con->out_msg_pos.page_pos; - u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); - - BUG_ON(kaddr == NULL); - con->out_msg->footer.data_crc = - cpu_to_le32(crc32c(tmpcrc, base, len)); - con->out_msg_pos.did_page_crc = 1; - } - - ret = kernel_sendpage(con->sock, page, - con->out_msg_pos.page_pos, len, - MSG_DONTWAIT | MSG_NOSIGNAL | - MSG_MORE); - - if (crc && (msg->pages || msg->pagelist)) - kunmap(page); - - if (ret <= 0) - goto out; - - con->out_msg_pos.data_pos += ret; - con->out_msg_pos.page_pos += ret; - if (ret == len) { - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = 0; - if (msg->pagelist) - list_move_tail(&page->lru, - &msg->pagelist->head); - } - } - - dout("write_partial_msg_pages %p msg %p done\n", con, msg); - - /* prepare and queue up footer, too */ - if (!crc) - con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - con->out_kvec_bytes = 0; - con->out_kvec_left = 0; - con->out_kvec_cur = con->out_kvec; - prepare_write_message_footer(con, 0); - ret = 1; -out: - return ret; -} - -/* - * write some zeros - */ -static int write_partial_skip(struct ceph_connection *con) -{ - int ret; - - while (con->out_skip > 0) { - struct kvec iov = { - .iov_base = page_address(con->msgr->zero_page), - .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE) - }; - - ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1); - if (ret <= 0) - goto out; - con->out_skip -= ret; - } - ret = 1; -out: - return ret; -} - -/* - * Prepare to read connection handshake, or an ack. - */ -static void prepare_read_banner(struct ceph_connection *con) -{ - dout("prepare_read_banner %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_connect(struct ceph_connection *con) -{ - dout("prepare_read_connect %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_ack(struct ceph_connection *con) -{ - dout("prepare_read_ack %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_tag(struct ceph_connection *con) -{ - dout("prepare_read_tag %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_READY; -} - -/* - * Prepare to read a message. - */ -static int prepare_read_message(struct ceph_connection *con) -{ - dout("prepare_read_message %p\n", con); - BUG_ON(con->in_msg != NULL); - con->in_base_pos = 0; - con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; - return 0; -} - - -static int read_partial(struct ceph_connection *con, - int *to, int size, void *object) -{ - *to += size; - while (con->in_base_pos < *to) { - int left = *to - con->in_base_pos; - int have = size - left; - int ret = ceph_tcp_recvmsg(con->sock, object + have, left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } - return 1; -} - - -/* - * Read all or part of the connect-side handshake on a new connection - */ -static int read_partial_banner(struct ceph_connection *con) -{ - int ret, to = 0; - - dout("read_partial_banner %p at %d\n", con, con->in_base_pos); - - /* peer's banner */ - ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner); - if (ret <= 0) - goto out; - ret = read_partial(con, &to, sizeof(con->actual_peer_addr), - &con->actual_peer_addr); - if (ret <= 0) - goto out; - ret = read_partial(con, &to, sizeof(con->peer_addr_for_me), - &con->peer_addr_for_me); - if (ret <= 0) - goto out; -out: - return ret; -} - -static int read_partial_connect(struct ceph_connection *con) -{ - int ret, to = 0; - - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - - ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply); - if (ret <= 0) - goto out; - ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len), - con->auth_reply_buf); - if (ret <= 0) - goto out; - - dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", - con, (int)con->in_reply.tag, - le32_to_cpu(con->in_reply.connect_seq), - le32_to_cpu(con->in_reply.global_seq)); -out: - return ret; - -} - -/* - * Verify the hello banner looks okay. - */ -static int verify_hello(struct ceph_connection *con) -{ - if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { - pr_err("connect to %s got bad banner\n", - pr_addr(&con->peer_addr.in_addr)); - con->error_msg = "protocol error, bad banner"; - return -1; - } - return 0; -} - -static bool addr_is_blank(struct sockaddr_storage *ss) -{ - switch (ss->ss_family) { - case AF_INET: - return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0; - case AF_INET6: - return - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 && - ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0; - } - return false; -} - -static int addr_port(struct sockaddr_storage *ss) -{ - switch (ss->ss_family) { - case AF_INET: - return ntohs(((struct sockaddr_in *)ss)->sin_port); - case AF_INET6: - return ntohs(((struct sockaddr_in6 *)ss)->sin6_port); - } - return 0; -} - -static void addr_set_port(struct sockaddr_storage *ss, int p) -{ - switch (ss->ss_family) { - case AF_INET: - ((struct sockaddr_in *)ss)->sin_port = htons(p); - case AF_INET6: - ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); - } -} - -/* - * Parse an ip[:port] list into an addr array. Use the default - * monitor port if a port isn't specified. - */ -int ceph_parse_ips(const char *c, const char *end, - struct ceph_entity_addr *addr, - int max_count, int *count) -{ - int i; - const char *p = c; - - dout("parse_ips on '%.*s'\n", (int)(end-c), c); - for (i = 0; i < max_count; i++) { - const char *ipend; - struct sockaddr_storage *ss = &addr[i].in_addr; - struct sockaddr_in *in4 = (void *)ss; - struct sockaddr_in6 *in6 = (void *)ss; - int port; - char delim = ','; - - if (*p == '[') { - delim = ']'; - p++; - } - - memset(ss, 0, sizeof(*ss)); - if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, - delim, &ipend)) - ss->ss_family = AF_INET; - else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, - delim, &ipend)) - ss->ss_family = AF_INET6; - else - goto bad; - p = ipend; - - if (delim == ']') { - if (*p != ']') { - dout("missing matching ']'\n"); - goto bad; - } - p++; - } - - /* port? */ - if (p < end && *p == ':') { - port = 0; - p++; - while (p < end && *p >= '0' && *p <= '9') { - port = (port * 10) + (*p - '0'); - p++; - } - if (port > 65535 || port == 0) - goto bad; - } else { - port = CEPH_MON_PORT; - } - - addr_set_port(ss, port); - - dout("parse_ips got %s\n", pr_addr(ss)); - - if (p == end) - break; - if (*p != ',') - goto bad; - p++; - } - - if (p != end) - goto bad; - - if (count) - *count = i + 1; - return 0; - -bad: - pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); - return -EINVAL; -} - -static int process_banner(struct ceph_connection *con) -{ - dout("process_banner on %p\n", con); - - if (verify_hello(con) < 0) - return -1; - - ceph_decode_addr(&con->actual_peer_addr); - ceph_decode_addr(&con->peer_addr_for_me); - - /* - * Make sure the other end is who we wanted. note that the other - * end may not yet know their ip address, so if it's 0.0.0.0, give - * them the benefit of the doubt. - */ - if (memcmp(&con->peer_addr, &con->actual_peer_addr, - sizeof(con->peer_addr)) != 0 && - !(addr_is_blank(&con->actual_peer_addr.in_addr) && - con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warning("wrong peer, want %s/%lld, got %s/%lld\n", - pr_addr(&con->peer_addr.in_addr), - le64_to_cpu(con->peer_addr.nonce), - pr_addr(&con->actual_peer_addr.in_addr), - le64_to_cpu(con->actual_peer_addr.nonce)); - con->error_msg = "wrong peer at address"; - return -1; - } - - /* - * did we learn our address? - */ - if (addr_is_blank(&con->msgr->inst.addr.in_addr)) { - int port = addr_port(&con->msgr->inst.addr.in_addr); - - memcpy(&con->msgr->inst.addr.in_addr, - &con->peer_addr_for_me.in_addr, - sizeof(con->peer_addr_for_me.in_addr)); - addr_set_port(&con->msgr->inst.addr.in_addr, port); - encode_my_addr(con->msgr); - dout("process_banner learned my addr is %s\n", - pr_addr(&con->msgr->inst.addr.in_addr)); - } - - set_bit(NEGOTIATING, &con->state); - prepare_read_connect(con); - return 0; -} - -static void fail_protocol(struct ceph_connection *con) -{ - reset_connection(con); - set_bit(CLOSED, &con->state); /* in case there's queued work */ - - mutex_unlock(&con->mutex); - if (con->ops->bad_proto) - con->ops->bad_proto(con); - mutex_lock(&con->mutex); -} - -static int process_connect(struct ceph_connection *con) -{ - u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT; - u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT; - u64 server_feat = le64_to_cpu(con->in_reply.features); - - dout("process_connect on %p tag %d\n", con, (int)con->in_tag); - - switch (con->in_reply.tag) { - case CEPH_MSGR_TAG_FEATURES: - pr_err("%s%lld %s feature set mismatch," - " my %llx < server's %llx, missing %llx\n", - ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), - sup_feat, server_feat, server_feat & ~sup_feat); - con->error_msg = "missing required protocol features"; - fail_protocol(con); - return -1; - - case CEPH_MSGR_TAG_BADPROTOVER: - pr_err("%s%lld %s protocol version mismatch," - " my %d != server's %d\n", - ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), - le32_to_cpu(con->out_connect.protocol_version), - le32_to_cpu(con->in_reply.protocol_version)); - con->error_msg = "protocol version mismatch"; - fail_protocol(con); - return -1; - - case CEPH_MSGR_TAG_BADAUTHORIZER: - con->auth_retry++; - dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, - con->auth_retry); - if (con->auth_retry == 2) { - con->error_msg = "connect authorization failure"; - reset_connection(con); - set_bit(CLOSED, &con->state); - return -1; - } - con->auth_retry = 1; - prepare_write_connect(con->msgr, con, 0); - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RESETSESSION: - /* - * If we connected with a large connect_seq but the peer - * has no record of a session with us (no connection, or - * connect_seq == 0), they will send RESETSESION to indicate - * that they must have reset their session, and may have - * dropped messages. - */ - dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_connect.connect_seq)); - pr_err("%s%lld %s connection reset\n", - ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr)); - reset_connection(con); - prepare_write_connect(con->msgr, con, 0); - prepare_read_connect(con); - - /* Tell ceph about it. */ - mutex_unlock(&con->mutex); - pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); - if (con->ops->peer_reset) - con->ops->peer_reset(con); - mutex_lock(&con->mutex); - break; - - case CEPH_MSGR_TAG_RETRY_SESSION: - /* - * If we sent a smaller connect_seq than the peer has, try - * again with a larger value. - */ - dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", - le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_connect.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); - prepare_write_connect(con->msgr, con, 0); - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RETRY_GLOBAL: - /* - * If we sent a smaller global_seq than the peer has, try - * again with a larger value. - */ - dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", - con->peer_global_seq, - le32_to_cpu(con->in_connect.global_seq)); - get_global_seq(con->msgr, - le32_to_cpu(con->in_connect.global_seq)); - prepare_write_connect(con->msgr, con, 0); - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_READY: - if (req_feat & ~server_feat) { - pr_err("%s%lld %s protocol feature mismatch," - " my required %llx > server's %llx, need %llx\n", - ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), - req_feat, server_feat, req_feat & ~server_feat); - con->error_msg = "missing required protocol features"; - fail_protocol(con); - return -1; - } - clear_bit(CONNECTING, &con->state); - con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); - con->connect_seq++; - con->peer_features = server_feat; - dout("process_connect got READY gseq %d cseq %d (%d)\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.connect_seq), - con->connect_seq); - WARN_ON(con->connect_seq != - le32_to_cpu(con->in_reply.connect_seq)); - - if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - set_bit(LOSSYTX, &con->state); - - prepare_read_tag(con); - break; - - case CEPH_MSGR_TAG_WAIT: - /* - * If there is a connection race (we are opening - * connections to each other), one of us may just have - * to WAIT. This shouldn't happen if we are the - * client. - */ - pr_err("process_connect peer connecting WAIT\n"); - - default: - pr_err("connect protocol error, will retry\n"); - con->error_msg = "protocol error, garbage tag during connect"; - return -1; - } - return 0; -} - - -/* - * read (part of) an ack - */ -static int read_partial_ack(struct ceph_connection *con) -{ - int to = 0; - - return read_partial(con, &to, sizeof(con->in_temp_ack), - &con->in_temp_ack); -} - - -/* - * We can finally discard anything that's been acked. - */ -static void process_ack(struct ceph_connection *con) -{ - struct ceph_msg *m; - u64 ack = le64_to_cpu(con->in_temp_ack); - u64 seq; - - while (!list_empty(&con->out_sent)) { - m = list_first_entry(&con->out_sent, struct ceph_msg, - list_head); - seq = le64_to_cpu(m->hdr.seq); - if (seq > ack) - break; - dout("got ack for seq %llu type %d at %p\n", seq, - le16_to_cpu(m->hdr.type), m); - ceph_msg_remove(m); - } - prepare_read_tag(con); -} - - - - -static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, unsigned int sec_len, - u32 *crc) -{ - int left; - int ret; - - BUG_ON(!section); - - while (section->iov_len < sec_len) { - BUG_ON(section->iov_base == NULL); - left = sec_len - section->iov_len; - ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + - section->iov_len, left); - if (ret <= 0) - return ret; - section->iov_len += ret; - if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, - section->iov_len); - } - - return 1; -} - -static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip); -/* - * read (part of) a message. - */ -static int read_partial_message(struct ceph_connection *con) -{ - struct ceph_msg *m = con->in_msg; - void *p; - int ret; - int to, left; - unsigned front_len, middle_len, data_len, data_off; - int datacrc = con->msgr->nocrc; - int skip; - u64 seq; - - dout("read_partial_message con %p msg %p\n", con, m); - - /* header */ - while (con->in_base_pos < sizeof(con->in_hdr)) { - left = sizeof(con->in_hdr) - con->in_base_pos; - ret = ceph_tcp_recvmsg(con->sock, - (char *)&con->in_hdr + con->in_base_pos, - left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - if (con->in_base_pos == sizeof(con->in_hdr)) { - u32 crc = crc32c(0, (void *)&con->in_hdr, - sizeof(con->in_hdr) - sizeof(con->in_hdr.crc)); - if (crc != le32_to_cpu(con->in_hdr.crc)) { - pr_err("read_partial_message bad hdr " - " crc %u != expected %u\n", - crc, con->in_hdr.crc); - return -EBADMSG; - } - } - } - front_len = le32_to_cpu(con->in_hdr.front_len); - if (front_len > CEPH_MSG_MAX_FRONT_LEN) - return -EIO; - middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_DATA_LEN) - return -EIO; - data_len = le32_to_cpu(con->in_hdr.data_len); - if (data_len > CEPH_MSG_MAX_DATA_LEN) - return -EIO; - data_off = le16_to_cpu(con->in_hdr.data_off); - - /* verify seq# */ - seq = le64_to_cpu(con->in_hdr.seq); - if ((s64)seq - (s64)con->in_seq < 1) { - pr_info("skipping %s%lld %s seq %lld, expected %lld\n", - ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), - seq, con->in_seq + 1); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof(m->footer); - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - return 0; - } else if ((s64)seq - (s64)con->in_seq > 1) { - pr_err("read_partial_message bad seq %lld expected %lld\n", - seq, con->in_seq + 1); - con->error_msg = "bad message sequence # for incoming message"; - return -EBADMSG; - } - - /* allocate message? */ - if (!con->in_msg) { - dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - con->in_hdr.front_len, con->in_hdr.data_len); - skip = 0; - con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); - if (skip) { - /* skip this message */ - dout("alloc_msg said skip message\n"); - BUG_ON(con->in_msg); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof(m->footer); - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - return 0; - } - if (!con->in_msg) { - con->error_msg = - "error allocating memory for incoming message"; - return -ENOMEM; - } - m = con->in_msg; - m->front.iov_len = 0; /* haven't read it yet */ - if (m->middle) - m->middle->vec.iov_len = 0; - - con->in_msg_pos.page = 0; - con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; - con->in_msg_pos.data_pos = 0; - } - - /* front */ - ret = read_partial_message_section(con, &m->front, front_len, - &con->in_front_crc); - if (ret <= 0) - return ret; - - /* middle */ - if (m->middle) { - ret = read_partial_message_section(con, &m->middle->vec, middle_len, - &con->in_middle_crc); - if (ret <= 0) - return ret; - } - - /* (page) data */ - while (con->in_msg_pos.data_pos < data_len) { - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); - BUG_ON(m->pages == NULL); - p = kmap(m->pages[con->in_msg_pos.page]); - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); - if (ret > 0 && datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); - kunmap(m->pages[con->in_msg_pos.page]); - if (ret <= 0) - return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == PAGE_SIZE) { - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.page++; - } - } - - /* footer */ - to = sizeof(m->hdr) + sizeof(m->footer); - while (con->in_base_pos < to) { - left = to - con->in_base_pos; - ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer + - (con->in_base_pos - sizeof(m->hdr)), - left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } - dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", - m, front_len, m->footer.front_crc, middle_len, - m->footer.middle_crc, data_len, m->footer.data_crc); - - /* crc ok? */ - if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { - pr_err("read_partial_message %p front crc %u != exp. %u\n", - m, con->in_front_crc, m->footer.front_crc); - return -EBADMSG; - } - if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { - pr_err("read_partial_message %p middle crc %u != exp %u\n", - m, con->in_middle_crc, m->footer.middle_crc); - return -EBADMSG; - } - if (datacrc && - (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && - con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { - pr_err("read_partial_message %p data crc %u != exp. %u\n", m, - con->in_data_crc, le32_to_cpu(m->footer.data_crc)); - return -EBADMSG; - } - - return 1; /* done! */ -} - -/* - * Process message. This happens in the worker thread. The callback should - * be careful not to do anything that waits on other incoming messages or it - * may deadlock. - */ -static void process_message(struct ceph_connection *con) -{ - struct ceph_msg *msg; - - msg = con->in_msg; - con->in_msg = NULL; - - /* if first message, set peer_name */ - if (con->peer_name.type == 0) - con->peer_name = msg->hdr.src; - - con->in_seq++; - mutex_unlock(&con->mutex); - - dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", - msg, le64_to_cpu(msg->hdr.seq), - ENTITY_NAME(msg->hdr.src), - le16_to_cpu(msg->hdr.type), - ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), - le32_to_cpu(msg->hdr.front_len), - le32_to_cpu(msg->hdr.data_len), - con->in_front_crc, con->in_middle_crc, con->in_data_crc); - con->ops->dispatch(con, msg); - - mutex_lock(&con->mutex); - prepare_read_tag(con); -} - - -/* - * Write something to the socket. Called in a worker thread when the - * socket appears to be writeable and we have something ready to send. - */ -static int try_write(struct ceph_connection *con) -{ - struct ceph_messenger *msgr = con->msgr; - int ret = 1; - - dout("try_write start %p state %lu nref %d\n", con, con->state, - atomic_read(&con->nref)); - -more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); - - /* open the socket first? */ - if (con->sock == NULL) { - /* - * if we were STANDBY and are reconnecting _this_ - * connection, bump connect_seq now. Always bump - * global_seq. - */ - if (test_and_clear_bit(STANDBY, &con->state)) - con->connect_seq++; - - prepare_write_banner(msgr, con); - prepare_write_connect(msgr, con, 1); - prepare_read_banner(con); - set_bit(CONNECTING, &con->state); - clear_bit(NEGOTIATING, &con->state); - - BUG_ON(con->in_msg); - con->in_tag = CEPH_MSGR_TAG_READY; - dout("try_write initiating connect on %p new state %lu\n", - con, con->state); - con->sock = ceph_tcp_connect(con); - if (IS_ERR(con->sock)) { - con->sock = NULL; - con->error_msg = "connect error"; - ret = -1; - goto out; - } - } - -more_kvec: - /* kvec data queued? */ - if (con->out_skip) { - ret = write_partial_skip(con); - if (ret <= 0) - goto done; - if (ret < 0) { - dout("try_write write_partial_skip err %d\n", ret); - goto done; - } - } - if (con->out_kvec_left) { - ret = write_partial_kvec(con); - if (ret <= 0) - goto done; - } - - /* msg pages? */ - if (con->out_msg) { - if (con->out_msg_done) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; /* we're done with this one */ - goto do_next; - } - - ret = write_partial_msg_pages(con); - if (ret == 1) - goto more_kvec; /* we need to send the footer, too! */ - if (ret == 0) - goto done; - if (ret < 0) { - dout("try_write write_partial_msg_pages err %d\n", - ret); - goto done; - } - } - -do_next: - if (!test_bit(CONNECTING, &con->state)) { - /* is anything else pending? */ - if (!list_empty(&con->out_queue)) { - prepare_write_message(con); - goto more; - } - if (con->in_seq > con->in_seq_acked) { - prepare_write_ack(con); - goto more; - } - if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { - prepare_write_keepalive(con); - goto more; - } - } - - /* Nothing to do! */ - clear_bit(WRITE_PENDING, &con->state); - dout("try_write nothing else to write.\n"); -done: - ret = 0; -out: - dout("try_write done on %p\n", con); - return ret; -} - - - -/* - * Read what we can from the socket. - */ -static int try_read(struct ceph_connection *con) -{ - int ret = -1; - - if (!con->sock) - return 0; - - if (test_bit(STANDBY, &con->state)) - return 0; - - dout("try_read start on %p\n", con); - -more: - dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, - con->in_base_pos); - if (test_bit(CONNECTING, &con->state)) { - if (!test_bit(NEGOTIATING, &con->state)) { - dout("try_read connecting\n"); - ret = read_partial_banner(con); - if (ret <= 0) - goto done; - if (process_banner(con) < 0) { - ret = -1; - goto out; - } - } - ret = read_partial_connect(con); - if (ret <= 0) - goto done; - if (process_connect(con) < 0) { - ret = -1; - goto out; - } - goto more; - } - - if (con->in_base_pos < 0) { - /* - * skipping + discarding content. - * - * FIXME: there must be a better way to do this! - */ - static char buf[1024]; - int skip = min(1024, -con->in_base_pos); - dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); - ret = ceph_tcp_recvmsg(con->sock, buf, skip); - if (ret <= 0) - goto done; - con->in_base_pos += ret; - if (con->in_base_pos) - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) { - /* - * what's next? - */ - ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); - if (ret <= 0) - goto done; - dout("try_read got tag %d\n", (int)con->in_tag); - switch (con->in_tag) { - case CEPH_MSGR_TAG_MSG: - prepare_read_message(con); - break; - case CEPH_MSGR_TAG_ACK: - prepare_read_ack(con); - break; - case CEPH_MSGR_TAG_CLOSE: - set_bit(CLOSED, &con->state); /* fixme */ - goto done; - default: - goto bad_tag; - } - } - if (con->in_tag == CEPH_MSGR_TAG_MSG) { - ret = read_partial_message(con); - if (ret <= 0) { - switch (ret) { - case -EBADMSG: - con->error_msg = "bad crc"; - ret = -EIO; - goto out; - case -EIO: - con->error_msg = "io error"; - goto out; - default: - goto done; - } - } - if (con->in_tag == CEPH_MSGR_TAG_READY) - goto more; - process_message(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_ACK) { - ret = read_partial_ack(con); - if (ret <= 0) - goto done; - process_ack(con); - goto more; - } - -done: - ret = 0; -out: - dout("try_read done on %p\n", con); - return ret; - -bad_tag: - pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); - con->error_msg = "protocol error, garbage tag"; - ret = -1; - goto out; -} - - -/* - * Atomically queue work on a connection. Bump @con reference to - * avoid races with connection teardown. - * - * There is some trickery going on with QUEUED and BUSY because we - * only want a _single_ thread operating on each connection at any - * point in time, but we want to use all available CPUs. - * - * The worker thread only proceeds if it can atomically set BUSY. It - * clears QUEUED and does it's thing. When it thinks it's done, it - * clears BUSY, then rechecks QUEUED.. if it's set again, it loops - * (tries again to set BUSY). - * - * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we - * try to queue work. If that fails (work is already queued, or BUSY) - * we give up (work also already being done or is queued) but leave QUEUED - * set so that the worker thread will loop if necessary. - */ -static void queue_con(struct ceph_connection *con) -{ - if (test_bit(DEAD, &con->state)) { - dout("queue_con %p ignoring: DEAD\n", - con); - return; - } - - if (!con->ops->get(con)) { - dout("queue_con %p ref count 0\n", con); - return; - } - - set_bit(QUEUED, &con->state); - if (test_bit(BUSY, &con->state)) { - dout("queue_con %p - already BUSY\n", con); - con->ops->put(con); - } else if (!queue_work(ceph_msgr_wq, &con->work.work)) { - dout("queue_con %p - already queued\n", con); - con->ops->put(con); - } else { - dout("queue_con %p\n", con); - } -} - -/* - * Do some work on a connection. Drop a connection ref when we're done. - */ -static void con_work(struct work_struct *work) -{ - struct ceph_connection *con = container_of(work, struct ceph_connection, - work.work); - int backoff = 0; - -more: - if (test_and_set_bit(BUSY, &con->state) != 0) { - dout("con_work %p BUSY already set\n", con); - goto out; - } - dout("con_work %p start, clearing QUEUED\n", con); - clear_bit(QUEUED, &con->state); - - mutex_lock(&con->mutex); - - if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ - dout("con_work CLOSED\n"); - con_close_socket(con); - goto done; - } - if (test_and_clear_bit(OPENING, &con->state)) { - /* reopen w/ new peer */ - dout("con_work OPENING\n"); - con_close_socket(con); - } - - if (test_and_clear_bit(SOCK_CLOSED, &con->state) || - try_read(con) < 0 || - try_write(con) < 0) { - mutex_unlock(&con->mutex); - backoff = 1; - ceph_fault(con); /* error/fault path */ - goto done_unlocked; - } - -done: - mutex_unlock(&con->mutex); - -done_unlocked: - clear_bit(BUSY, &con->state); - dout("con->state=%lu\n", con->state); - if (test_bit(QUEUED, &con->state)) { - if (!backoff || test_bit(OPENING, &con->state)) { - dout("con_work %p QUEUED reset, looping\n", con); - goto more; - } - dout("con_work %p QUEUED reset, but just faulted\n", con); - clear_bit(QUEUED, &con->state); - } - dout("con_work %p done\n", con); - -out: - con->ops->put(con); -} - - -/* - * Generic error/fault handler. A retry mechanism is used with - * exponential backoff - */ -static void ceph_fault(struct ceph_connection *con) -{ - pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), - pr_addr(&con->peer_addr.in_addr), con->error_msg); - dout("fault %p state %lu to peer %s\n", - con, con->state, pr_addr(&con->peer_addr.in_addr)); - - if (test_bit(LOSSYTX, &con->state)) { - dout("fault on LOSSYTX channel\n"); - goto out; - } - - mutex_lock(&con->mutex); - if (test_bit(CLOSED, &con->state)) - goto out_unlock; - - con_close_socket(con); - - if (con->in_msg) { - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - - /* Requeue anything that hasn't been acked */ - list_splice_init(&con->out_sent, &con->out_queue); - - /* If there are no messages in the queue, place the connection - * in a STANDBY state (i.e., don't try to reconnect just yet). */ - if (list_empty(&con->out_queue) && !con->out_keepalive_pending) { - dout("fault setting STANDBY\n"); - set_bit(STANDBY, &con->state); - } else { - /* retry after a delay. */ - if (con->delay == 0) - con->delay = BASE_DELAY_INTERVAL; - else if (con->delay < MAX_DELAY_INTERVAL) - con->delay *= 2; - dout("fault queueing %p delay %lu\n", con, con->delay); - con->ops->get(con); - if (queue_delayed_work(ceph_msgr_wq, &con->work, - round_jiffies_relative(con->delay)) == 0) - con->ops->put(con); - } - -out_unlock: - mutex_unlock(&con->mutex); -out: - /* - * in case we faulted due to authentication, invalidate our - * current tickets so that we can get new ones. - */ - if (con->auth_retry && con->ops->invalidate_authorizer) { - dout("calling invalidate_authorizer()\n"); - con->ops->invalidate_authorizer(con); - } - - if (con->ops->fault) - con->ops->fault(con); -} - - - -/* - * create a new messenger instance - */ -struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) -{ - struct ceph_messenger *msgr; - - msgr = kzalloc(sizeof(*msgr), GFP_KERNEL); - if (msgr == NULL) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&msgr->global_seq_lock); - - /* the zero page is needed if a request is "canceled" while the message - * is being written over the socket */ - msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); - if (!msgr->zero_page) { - kfree(msgr); - return ERR_PTR(-ENOMEM); - } - kmap(msgr->zero_page); - - if (myaddr) - msgr->inst.addr = *myaddr; - - /* select a random nonce */ - msgr->inst.addr.type = 0; - get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); - encode_my_addr(msgr); - - dout("messenger_create %p\n", msgr); - return msgr; -} - -void ceph_messenger_destroy(struct ceph_messenger *msgr) -{ - dout("destroy %p\n", msgr); - kunmap(msgr->zero_page); - __free_page(msgr->zero_page); - kfree(msgr); - dout("destroyed messenger %p\n", msgr); -} - -/* - * Queue up an outgoing message on the given connection. - */ -void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) -{ - if (test_bit(CLOSED, &con->state)) { - dout("con_send %p closed, dropping %p\n", con, msg); - ceph_msg_put(msg); - return; - } - - /* set src+dst */ - msg->hdr.src = con->msgr->inst.name; - - BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); - - msg->needs_out_seq = true; - - /* queue */ - mutex_lock(&con->mutex); - BUG_ON(!list_empty(&msg->list_head)); - list_add_tail(&msg->list_head, &con->out_queue); - dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, - ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), - ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), - le32_to_cpu(msg->hdr.front_len), - le32_to_cpu(msg->hdr.middle_len), - le32_to_cpu(msg->hdr.data_len)); - mutex_unlock(&con->mutex); - - /* if there wasn't anything waiting to send before, queue - * new work */ - if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) - queue_con(con); -} - -/* - * Revoke a message that was previously queued for send - */ -void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) -{ - mutex_lock(&con->mutex); - if (!list_empty(&msg->list_head)) { - dout("con_revoke %p msg %p - was on queue\n", con, msg); - list_del_init(&msg->list_head); - ceph_msg_put(msg); - msg->hdr.seq = 0; - } - if (con->out_msg == msg) { - dout("con_revoke %p msg %p - was sending\n", con, msg); - con->out_msg = NULL; - if (con->out_kvec_is_msg) { - con->out_skip = con->out_kvec_bytes; - con->out_kvec_is_msg = false; - } - ceph_msg_put(msg); - msg->hdr.seq = 0; - } - mutex_unlock(&con->mutex); -} - -/* - * Revoke a message that we may be reading data into - */ -void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) -{ - mutex_lock(&con->mutex); - if (con->in_msg && con->in_msg == msg) { - unsigned front_len = le32_to_cpu(con->in_hdr.front_len); - unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len); - unsigned data_len = le32_to_cpu(con->in_hdr.data_len); - - /* skip rest of message */ - dout("con_revoke_pages %p msg %p revoked\n", con, msg); - con->in_base_pos = con->in_base_pos - - sizeof(struct ceph_msg_header) - - front_len - - middle_len - - data_len - - sizeof(struct ceph_msg_footer); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - } else { - dout("con_revoke_pages %p msg %p pages %p no-op\n", - con, con->in_msg, msg); - } - mutex_unlock(&con->mutex); -} - -/* - * Queue a keepalive byte to ensure the tcp connection is alive. - */ -void ceph_con_keepalive(struct ceph_connection *con) -{ - if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && - test_and_set_bit(WRITE_PENDING, &con->state) == 0) - queue_con(con); -} - - -/* - * construct a new message with given type, size - * the new msg has a ref count of 1. - */ -struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) -{ - struct ceph_msg *m; - - m = kmalloc(sizeof(*m), flags); - if (m == NULL) - goto out; - kref_init(&m->kref); - INIT_LIST_HEAD(&m->list_head); - - m->hdr.tid = 0; - m->hdr.type = cpu_to_le16(type); - m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); - m->hdr.version = 0; - m->hdr.front_len = cpu_to_le32(front_len); - m->hdr.middle_len = 0; - m->hdr.data_len = 0; - m->hdr.data_off = 0; - m->hdr.reserved = 0; - m->footer.front_crc = 0; - m->footer.middle_crc = 0; - m->footer.data_crc = 0; - m->footer.flags = 0; - m->front_max = front_len; - m->front_is_vmalloc = false; - m->more_to_follow = false; - m->pool = NULL; - - /* front */ - if (front_len) { - if (front_len > PAGE_CACHE_SIZE) { - m->front.iov_base = __vmalloc(front_len, flags, - PAGE_KERNEL); - m->front_is_vmalloc = true; - } else { - m->front.iov_base = kmalloc(front_len, flags); - } - if (m->front.iov_base == NULL) { - pr_err("msg_new can't allocate %d bytes\n", - front_len); - goto out2; - } - } else { - m->front.iov_base = NULL; - } - m->front.iov_len = front_len; - - /* middle */ - m->middle = NULL; - - /* data */ - m->nr_pages = 0; - m->pages = NULL; - m->pagelist = NULL; - - dout("ceph_msg_new %p front %d\n", m, front_len); - return m; - -out2: - ceph_msg_put(m); -out: - pr_err("msg_new can't create type %d front %d\n", type, front_len); - return NULL; -} - -/* - * Allocate "middle" portion of a message, if it is needed and wasn't - * allocated by alloc_msg. This allows us to read a small fixed-size - * per-type header in the front and then gracefully fail (i.e., - * propagate the error to the caller based on info in the front) when - * the middle is too large. - */ -static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) -{ - int type = le16_to_cpu(msg->hdr.type); - int middle_len = le32_to_cpu(msg->hdr.middle_len); - - dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, - ceph_msg_type_name(type), middle_len); - BUG_ON(!middle_len); - BUG_ON(msg->middle); - - msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); - if (!msg->middle) - return -ENOMEM; - return 0; -} - -/* - * Generic message allocator, for incoming messages. - */ -static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) -{ - int type = le16_to_cpu(hdr->type); - int front_len = le32_to_cpu(hdr->front_len); - int middle_len = le32_to_cpu(hdr->middle_len); - struct ceph_msg *msg = NULL; - int ret; - - if (con->ops->alloc_msg) { - mutex_unlock(&con->mutex); - msg = con->ops->alloc_msg(con, hdr, skip); - mutex_lock(&con->mutex); - if (!msg || *skip) - return NULL; - } - if (!msg) { - *skip = 0; - msg = ceph_msg_new(type, front_len, GFP_NOFS); - if (!msg) { - pr_err("unable to allocate msg type %d len %d\n", - type, front_len); - return NULL; - } - } - memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); - - if (middle_len && !msg->middle) { - ret = ceph_alloc_middle(con, msg); - if (ret < 0) { - ceph_msg_put(msg); - return NULL; - } - } - - return msg; -} - - -/* - * Free a generically kmalloc'd message. - */ -void ceph_msg_kfree(struct ceph_msg *m) -{ - dout("msg_kfree %p\n", m); - if (m->front_is_vmalloc) - vfree(m->front.iov_base); - else - kfree(m->front.iov_base); - kfree(m); -} - -/* - * Drop a msg ref. Destroy as needed. - */ -void ceph_msg_last_put(struct kref *kref) -{ - struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); - - dout("ceph_msg_put last one on %p\n", m); - WARN_ON(!list_empty(&m->list_head)); - - /* drop middle, data, if any */ - if (m->middle) { - ceph_buffer_put(m->middle); - m->middle = NULL; - } - m->nr_pages = 0; - m->pages = NULL; - - if (m->pagelist) { - ceph_pagelist_release(m->pagelist); - kfree(m->pagelist); - m->pagelist = NULL; - } - - if (m->pool) - ceph_msgpool_put(m->pool, m); - else - ceph_msg_kfree(m); -} - -void ceph_msg_dump(struct ceph_msg *msg) -{ - pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, - msg->front_max, msg->nr_pages); - print_hex_dump(KERN_DEBUG, "header: ", - DUMP_PREFIX_OFFSET, 16, 1, - &msg->hdr, sizeof(msg->hdr), true); - print_hex_dump(KERN_DEBUG, " front: ", - DUMP_PREFIX_OFFSET, 16, 1, - msg->front.iov_base, msg->front.iov_len, true); - if (msg->middle) - print_hex_dump(KERN_DEBUG, "middle: ", - DUMP_PREFIX_OFFSET, 16, 1, - msg->middle->vec.iov_base, - msg->middle->vec.iov_len, true); - print_hex_dump(KERN_DEBUG, "footer: ", - DUMP_PREFIX_OFFSET, 16, 1, - &msg->footer, sizeof(msg->footer), true); -} diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h deleted file mode 100644 index 76fbc957bc1..00000000000 --- a/fs/ceph/messenger.h +++ /dev/null @@ -1,253 +0,0 @@ -#ifndef __FS_CEPH_MESSENGER_H -#define __FS_CEPH_MESSENGER_H - -#include <linux/kref.h> -#include <linux/mutex.h> -#include <linux/net.h> -#include <linux/radix-tree.h> -#include <linux/uio.h> -#include <linux/version.h> -#include <linux/workqueue.h> - -#include "types.h" -#include "buffer.h" - -struct ceph_msg; -struct ceph_connection; - -extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */ - -/* - * Ceph defines these callbacks for handling connection events. - */ -struct ceph_connection_operations { - struct ceph_connection *(*get)(struct ceph_connection *); - void (*put)(struct ceph_connection *); - - /* handle an incoming message. */ - void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m); - - /* authorize an outgoing connection */ - int (*get_authorizer) (struct ceph_connection *con, - void **buf, int *len, int *proto, - void **reply_buf, int *reply_len, int force_new); - int (*verify_authorizer_reply) (struct ceph_connection *con, int len); - int (*invalidate_authorizer)(struct ceph_connection *con); - - /* protocol version mismatch */ - void (*bad_proto) (struct ceph_connection *con); - - /* there was some error on the socket (disconnect, whatever) */ - void (*fault) (struct ceph_connection *con); - - /* a remote host as terminated a message exchange session, and messages - * we sent (or they tried to send us) may be lost. */ - void (*peer_reset) (struct ceph_connection *con); - - struct ceph_msg * (*alloc_msg) (struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip); -}; - -/* use format string %s%d */ -#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) - -struct ceph_messenger { - struct ceph_entity_inst inst; /* my name+address */ - struct ceph_entity_addr my_enc_addr; - struct page *zero_page; /* used in certain error cases */ - - bool nocrc; - - /* - * the global_seq counts connections i (attempt to) initiate - * in order to disambiguate certain connect race conditions. - */ - u32 global_seq; - spinlock_t global_seq_lock; -}; - -/* - * a single message. it contains a header (src, dest, message type, etc.), - * footer (crc values, mainly), a "front" message body, and possibly a - * data payload (stored in some number of pages). - */ -struct ceph_msg { - struct ceph_msg_header hdr; /* header */ - struct ceph_msg_footer footer; /* footer */ - struct kvec front; /* unaligned blobs of message */ - struct ceph_buffer *middle; - struct page **pages; /* data payload. NOT OWNER. */ - unsigned nr_pages; /* size of page array */ - struct ceph_pagelist *pagelist; /* instead of pages */ - struct list_head list_head; - struct kref kref; - bool front_is_vmalloc; - bool more_to_follow; - bool needs_out_seq; - int front_max; - - struct ceph_msgpool *pool; -}; - -struct ceph_msg_pos { - int page, page_pos; /* which page; offset in page */ - int data_pos; /* offset in data payload */ - int did_page_crc; /* true if we've calculated crc for current page */ -}; - -/* ceph connection fault delay defaults, for exponential backoff */ -#define BASE_DELAY_INTERVAL (HZ/2) -#define MAX_DELAY_INTERVAL (5 * 60 * HZ) - -/* - * ceph_connection state bit flags - * - * QUEUED and BUSY are used together to ensure that only a single - * thread is currently opening, reading or writing data to the socket. - */ -#define LOSSYTX 0 /* we can close channel or drop messages on errors */ -#define CONNECTING 1 -#define NEGOTIATING 2 -#define KEEPALIVE_PENDING 3 -#define WRITE_PENDING 4 /* we have data ready to send */ -#define QUEUED 5 /* there is work queued on this connection */ -#define BUSY 6 /* work is being done */ -#define STANDBY 8 /* no outgoing messages, socket closed. we keep - * the ceph_connection around to maintain shared - * state with the peer. */ -#define CLOSED 10 /* we've closed the connection */ -#define SOCK_CLOSED 11 /* socket state changed to closed */ -#define OPENING 13 /* open connection w/ (possibly new) peer */ -#define DEAD 14 /* dead, about to kfree */ - -/* - * A single connection with another host. - * - * We maintain a queue of outgoing messages, and some session state to - * ensure that we can preserve the lossless, ordered delivery of - * messages in the case of a TCP disconnect. - */ -struct ceph_connection { - void *private; - atomic_t nref; - - const struct ceph_connection_operations *ops; - - struct ceph_messenger *msgr; - struct socket *sock; - unsigned long state; /* connection state (see flags above) */ - const char *error_msg; /* error message, if any */ - - struct ceph_entity_addr peer_addr; /* peer address */ - struct ceph_entity_name peer_name; /* peer name */ - struct ceph_entity_addr peer_addr_for_me; - unsigned peer_features; - u32 connect_seq; /* identify the most recent connection - attempt for this connection, client */ - u32 peer_global_seq; /* peer's global seq for this connection */ - - int auth_retry; /* true if we need a newer authorizer */ - void *auth_reply_buf; /* where to put the authorizer reply */ - int auth_reply_buf_len; - - struct mutex mutex; - - /* out queue */ - struct list_head out_queue; - struct list_head out_sent; /* sending or sent but unacked */ - u64 out_seq; /* last message queued for send */ - bool out_keepalive_pending; - - u64 in_seq, in_seq_acked; /* last message received, acked */ - - /* connection negotiation temps */ - char in_banner[CEPH_BANNER_MAX_LEN]; - union { - struct { /* outgoing connection */ - struct ceph_msg_connect out_connect; - struct ceph_msg_connect_reply in_reply; - }; - struct { /* incoming */ - struct ceph_msg_connect in_connect; - struct ceph_msg_connect_reply out_reply; - }; - }; - struct ceph_entity_addr actual_peer_addr; - - /* message out temps */ - struct ceph_msg *out_msg; /* sending message (== tail of - out_sent) */ - bool out_msg_done; - struct ceph_msg_pos out_msg_pos; - - struct kvec out_kvec[8], /* sending header/footer data */ - *out_kvec_cur; - int out_kvec_left; /* kvec's left in out_kvec */ - int out_skip; /* skip this many bytes */ - int out_kvec_bytes; /* total bytes left */ - bool out_kvec_is_msg; /* kvec refers to out_msg */ - int out_more; /* there is more data after the kvecs */ - __le64 out_temp_ack; /* for writing an ack */ - - /* message in temps */ - struct ceph_msg_header in_hdr; - struct ceph_msg *in_msg; - struct ceph_msg_pos in_msg_pos; - u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ - - char in_tag; /* protocol control byte */ - int in_base_pos; /* bytes read */ - __le64 in_temp_ack; /* for reading an ack */ - - struct delayed_work work; /* send|recv work */ - unsigned long delay; /* current delay interval */ -}; - - -extern const char *pr_addr(const struct sockaddr_storage *ss); -extern int ceph_parse_ips(const char *c, const char *end, - struct ceph_entity_addr *addr, - int max_count, int *count); - - -extern int ceph_msgr_init(void); -extern void ceph_msgr_exit(void); -extern void ceph_msgr_flush(void); - -extern struct ceph_messenger *ceph_messenger_create( - struct ceph_entity_addr *myaddr); -extern void ceph_messenger_destroy(struct ceph_messenger *); - -extern void ceph_con_init(struct ceph_messenger *msgr, - struct ceph_connection *con); -extern void ceph_con_open(struct ceph_connection *con, - struct ceph_entity_addr *addr); -extern bool ceph_con_opened(struct ceph_connection *con); -extern void ceph_con_close(struct ceph_connection *con); -extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg); -extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg); -extern void ceph_con_revoke_message(struct ceph_connection *con, - struct ceph_msg *msg); -extern void ceph_con_keepalive(struct ceph_connection *con); -extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); -extern void ceph_con_put(struct ceph_connection *con); - -extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); -extern void ceph_msg_kfree(struct ceph_msg *m); - - -static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) -{ - kref_get(&msg->kref); - return msg; -} -extern void ceph_msg_last_put(struct kref *kref); -static inline void ceph_msg_put(struct ceph_msg *msg) -{ - kref_put(&msg->kref, ceph_msg_last_put); -} - -extern void ceph_msg_dump(struct ceph_msg *msg); - -#endif diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c deleted file mode 100644 index cc115eafae1..00000000000 --- a/fs/ceph/mon_client.c +++ /dev/null @@ -1,882 +0,0 @@ -#include "ceph_debug.h" - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/sched.h> - -#include "mon_client.h" -#include "super.h" -#include "auth.h" -#include "decode.h" - -/* - * Interact with Ceph monitor cluster. Handle requests for new map - * versions, and periodically resend as needed. Also implement - * statfs() and umount(). - * - * A small cluster of Ceph "monitors" are responsible for managing critical - * cluster configuration and state information. An odd number (e.g., 3, 5) - * of cmon daemons use a modified version of the Paxos part-time parliament - * algorithm to manage the MDS map (mds cluster membership), OSD map, and - * list of clients who have mounted the file system. - * - * We maintain an open, active session with a monitor at all times in order to - * receive timely MDSMap updates. We periodically send a keepalive byte on the - * TCP socket to ensure we detect a failure. If the connection does break, we - * randomly hunt for a new monitor. Once the connection is reestablished, we - * resend any outstanding requests. - */ - -static const struct ceph_connection_operations mon_con_ops; - -static int __validate_auth(struct ceph_mon_client *monc); - -/* - * Decode a monmap blob (e.g., during mount). - */ -struct ceph_monmap *ceph_monmap_decode(void *p, void *end) -{ - struct ceph_monmap *m = NULL; - int i, err = -EINVAL; - struct ceph_fsid fsid; - u32 epoch, num_mon; - u16 version; - u32 len; - - ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_need(&p, end, len, bad); - - dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); - - ceph_decode_16_safe(&p, end, version, bad); - - ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); - ceph_decode_copy(&p, &fsid, sizeof(fsid)); - epoch = ceph_decode_32(&p); - - num_mon = ceph_decode_32(&p); - ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); - - if (num_mon >= CEPH_MAX_MON) - goto bad; - m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); - if (m == NULL) - return ERR_PTR(-ENOMEM); - m->fsid = fsid; - m->epoch = epoch; - m->num_mon = num_mon; - ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); - for (i = 0; i < num_mon; i++) - ceph_decode_addr(&m->mon_inst[i].addr); - - dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, - m->num_mon); - for (i = 0; i < m->num_mon; i++) - dout("monmap_decode mon%d is %s\n", i, - pr_addr(&m->mon_inst[i].addr.in_addr)); - return m; - -bad: - dout("monmap_decode failed with %d\n", err); - kfree(m); - return ERR_PTR(err); -} - -/* - * return true if *addr is included in the monmap. - */ -int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) -{ - int i; - - for (i = 0; i < m->num_mon; i++) - if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) - return 1; - return 0; -} - -/* - * Send an auth request. - */ -static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) -{ - monc->pending_auth = 1; - monc->m_auth->front.iov_len = len; - monc->m_auth->hdr.front_len = cpu_to_le32(len); - ceph_con_revoke(monc->con, monc->m_auth); - ceph_msg_get(monc->m_auth); /* keep our ref */ - ceph_con_send(monc->con, monc->m_auth); -} - -/* - * Close monitor session, if any. - */ -static void __close_session(struct ceph_mon_client *monc) -{ - if (monc->con) { - dout("__close_session closing mon%d\n", monc->cur_mon); - ceph_con_revoke(monc->con, monc->m_auth); - ceph_con_close(monc->con); - monc->cur_mon = -1; - monc->pending_auth = 0; - ceph_auth_reset(monc->auth); - } -} - -/* - * Open a session with a (new) monitor. - */ -static int __open_session(struct ceph_mon_client *monc) -{ - char r; - int ret; - - if (monc->cur_mon < 0) { - get_random_bytes(&r, 1); - monc->cur_mon = r % monc->monmap->num_mon; - dout("open_session num=%d r=%d -> mon%d\n", - monc->monmap->num_mon, r, monc->cur_mon); - monc->sub_sent = 0; - monc->sub_renew_after = jiffies; /* i.e., expired */ - monc->want_next_osdmap = !!monc->want_next_osdmap; - - dout("open_session mon%d opening\n", monc->cur_mon); - monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; - monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); - ceph_con_open(monc->con, - &monc->monmap->mon_inst[monc->cur_mon].addr); - - /* initiatiate authentication handshake */ - ret = ceph_auth_build_hello(monc->auth, - monc->m_auth->front.iov_base, - monc->m_auth->front_max); - __send_prepared_auth_request(monc, ret); - } else { - dout("open_session mon%d already open\n", monc->cur_mon); - } - return 0; -} - -static bool __sub_expired(struct ceph_mon_client *monc) -{ - return time_after_eq(jiffies, monc->sub_renew_after); -} - -/* - * Reschedule delayed work timer. - */ -static void __schedule_delayed(struct ceph_mon_client *monc) -{ - unsigned delay; - - if (monc->cur_mon < 0 || __sub_expired(monc)) - delay = 10 * HZ; - else - delay = 20 * HZ; - dout("__schedule_delayed after %u\n", delay); - schedule_delayed_work(&monc->delayed_work, delay); -} - -/* - * Send subscribe request for mdsmap and/or osdmap. - */ -static void __send_subscribe(struct ceph_mon_client *monc) -{ - dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", - (unsigned)monc->sub_sent, __sub_expired(monc), - monc->want_next_osdmap); - if ((__sub_expired(monc) && !monc->sub_sent) || - monc->want_next_osdmap == 1) { - struct ceph_msg *msg = monc->m_subscribe; - struct ceph_mon_subscribe_item *i; - void *p, *end; - - p = msg->front.iov_base; - end = p + msg->front_max; - - dout("__send_subscribe to 'mdsmap' %u+\n", - (unsigned)monc->have_mdsmap); - if (monc->want_next_osdmap) { - dout("__send_subscribe to 'osdmap' %u\n", - (unsigned)monc->have_osdmap); - ceph_encode_32(&p, 3); - ceph_encode_string(&p, end, "osdmap", 6); - i = p; - i->have = cpu_to_le64(monc->have_osdmap); - i->onetime = 1; - p += sizeof(*i); - monc->want_next_osdmap = 2; /* requested */ - } else { - ceph_encode_32(&p, 2); - } - ceph_encode_string(&p, end, "mdsmap", 6); - i = p; - i->have = cpu_to_le64(monc->have_mdsmap); - i->onetime = 0; - p += sizeof(*i); - ceph_encode_string(&p, end, "monmap", 6); - i = p; - i->have = 0; - i->onetime = 0; - p += sizeof(*i); - - msg->front.iov_len = p - msg->front.iov_base; - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_con_revoke(monc->con, msg); - ceph_con_send(monc->con, ceph_msg_get(msg)); - - monc->sub_sent = jiffies | 1; /* never 0 */ - } -} - -static void handle_subscribe_ack(struct ceph_mon_client *monc, - struct ceph_msg *msg) -{ - unsigned seconds; - struct ceph_mon_subscribe_ack *h = msg->front.iov_base; - - if (msg->front.iov_len < sizeof(*h)) - goto bad; - seconds = le32_to_cpu(h->duration); - - mutex_lock(&monc->mutex); - if (monc->hunting) { - pr_info("mon%d %s session established\n", - monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr)); - monc->hunting = false; - } - dout("handle_subscribe_ack after %d seconds\n", seconds); - monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; - monc->sub_sent = 0; - mutex_unlock(&monc->mutex); - return; -bad: - pr_err("got corrupt subscribe-ack msg\n"); - ceph_msg_dump(msg); -} - -/* - * Keep track of which maps we have - */ -int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) -{ - mutex_lock(&monc->mutex); - monc->have_mdsmap = got; - mutex_unlock(&monc->mutex); - return 0; -} - -int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) -{ - mutex_lock(&monc->mutex); - monc->have_osdmap = got; - monc->want_next_osdmap = 0; - mutex_unlock(&monc->mutex); - return 0; -} - -/* - * Register interest in the next osdmap - */ -void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) -{ - dout("request_next_osdmap have %u\n", monc->have_osdmap); - mutex_lock(&monc->mutex); - if (!monc->want_next_osdmap) - monc->want_next_osdmap = 1; - if (monc->want_next_osdmap < 2) - __send_subscribe(monc); - mutex_unlock(&monc->mutex); -} - -/* - * - */ -int ceph_monc_open_session(struct ceph_mon_client *monc) -{ - if (!monc->con) { - monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); - if (!monc->con) - return -ENOMEM; - ceph_con_init(monc->client->msgr, monc->con); - monc->con->private = monc; - monc->con->ops = &mon_con_ops; - } - - mutex_lock(&monc->mutex); - __open_session(monc); - __schedule_delayed(monc); - mutex_unlock(&monc->mutex); - return 0; -} - -/* - * The monitor responds with mount ack indicate mount success. The - * included client ticket allows the client to talk to MDSs and OSDs. - */ -static void ceph_monc_handle_map(struct ceph_mon_client *monc, - struct ceph_msg *msg) -{ - struct ceph_client *client = monc->client; - struct ceph_monmap *monmap = NULL, *old = monc->monmap; - void *p, *end; - - mutex_lock(&monc->mutex); - - dout("handle_monmap\n"); - p = msg->front.iov_base; - end = p + msg->front.iov_len; - - monmap = ceph_monmap_decode(p, end); - if (IS_ERR(monmap)) { - pr_err("problem decoding monmap, %d\n", - (int)PTR_ERR(monmap)); - goto out; - } - - if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) { - kfree(monmap); - goto out; - } - - client->monc.monmap = monmap; - kfree(old); - -out: - mutex_unlock(&monc->mutex); - wake_up(&client->auth_wq); -} - -/* - * statfs - */ -static struct ceph_mon_generic_request *__lookup_generic_req( - struct ceph_mon_client *monc, u64 tid) -{ - struct ceph_mon_generic_request *req; - struct rb_node *n = monc->generic_request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mon_generic_request, node); - if (tid < req->tid) - n = n->rb_left; - else if (tid > req->tid) - n = n->rb_right; - else - return req; - } - return NULL; -} - -static void __insert_generic_request(struct ceph_mon_client *monc, - struct ceph_mon_generic_request *new) -{ - struct rb_node **p = &monc->generic_request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mon_generic_request *req = NULL; - - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mon_generic_request, node); - if (new->tid < req->tid) - p = &(*p)->rb_left; - else if (new->tid > req->tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &monc->generic_request_tree); -} - -static void release_generic_request(struct kref *kref) -{ - struct ceph_mon_generic_request *req = - container_of(kref, struct ceph_mon_generic_request, kref); - - if (req->reply) - ceph_msg_put(req->reply); - if (req->request) - ceph_msg_put(req->request); - - kfree(req); -} - -static void put_generic_request(struct ceph_mon_generic_request *req) -{ - kref_put(&req->kref, release_generic_request); -} - -static void get_generic_request(struct ceph_mon_generic_request *req) -{ - kref_get(&req->kref); -} - -static struct ceph_msg *get_generic_reply(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) -{ - struct ceph_mon_client *monc = con->private; - struct ceph_mon_generic_request *req; - u64 tid = le64_to_cpu(hdr->tid); - struct ceph_msg *m; - - mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); - if (!req) { - dout("get_generic_reply %lld dne\n", tid); - *skip = 1; - m = NULL; - } else { - dout("get_generic_reply %lld got %p\n", tid, req->reply); - m = ceph_msg_get(req->reply); - /* - * we don't need to track the connection reading into - * this reply because we only have one open connection - * at a time, ever. - */ - } - mutex_unlock(&monc->mutex); - return m; -} - -static void handle_statfs_reply(struct ceph_mon_client *monc, - struct ceph_msg *msg) -{ - struct ceph_mon_generic_request *req; - struct ceph_mon_statfs_reply *reply = msg->front.iov_base; - u64 tid = le64_to_cpu(msg->hdr.tid); - - if (msg->front.iov_len != sizeof(*reply)) - goto bad; - dout("handle_statfs_reply %p tid %llu\n", msg, tid); - - mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); - if (req) { - *(struct ceph_statfs *)req->buf = reply->st; - req->result = 0; - get_generic_request(req); - } - mutex_unlock(&monc->mutex); - if (req) { - complete(&req->completion); - put_generic_request(req); - } - return; - -bad: - pr_err("corrupt generic reply, no tid\n"); - ceph_msg_dump(msg); -} - -/* - * Do a synchronous statfs(). - */ -int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) -{ - struct ceph_mon_generic_request *req; - struct ceph_mon_statfs *h; - int err; - - req = kzalloc(sizeof(*req), GFP_NOFS); - if (!req) - return -ENOMEM; - - kref_init(&req->kref); - req->buf = buf; - init_completion(&req->completion); - - err = -ENOMEM; - req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); - if (!req->request) - goto out; - req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); - if (!req->reply) - goto out; - - /* fill out request */ - h = req->request->front.iov_base; - h->monhdr.have_version = 0; - h->monhdr.session_mon = cpu_to_le16(-1); - h->monhdr.session_mon_tid = 0; - h->fsid = monc->monmap->fsid; - - /* register request */ - mutex_lock(&monc->mutex); - req->tid = ++monc->last_tid; - req->request->hdr.tid = cpu_to_le64(req->tid); - __insert_generic_request(monc, req); - monc->num_generic_requests++; - mutex_unlock(&monc->mutex); - - /* send request and wait */ - ceph_con_send(monc->con, ceph_msg_get(req->request)); - err = wait_for_completion_interruptible(&req->completion); - - mutex_lock(&monc->mutex); - rb_erase(&req->node, &monc->generic_request_tree); - monc->num_generic_requests--; - mutex_unlock(&monc->mutex); - - if (!err) - err = req->result; - -out: - kref_put(&req->kref, release_generic_request); - return err; -} - -/* - * Resend pending statfs requests. - */ -static void __resend_generic_request(struct ceph_mon_client *monc) -{ - struct ceph_mon_generic_request *req; - struct rb_node *p; - - for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_mon_generic_request, node); - ceph_con_revoke(monc->con, req->request); - ceph_con_send(monc->con, ceph_msg_get(req->request)); - } -} - -/* - * Delayed work. If we haven't mounted yet, retry. Otherwise, - * renew/retry subscription as needed (in case it is timing out, or we - * got an ENOMEM). And keep the monitor connection alive. - */ -static void delayed_work(struct work_struct *work) -{ - struct ceph_mon_client *monc = - container_of(work, struct ceph_mon_client, delayed_work.work); - - dout("monc delayed_work\n"); - mutex_lock(&monc->mutex); - if (monc->hunting) { - __close_session(monc); - __open_session(monc); /* continue hunting */ - } else { - ceph_con_keepalive(monc->con); - - __validate_auth(monc); - - if (monc->auth->ops->is_authenticated(monc->auth)) - __send_subscribe(monc); - } - __schedule_delayed(monc); - mutex_unlock(&monc->mutex); -} - -/* - * On startup, we build a temporary monmap populated with the IPs - * provided by mount(2). - */ -static int build_initial_monmap(struct ceph_mon_client *monc) -{ - struct ceph_mount_args *args = monc->client->mount_args; - struct ceph_entity_addr *mon_addr = args->mon_addr; - int num_mon = args->num_mon; - int i; - - /* build initial monmap */ - monc->monmap = kzalloc(sizeof(*monc->monmap) + - num_mon*sizeof(monc->monmap->mon_inst[0]), - GFP_KERNEL); - if (!monc->monmap) - return -ENOMEM; - for (i = 0; i < num_mon; i++) { - monc->monmap->mon_inst[i].addr = mon_addr[i]; - monc->monmap->mon_inst[i].addr.nonce = 0; - monc->monmap->mon_inst[i].name.type = - CEPH_ENTITY_TYPE_MON; - monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); - } - monc->monmap->num_mon = num_mon; - monc->have_fsid = false; - - /* release addr memory */ - kfree(args->mon_addr); - args->mon_addr = NULL; - args->num_mon = 0; - return 0; -} - -int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) -{ - int err = 0; - - dout("init\n"); - memset(monc, 0, sizeof(*monc)); - monc->client = cl; - monc->monmap = NULL; - mutex_init(&monc->mutex); - - err = build_initial_monmap(monc); - if (err) - goto out; - - monc->con = NULL; - - /* authentication */ - monc->auth = ceph_auth_init(cl->mount_args->name, - cl->mount_args->secret); - if (IS_ERR(monc->auth)) - return PTR_ERR(monc->auth); - monc->auth->want_keys = - CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | - CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; - - /* msgs */ - err = -ENOMEM; - monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, - sizeof(struct ceph_mon_subscribe_ack), - GFP_NOFS); - if (!monc->m_subscribe_ack) - goto out_monmap; - - monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); - if (!monc->m_subscribe) - goto out_subscribe_ack; - - monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); - if (!monc->m_auth_reply) - goto out_subscribe; - - monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); - monc->pending_auth = 0; - if (!monc->m_auth) - goto out_auth_reply; - - monc->cur_mon = -1; - monc->hunting = true; - monc->sub_renew_after = jiffies; - monc->sub_sent = 0; - - INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); - monc->generic_request_tree = RB_ROOT; - monc->num_generic_requests = 0; - monc->last_tid = 0; - - monc->have_mdsmap = 0; - monc->have_osdmap = 0; - monc->want_next_osdmap = 1; - return 0; - -out_auth_reply: - ceph_msg_put(monc->m_auth_reply); -out_subscribe: - ceph_msg_put(monc->m_subscribe); -out_subscribe_ack: - ceph_msg_put(monc->m_subscribe_ack); -out_monmap: - kfree(monc->monmap); -out: - return err; -} - -void ceph_monc_stop(struct ceph_mon_client *monc) -{ - dout("stop\n"); - cancel_delayed_work_sync(&monc->delayed_work); - - mutex_lock(&monc->mutex); - __close_session(monc); - if (monc->con) { - monc->con->private = NULL; - monc->con->ops->put(monc->con); - monc->con = NULL; - } - mutex_unlock(&monc->mutex); - - ceph_auth_destroy(monc->auth); - - ceph_msg_put(monc->m_auth); - ceph_msg_put(monc->m_auth_reply); - ceph_msg_put(monc->m_subscribe); - ceph_msg_put(monc->m_subscribe_ack); - - kfree(monc->monmap); -} - -static void handle_auth_reply(struct ceph_mon_client *monc, - struct ceph_msg *msg) -{ - int ret; - int was_auth = 0; - - mutex_lock(&monc->mutex); - if (monc->auth->ops) - was_auth = monc->auth->ops->is_authenticated(monc->auth); - monc->pending_auth = 0; - ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, - msg->front.iov_len, - monc->m_auth->front.iov_base, - monc->m_auth->front_max); - if (ret < 0) { - monc->client->auth_err = ret; - wake_up(&monc->client->auth_wq); - } else if (ret > 0) { - __send_prepared_auth_request(monc, ret); - } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { - dout("authenticated, starting session\n"); - - monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; - monc->client->msgr->inst.name.num = - cpu_to_le64(monc->auth->global_id); - - __send_subscribe(monc); - __resend_generic_request(monc); - } - mutex_unlock(&monc->mutex); -} - -static int __validate_auth(struct ceph_mon_client *monc) -{ - int ret; - - if (monc->pending_auth) - return 0; - - ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, - monc->m_auth->front_max); - if (ret <= 0) - return ret; /* either an error, or no need to authenticate */ - __send_prepared_auth_request(monc, ret); - return 0; -} - -int ceph_monc_validate_auth(struct ceph_mon_client *monc) -{ - int ret; - - mutex_lock(&monc->mutex); - ret = __validate_auth(monc); - mutex_unlock(&monc->mutex); - return ret; -} - -/* - * handle incoming message - */ -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) -{ - struct ceph_mon_client *monc = con->private; - int type = le16_to_cpu(msg->hdr.type); - - if (!monc) - return; - - switch (type) { - case CEPH_MSG_AUTH_REPLY: - handle_auth_reply(monc, msg); - break; - - case CEPH_MSG_MON_SUBSCRIBE_ACK: - handle_subscribe_ack(monc, msg); - break; - - case CEPH_MSG_STATFS_REPLY: - handle_statfs_reply(monc, msg); - break; - - case CEPH_MSG_MON_MAP: - ceph_monc_handle_map(monc, msg); - break; - - case CEPH_MSG_MDS_MAP: - ceph_mdsc_handle_map(&monc->client->mdsc, msg); - break; - - case CEPH_MSG_OSD_MAP: - ceph_osdc_handle_map(&monc->client->osdc, msg); - break; - - default: - pr_err("received unknown message type %d %s\n", type, - ceph_msg_type_name(type)); - } - ceph_msg_put(msg); -} - -/* - * Allocate memory for incoming message - */ -static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) -{ - struct ceph_mon_client *monc = con->private; - int type = le16_to_cpu(hdr->type); - int front_len = le32_to_cpu(hdr->front_len); - struct ceph_msg *m = NULL; - - *skip = 0; - - switch (type) { - case CEPH_MSG_MON_SUBSCRIBE_ACK: - m = ceph_msg_get(monc->m_subscribe_ack); - break; - case CEPH_MSG_STATFS_REPLY: - return get_generic_reply(con, hdr, skip); - case CEPH_MSG_AUTH_REPLY: - m = ceph_msg_get(monc->m_auth_reply); - break; - case CEPH_MSG_MON_MAP: - case CEPH_MSG_MDS_MAP: - case CEPH_MSG_OSD_MAP: - m = ceph_msg_new(type, front_len, GFP_NOFS); - break; - } - - if (!m) { - pr_info("alloc_msg unknown type %d\n", type); - *skip = 1; - } - return m; -} - -/* - * If the monitor connection resets, pick a new monitor and resubmit - * any pending requests. - */ -static void mon_fault(struct ceph_connection *con) -{ - struct ceph_mon_client *monc = con->private; - - if (!monc) - return; - - dout("mon_fault\n"); - mutex_lock(&monc->mutex); - if (!con->private) - goto out; - - if (monc->con && !monc->hunting) - pr_info("mon%d %s session lost, " - "hunting for new mon\n", monc->cur_mon, - pr_addr(&monc->con->peer_addr.in_addr)); - - __close_session(monc); - if (!monc->hunting) { - /* start hunting */ - monc->hunting = true; - __open_session(monc); - } else { - /* already hunting, let's wait a bit */ - __schedule_delayed(monc); - } -out: - mutex_unlock(&monc->mutex); -} - -static const struct ceph_connection_operations mon_con_ops = { - .get = ceph_con_get, - .put = ceph_con_put, - .dispatch = dispatch, - .fault = mon_fault, - .alloc_msg = mon_alloc_msg, -}; diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h deleted file mode 100644 index 174d794321d..00000000000 --- a/fs/ceph/mon_client.h +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef _FS_CEPH_MON_CLIENT_H -#define _FS_CEPH_MON_CLIENT_H - -#include <linux/completion.h> -#include <linux/kref.h> -#include <linux/rbtree.h> - -#include "messenger.h" - -struct ceph_client; -struct ceph_mount_args; -struct ceph_auth_client; - -/* - * The monitor map enumerates the set of all monitors. - */ -struct ceph_monmap { - struct ceph_fsid fsid; - u32 epoch; - u32 num_mon; - struct ceph_entity_inst mon_inst[0]; -}; - -struct ceph_mon_client; -struct ceph_mon_generic_request; - - -/* - * Generic mechanism for resending monitor requests. - */ -typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc, - int newmon); - -/* a pending monitor request */ -struct ceph_mon_request { - struct ceph_mon_client *monc; - struct delayed_work delayed_work; - unsigned long delay; - ceph_monc_request_func_t do_request; -}; - -/* - * ceph_mon_generic_request is being used for the statfs and poolop requests - * which are bening done a bit differently because we need to get data back - * to the caller - */ -struct ceph_mon_generic_request { - struct kref kref; - u64 tid; - struct rb_node node; - int result; - void *buf; - struct completion completion; - struct ceph_msg *request; /* original request */ - struct ceph_msg *reply; /* and reply */ -}; - -struct ceph_mon_client { - struct ceph_client *client; - struct ceph_monmap *monmap; - - struct mutex mutex; - struct delayed_work delayed_work; - - struct ceph_auth_client *auth; - struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack; - int pending_auth; - - bool hunting; - int cur_mon; /* last monitor i contacted */ - unsigned long sub_sent, sub_renew_after; - struct ceph_connection *con; - bool have_fsid; - - /* pending generic requests */ - struct rb_root generic_request_tree; - int num_generic_requests; - u64 last_tid; - - /* mds/osd map */ - int want_next_osdmap; /* 1 = want, 2 = want+asked */ - u32 have_osdmap, have_mdsmap; - -#ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_file; -#endif -}; - -extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end); -extern int ceph_monmap_contains(struct ceph_monmap *m, - struct ceph_entity_addr *addr); - -extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); -extern void ceph_monc_stop(struct ceph_mon_client *monc); - -/* - * The model here is to indicate that we need a new map of at least - * epoch @want, and also call in when we receive a map. We will - * periodically rerequest the map from the monitor cluster until we - * get what we want. - */ -extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); -extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); - -extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); - -extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, - struct ceph_statfs *buf); - -extern int ceph_monc_open_session(struct ceph_mon_client *monc); - -extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); - - - -#endif diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c deleted file mode 100644 index dd65a643813..00000000000 --- a/fs/ceph/msgpool.c +++ /dev/null @@ -1,64 +0,0 @@ -#include "ceph_debug.h" - -#include <linux/err.h> -#include <linux/sched.h> -#include <linux/types.h> -#include <linux/vmalloc.h> - -#include "msgpool.h" - -static void *alloc_fn(gfp_t gfp_mask, void *arg) -{ - struct ceph_msgpool *pool = arg; - void *p; - - p = ceph_msg_new(0, pool->front_len, gfp_mask); - if (!p) - pr_err("msgpool %s alloc failed\n", pool->name); - return p; -} - -static void free_fn(void *element, void *arg) -{ - ceph_msg_put(element); -} - -int ceph_msgpool_init(struct ceph_msgpool *pool, - int front_len, int size, bool blocking, const char *name) -{ - pool->front_len = front_len; - pool->pool = mempool_create(size, alloc_fn, free_fn, pool); - if (!pool->pool) - return -ENOMEM; - pool->name = name; - return 0; -} - -void ceph_msgpool_destroy(struct ceph_msgpool *pool) -{ - mempool_destroy(pool->pool); -} - -struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, - int front_len) -{ - if (front_len > pool->front_len) { - pr_err("msgpool_get pool %s need front %d, pool size is %d\n", - pool->name, front_len, pool->front_len); - WARN_ON(1); - - /* try to alloc a fresh message */ - return ceph_msg_new(0, front_len, GFP_NOFS); - } - - return mempool_alloc(pool->pool, GFP_NOFS); -} - -void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) -{ - /* reset msg front_len; user may have changed it */ - msg->front.iov_len = pool->front_len; - msg->hdr.front_len = cpu_to_le32(pool->front_len); - - kref_init(&msg->kref); /* retake single ref */ -} diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h deleted file mode 100644 index a362605f936..00000000000 --- a/fs/ceph/msgpool.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _FS_CEPH_MSGPOOL -#define _FS_CEPH_MSGPOOL - -#include <linux/mempool.h> -#include "messenger.h" - -/* - * we use memory pools for preallocating messages we may receive, to - * avoid unexpected OOM conditions. - */ -struct ceph_msgpool { - const char *name; - mempool_t *pool; - int front_len; /* preallocated payload size */ -}; - -extern int ceph_msgpool_init(struct ceph_msgpool *pool, - int front_len, int size, bool blocking, - const char *name); -extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); -extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, - int front_len); -extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); - -#endif diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h deleted file mode 100644 index 892a0298dfd..00000000000 --- a/fs/ceph/msgr.h +++ /dev/null @@ -1,175 +0,0 @@ -#ifndef __MSGR_H -#define __MSGR_H - -/* - * Data types for message passing layer used by Ceph. - */ - -#define CEPH_MON_PORT 6789 /* default monitor port */ - -/* - * client-side processes will try to bind to ports in this - * range, simply for the benefit of tools like nmap or wireshark - * that would like to identify the protocol. - */ -#define CEPH_PORT_FIRST 6789 -#define CEPH_PORT_START 6800 /* non-monitors start here */ -#define CEPH_PORT_LAST 6900 - -/* - * tcp connection banner. include a protocol version. and adjust - * whenever the wire protocol changes. try to keep this string length - * constant. - */ -#define CEPH_BANNER "ceph v027" -#define CEPH_BANNER_MAX_LEN 30 - - -/* - * Rollover-safe type and comparator for 32-bit sequence numbers. - * Comparator returns -1, 0, or 1. - */ -typedef __u32 ceph_seq_t; - -static inline __s32 ceph_seq_cmp(__u32 a, __u32 b) -{ - return (__s32)a - (__s32)b; -} - - -/* - * entity_name -- logical name for a process participating in the - * network, e.g. 'mds0' or 'osd3'. - */ -struct ceph_entity_name { - __u8 type; /* CEPH_ENTITY_TYPE_* */ - __le64 num; -} __attribute__ ((packed)); - -#define CEPH_ENTITY_TYPE_MON 0x01 -#define CEPH_ENTITY_TYPE_MDS 0x02 -#define CEPH_ENTITY_TYPE_OSD 0x04 -#define CEPH_ENTITY_TYPE_CLIENT 0x08 -#define CEPH_ENTITY_TYPE_AUTH 0x20 - -#define CEPH_ENTITY_TYPE_ANY 0xFF - -extern const char *ceph_entity_type_name(int type); - -/* - * entity_addr -- network address - */ -struct ceph_entity_addr { - __le32 type; - __le32 nonce; /* unique id for process (e.g. pid) */ - struct sockaddr_storage in_addr; -} __attribute__ ((packed)); - -struct ceph_entity_inst { - struct ceph_entity_name name; - struct ceph_entity_addr addr; -} __attribute__ ((packed)); - - -/* used by message exchange protocol */ -#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */ -#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */ -#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing - incoming connection */ -#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again - with higher cseq */ -#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again - with higher gseq */ -#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */ -#define CEPH_MSGR_TAG_MSG 7 /* message */ -#define CEPH_MSGR_TAG_ACK 8 /* message ack */ -#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ -#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ -#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ -#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ - - -/* - * connection negotiation - */ -struct ceph_msg_connect { - __le64 features; /* supported feature bits */ - __le32 host_type; /* CEPH_ENTITY_TYPE_* */ - __le32 global_seq; /* count connections initiated by this host */ - __le32 connect_seq; /* count connections initiated in this session */ - __le32 protocol_version; - __le32 authorizer_protocol; - __le32 authorizer_len; - __u8 flags; /* CEPH_MSG_CONNECT_* */ -} __attribute__ ((packed)); - -struct ceph_msg_connect_reply { - __u8 tag; - __le64 features; /* feature bits for this session */ - __le32 global_seq; - __le32 connect_seq; - __le32 protocol_version; - __le32 authorizer_len; - __u8 flags; -} __attribute__ ((packed)); - -#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */ - - -/* - * message header - */ -struct ceph_msg_header_old { - __le64 seq; /* message seq# for this session */ - __le64 tid; /* transaction id */ - __le16 type; /* message type */ - __le16 priority; /* priority. higher value == higher priority */ - __le16 version; /* version of message encoding */ - - __le32 front_len; /* bytes in main payload */ - __le32 middle_len;/* bytes in middle payload */ - __le32 data_len; /* bytes of data payload */ - __le16 data_off; /* sender: include full offset; - receiver: mask against ~PAGE_MASK */ - - struct ceph_entity_inst src, orig_src; - __le32 reserved; - __le32 crc; /* header crc32c */ -} __attribute__ ((packed)); - -struct ceph_msg_header { - __le64 seq; /* message seq# for this session */ - __le64 tid; /* transaction id */ - __le16 type; /* message type */ - __le16 priority; /* priority. higher value == higher priority */ - __le16 version; /* version of message encoding */ - - __le32 front_len; /* bytes in main payload */ - __le32 middle_len;/* bytes in middle payload */ - __le32 data_len; /* bytes of data payload */ - __le16 data_off; /* sender: include full offset; - receiver: mask against ~PAGE_MASK */ - - struct ceph_entity_name src; - __le32 reserved; - __le32 crc; /* header crc32c */ -} __attribute__ ((packed)); - -#define CEPH_MSG_PRIO_LOW 64 -#define CEPH_MSG_PRIO_DEFAULT 127 -#define CEPH_MSG_PRIO_HIGH 196 -#define CEPH_MSG_PRIO_HIGHEST 255 - -/* - * follows data payload - */ -struct ceph_msg_footer { - __le32 front_crc, middle_crc, data_crc; - __u8 flags; -} __attribute__ ((packed)); - -#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */ -#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ - - -#endif diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c deleted file mode 100644 index 92b7251a53f..00000000000 --- a/fs/ceph/osd_client.c +++ /dev/null @@ -1,1542 +0,0 @@ -#include "ceph_debug.h" - -#include <linux/err.h> -#include <linux/highmem.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/uaccess.h> - -#include "super.h" -#include "osd_client.h" -#include "messenger.h" -#include "decode.h" -#include "auth.h" - -#define OSD_OP_FRONT_LEN 4096 -#define OSD_OPREPLY_FRONT_LEN 512 - -static const struct ceph_connection_operations osd_con_ops; -static int __kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd); - -static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); - -/* - * Implement client access to distributed object storage cluster. - * - * All data objects are stored within a cluster/cloud of OSDs, or - * "object storage devices." (Note that Ceph OSDs have _nothing_ to - * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply - * remote daemons serving up and coordinating consistent and safe - * access to storage. - * - * Cluster membership and the mapping of data objects onto storage devices - * are described by the osd map. - * - * We keep track of pending OSD requests (read, write), resubmit - * requests to different OSDs when the cluster topology/data layout - * change, or retry the affected requests when the communications - * channel with an OSD is reset. - */ - -/* - * calculate the mapping of a file extent onto an object, and fill out the - * request accordingly. shorten extent as necessary if it crosses an - * object boundary. - * - * fill osd op in request message. - */ -static void calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, struct ceph_file_layout *layout, - u64 off, u64 *plen, - struct ceph_osd_request *req) -{ - struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; - struct ceph_osd_op *op = (void *)(reqhead + 1); - u64 orig_len = *plen; - u64 objoff, objlen; /* extent in object */ - u64 bno; - - reqhead->snapid = cpu_to_le64(vino.snap); - - /* object extent? */ - ceph_calc_file_object_mapping(layout, off, plen, &bno, - &objoff, &objlen); - if (*plen < orig_len) - dout(" skipping last %llu, final file extent %llu~%llu\n", - orig_len - *plen, off, *plen); - - sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); - req->r_oid_len = strlen(req->r_oid); - - op->extent.offset = cpu_to_le64(objoff); - op->extent.length = cpu_to_le64(objlen); - req->r_num_pages = calc_pages_for(off, *plen); - - dout("calc_layout %s (%d) %llu~%llu (%d pages)\n", - req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages); -} - -/* - * requests - */ -void ceph_osdc_release_request(struct kref *kref) -{ - struct ceph_osd_request *req = container_of(kref, - struct ceph_osd_request, - r_kref); - - if (req->r_request) - ceph_msg_put(req->r_request); - if (req->r_reply) - ceph_msg_put(req->r_reply); - if (req->r_con_filling_msg) { - dout("release_request revoking pages %p from con %p\n", - req->r_pages, req->r_con_filling_msg); - ceph_con_revoke_message(req->r_con_filling_msg, - req->r_reply); - ceph_con_put(req->r_con_filling_msg); - } - if (req->r_own_pages) - ceph_release_page_vector(req->r_pages, - req->r_num_pages); - ceph_put_snap_context(req->r_snapc); - if (req->r_mempool) - mempool_free(req, req->r_osdc->req_mempool); - else - kfree(req); -} - -/* - * build new request AND message, calculate layout, and adjust file - * extent as needed. - * - * if the file was recently truncated, we include information about its - * old and new size so that the object can be updated appropriately. (we - * avoid synchronously deleting truncated objects because it's slow.) - * - * if @do_sync, include a 'startsync' command so that the osd will flush - * data quickly. - */ -struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, - struct ceph_vino vino, - u64 off, u64 *plen, - int opcode, int flags, - struct ceph_snap_context *snapc, - int do_sync, - u32 truncate_seq, - u64 truncate_size, - struct timespec *mtime, - bool use_mempool, int num_reply) -{ - struct ceph_osd_request *req; - struct ceph_msg *msg; - struct ceph_osd_request_head *head; - struct ceph_osd_op *op; - void *p; - int num_op = 1 + do_sync; - size_t msg_size = sizeof(*head) + num_op*sizeof(*op); - int i; - - if (use_mempool) { - req = mempool_alloc(osdc->req_mempool, GFP_NOFS); - memset(req, 0, sizeof(*req)); - } else { - req = kzalloc(sizeof(*req), GFP_NOFS); - } - if (req == NULL) - return NULL; - - req->r_osdc = osdc; - req->r_mempool = use_mempool; - kref_init(&req->r_kref); - init_completion(&req->r_completion); - init_completion(&req->r_safe_completion); - INIT_LIST_HEAD(&req->r_unsafe_item); - req->r_flags = flags; - - WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); - - /* create reply message */ - if (use_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); - else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, GFP_NOFS); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } - req->r_reply = msg; - - /* create request message; allow space for oid */ - msg_size += 40; - if (snapc) - msg_size += sizeof(u64) * snapc->num_snaps; - if (use_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op, 0); - else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } - msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); - memset(msg->front.iov_base, 0, msg->front.iov_len); - head = msg->front.iov_base; - op = (void *)(head + 1); - p = (void *)(op + num_op); - - req->r_request = msg; - req->r_snapc = ceph_get_snap_context(snapc); - - head->client_inc = cpu_to_le32(1); /* always, for now. */ - head->flags = cpu_to_le32(flags); - if (flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(&head->mtime, mtime); - head->num_ops = cpu_to_le16(num_op); - op->op = cpu_to_le16(opcode); - - /* calculate max write size */ - calc_layout(osdc, vino, layout, off, plen, req); - req->r_file_layout = *layout; /* keep a copy */ - - if (flags & CEPH_OSD_FLAG_WRITE) { - req->r_request->hdr.data_off = cpu_to_le16(off); - req->r_request->hdr.data_len = cpu_to_le32(*plen); - op->payload_len = cpu_to_le32(*plen); - } - op->extent.truncate_size = cpu_to_le64(truncate_size); - op->extent.truncate_seq = cpu_to_le32(truncate_seq); - - /* fill in oid */ - head->object_len = cpu_to_le32(req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - p += req->r_oid_len; - - if (do_sync) { - op++; - op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC); - } - if (snapc) { - head->snap_seq = cpu_to_le64(snapc->seq); - head->num_snaps = cpu_to_le32(snapc->num_snaps); - for (i = 0; i < snapc->num_snaps; i++) { - put_unaligned_le64(snapc->snaps[i], p); - p += sizeof(u64); - } - } - - BUG_ON(p > msg->front.iov_base + msg->front.iov_len); - msg_size = p - msg->front.iov_base; - msg->front.iov_len = msg_size; - msg->hdr.front_len = cpu_to_le32(msg_size); - return req; -} - -/* - * We keep osd requests in an rbtree, sorted by ->r_tid. - */ -static void __insert_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *new) -{ - struct rb_node **p = &osdc->requests.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_request *req = NULL; - - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_osd_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &osdc->requests); -} - -static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, - u64 tid) -{ - struct ceph_osd_request *req; - struct rb_node *n = osdc->requests.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_osd_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else - return req; - } - return NULL; -} - -static struct ceph_osd_request * -__lookup_request_ge(struct ceph_osd_client *osdc, - u64 tid) -{ - struct ceph_osd_request *req; - struct rb_node *n = osdc->requests.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_osd_request, r_node); - if (tid < req->r_tid) { - if (!n->rb_left) - return req; - n = n->rb_left; - } else if (tid > req->r_tid) { - n = n->rb_right; - } else { - return req; - } - } - return NULL; -} - - -/* - * If the osd connection drops, we need to resubmit all requests. - */ -static void osd_reset(struct ceph_connection *con) -{ - struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; - - if (!osd) - return; - dout("osd_reset osd%d\n", osd->o_osd); - osdc = osd->o_osdc; - down_read(&osdc->map_sem); - kick_requests(osdc, osd); - up_read(&osdc->map_sem); -} - -/* - * Track open sessions with osds. - */ -static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) -{ - struct ceph_osd *osd; - - osd = kzalloc(sizeof(*osd), GFP_NOFS); - if (!osd) - return NULL; - - atomic_set(&osd->o_ref, 1); - osd->o_osdc = osdc; - INIT_LIST_HEAD(&osd->o_requests); - INIT_LIST_HEAD(&osd->o_osd_lru); - osd->o_incarnation = 1; - - ceph_con_init(osdc->client->msgr, &osd->o_con); - osd->o_con.private = osd; - osd->o_con.ops = &osd_con_ops; - osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD; - - INIT_LIST_HEAD(&osd->o_keepalive_item); - return osd; -} - -static struct ceph_osd *get_osd(struct ceph_osd *osd) -{ - if (atomic_inc_not_zero(&osd->o_ref)) { - dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, - atomic_read(&osd->o_ref)); - return osd; - } else { - dout("get_osd %p FAIL\n", osd); - return NULL; - } -} - -static void put_osd(struct ceph_osd *osd) -{ - dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), - atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref)) { - struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - - if (osd->o_authorizer) - ac->ops->destroy_authorizer(ac, osd->o_authorizer); - kfree(osd); - } -} - -/* - * remove an osd from our map - */ -static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) -{ - dout("__remove_osd %p\n", osd); - BUG_ON(!list_empty(&osd->o_requests)); - rb_erase(&osd->o_node, &osdc->osds); - list_del_init(&osd->o_osd_lru); - ceph_con_close(&osd->o_con); - put_osd(osd); -} - -static void __move_osd_to_lru(struct ceph_osd_client *osdc, - struct ceph_osd *osd) -{ - dout("__move_osd_to_lru %p\n", osd); - BUG_ON(!list_empty(&osd->o_osd_lru)); - list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); - osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ; -} - -static void __remove_osd_from_lru(struct ceph_osd *osd) -{ - dout("__remove_osd_from_lru %p\n", osd); - if (!list_empty(&osd->o_osd_lru)) - list_del_init(&osd->o_osd_lru); -} - -static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all) -{ - struct ceph_osd *osd, *nosd; - - dout("__remove_old_osds %p\n", osdc); - mutex_lock(&osdc->request_mutex); - list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { - if (!remove_all && time_before(jiffies, osd->lru_ttl)) - break; - __remove_osd(osdc, osd); - } - mutex_unlock(&osdc->request_mutex); -} - -/* - * reset osd connect - */ -static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) -{ - struct ceph_osd_request *req; - int ret = 0; - - dout("__reset_osd %p osd%d\n", osd, osd->o_osd); - if (list_empty(&osd->o_requests)) { - __remove_osd(osdc, osd); - } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], - &osd->o_con.peer_addr, - sizeof(osd->o_con.peer_addr)) == 0 && - !ceph_con_opened(&osd->o_con)) { - dout(" osd addr hasn't changed and connection never opened," - " letting msgr retry"); - /* touch each r_stamp for handle_timeout()'s benfit */ - list_for_each_entry(req, &osd->o_requests, r_osd_item) - req->r_stamp = jiffies; - ret = -EAGAIN; - } else { - ceph_con_close(&osd->o_con); - ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]); - osd->o_incarnation++; - } - return ret; -} - -static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) -{ - struct rb_node **p = &osdc->osds.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd *osd = NULL; - - while (*p) { - parent = *p; - osd = rb_entry(parent, struct ceph_osd, o_node); - if (new->o_osd < osd->o_osd) - p = &(*p)->rb_left; - else if (new->o_osd > osd->o_osd) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->o_node, parent, p); - rb_insert_color(&new->o_node, &osdc->osds); -} - -static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) -{ - struct ceph_osd *osd; - struct rb_node *n = osdc->osds.rb_node; - - while (n) { - osd = rb_entry(n, struct ceph_osd, o_node); - if (o < osd->o_osd) - n = n->rb_left; - else if (o > osd->o_osd) - n = n->rb_right; - else - return osd; - } - return NULL; -} - -static void __schedule_osd_timeout(struct ceph_osd_client *osdc) -{ - schedule_delayed_work(&osdc->timeout_work, - osdc->client->mount_args->osd_keepalive_timeout * HZ); -} - -static void __cancel_osd_timeout(struct ceph_osd_client *osdc) -{ - cancel_delayed_work(&osdc->timeout_work); -} - -/* - * Register request, assign tid. If this is the first request, set up - * the timeout event. - */ -static void register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - mutex_lock(&osdc->request_mutex); - req->r_tid = ++osdc->last_tid; - req->r_request->hdr.tid = cpu_to_le64(req->r_tid); - INIT_LIST_HEAD(&req->r_req_lru_item); - - dout("register_request %p tid %lld\n", req, req->r_tid); - __insert_request(osdc, req); - ceph_osdc_get_request(req); - osdc->num_requests++; - - if (osdc->num_requests == 1) { - dout(" first request, scheduling timeout\n"); - __schedule_osd_timeout(osdc); - } - mutex_unlock(&osdc->request_mutex); -} - -/* - * called under osdc->request_mutex - */ -static void __unregister_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - dout("__unregister_request %p tid %lld\n", req, req->r_tid); - rb_erase(&req->r_node, &osdc->requests); - osdc->num_requests--; - - if (req->r_osd) { - /* make sure the original request isn't in flight. */ - ceph_con_revoke(&req->r_osd->o_con, req->r_request); - - list_del_init(&req->r_osd_item); - if (list_empty(&req->r_osd->o_requests)) - __move_osd_to_lru(osdc, req->r_osd); - req->r_osd = NULL; - } - - ceph_osdc_put_request(req); - - list_del_init(&req->r_req_lru_item); - if (osdc->num_requests == 0) { - dout(" no requests, canceling timeout\n"); - __cancel_osd_timeout(osdc); - } -} - -/* - * Cancel a previously queued request message - */ -static void __cancel_request(struct ceph_osd_request *req) -{ - if (req->r_sent) { - ceph_con_revoke(&req->r_osd->o_con, req->r_request); - req->r_sent = 0; - } - list_del_init(&req->r_req_lru_item); -} - -/* - * Pick an osd (the first 'up' osd in the pg), allocate the osd struct - * (as needed), and set the request r_osd appropriately. If there is - * no up osd, set r_osd to NULL. - * - * Return 0 if unchanged, 1 if changed, or negative on error. - * - * Caller should hold map_sem for read and request_mutex. - */ -static int __map_osds(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; - struct ceph_pg pgid; - int acting[CEPH_PG_MAX_SIZE]; - int o = -1, num = 0; - int err; - - dout("map_osds %p tid %lld\n", req, req->r_tid); - err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, - &req->r_file_layout, osdc->osdmap); - if (err) - return err; - pgid = reqhead->layout.ol_pgid; - req->r_pgid = pgid; - - err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); - if (err > 0) { - o = acting[0]; - num = err; - } - - if ((req->r_osd && req->r_osd->o_osd == o && - req->r_sent >= req->r_osd->o_incarnation && - req->r_num_pg_osds == num && - memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || - (req->r_osd == NULL && o == -1)) - return 0; /* no change */ - - dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n", - req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, - req->r_osd ? req->r_osd->o_osd : -1); - - /* record full pg acting set */ - memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); - req->r_num_pg_osds = num; - - if (req->r_osd) { - __cancel_request(req); - list_del_init(&req->r_osd_item); - req->r_osd = NULL; - } - - req->r_osd = __lookup_osd(osdc, o); - if (!req->r_osd && o >= 0) { - err = -ENOMEM; - req->r_osd = create_osd(osdc); - if (!req->r_osd) - goto out; - - dout("map_osds osd %p is osd%d\n", req->r_osd, o); - req->r_osd->o_osd = o; - req->r_osd->o_con.peer_name.num = cpu_to_le64(o); - __insert_osd(osdc, req->r_osd); - - ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]); - } - - if (req->r_osd) { - __remove_osd_from_lru(req->r_osd); - list_add(&req->r_osd_item, &req->r_osd->o_requests); - } - err = 1; /* osd or pg changed */ - -out: - return err; -} - -/* - * caller should hold map_sem (for read) and request_mutex - */ -static int __send_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - struct ceph_osd_request_head *reqhead; - int err; - - err = __map_osds(osdc, req); - if (err < 0) - return err; - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - return 0; - } - - dout("send_request %p tid %llu to osd%d flags %d\n", - req, req->r_tid, req->r_osd->o_osd, req->r_flags); - - reqhead = req->r_request->front.iov_base; - reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); - reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ - reqhead->reassert_version = req->r_reassert_version; - - req->r_stamp = jiffies; - list_move_tail(&osdc->req_lru, &req->r_req_lru_item); - - ceph_msg_get(req->r_request); /* send consumes a ref */ - ceph_con_send(&req->r_osd->o_con, req->r_request); - req->r_sent = req->r_osd->o_incarnation; - return 0; -} - -/* - * Timeout callback, called every N seconds when 1 or more osd - * requests has been active for more than N seconds. When this - * happens, we ping all OSDs with requests who have timed out to - * ensure any communications channel reset is detected. Reset the - * request timeouts another N seconds in the future as we go. - * Reschedule the timeout event another N seconds in future (unless - * there are no open requests). - */ -static void handle_timeout(struct work_struct *work) -{ - struct ceph_osd_client *osdc = - container_of(work, struct ceph_osd_client, timeout_work.work); - struct ceph_osd_request *req, *last_req = NULL; - struct ceph_osd *osd; - unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; - unsigned long keepalive = - osdc->client->mount_args->osd_keepalive_timeout * HZ; - unsigned long last_stamp = 0; - struct rb_node *p; - struct list_head slow_osds; - - dout("timeout\n"); - down_read(&osdc->map_sem); - - ceph_monc_request_next_osdmap(&osdc->client->monc); - - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_osd_request, r_node); - - if (req->r_resend) { - int err; - - dout("osdc resending prev failed %lld\n", req->r_tid); - err = __send_request(osdc, req); - if (err) - dout("osdc failed again on %lld\n", req->r_tid); - else - req->r_resend = false; - continue; - } - } - - /* - * reset osds that appear to be _really_ unresponsive. this - * is a failsafe measure.. we really shouldn't be getting to - * this point if the system is working properly. the monitors - * should mark the osd as failed and we should find out about - * it from an updated osd map. - */ - while (timeout && !list_empty(&osdc->req_lru)) { - req = list_entry(osdc->req_lru.next, struct ceph_osd_request, - r_req_lru_item); - - if (time_before(jiffies, req->r_stamp + timeout)) - break; - - BUG_ON(req == last_req && req->r_stamp == last_stamp); - last_req = req; - last_stamp = req->r_stamp; - - osd = req->r_osd; - BUG_ON(!osd); - pr_warning(" tid %llu timed out on osd%d, will reset osd\n", - req->r_tid, osd->o_osd); - __kick_requests(osdc, osd); - } - - /* - * ping osds that are a bit slow. this ensures that if there - * is a break in the TCP connection we will notice, and reopen - * a connection with that osd (from the fault callback). - */ - INIT_LIST_HEAD(&slow_osds); - list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { - if (time_before(jiffies, req->r_stamp + keepalive)) - break; - - osd = req->r_osd; - BUG_ON(!osd); - dout(" tid %llu is slow, will send keepalive on osd%d\n", - req->r_tid, osd->o_osd); - list_move_tail(&osd->o_keepalive_item, &slow_osds); - } - while (!list_empty(&slow_osds)) { - osd = list_entry(slow_osds.next, struct ceph_osd, - o_keepalive_item); - list_del_init(&osd->o_keepalive_item); - ceph_con_keepalive(&osd->o_con); - } - - __schedule_osd_timeout(osdc); - mutex_unlock(&osdc->request_mutex); - - up_read(&osdc->map_sem); -} - -static void handle_osds_timeout(struct work_struct *work) -{ - struct ceph_osd_client *osdc = - container_of(work, struct ceph_osd_client, - osds_timeout_work.work); - unsigned long delay = - osdc->client->mount_args->osd_idle_ttl * HZ >> 2; - - dout("osds timeout\n"); - down_read(&osdc->map_sem); - remove_old_osds(osdc, 0); - up_read(&osdc->map_sem); - - schedule_delayed_work(&osdc->osds_timeout_work, - round_jiffies_relative(delay)); -} - -/* - * handle osd op reply. either call the callback if it is specified, - * or do the completion to wake up the waiting thread. - */ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, - struct ceph_connection *con) -{ - struct ceph_osd_reply_head *rhead = msg->front.iov_base; - struct ceph_osd_request *req; - u64 tid; - int numops, object_len, flags; - s32 result; - - tid = le64_to_cpu(msg->hdr.tid); - if (msg->front.iov_len < sizeof(*rhead)) - goto bad; - numops = le32_to_cpu(rhead->num_ops); - object_len = le32_to_cpu(rhead->object_len); - result = le32_to_cpu(rhead->result); - if (msg->front.iov_len != sizeof(*rhead) + object_len + - numops * sizeof(struct ceph_osd_op)) - goto bad; - dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); - - /* lookup */ - mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); - if (req == NULL) { - dout("handle_reply tid %llu dne\n", tid); - mutex_unlock(&osdc->request_mutex); - return; - } - ceph_osdc_get_request(req); - flags = le32_to_cpu(rhead->flags); - - /* - * if this connection filled our message, drop our reference now, to - * avoid a (safe but slower) revoke later. - */ - if (req->r_con_filling_msg == con && req->r_reply == msg) { - dout(" dropping con_filling_msg ref %p\n", con); - req->r_con_filling_msg = NULL; - ceph_con_put(con); - } - - if (!req->r_got_reply) { - unsigned bytes; - - req->r_result = le32_to_cpu(rhead->result); - bytes = le32_to_cpu(msg->hdr.data_len); - dout("handle_reply result %d bytes %d\n", req->r_result, - bytes); - if (req->r_result == 0) - req->r_result = bytes; - - /* in case this is a write and we need to replay, */ - req->r_reassert_version = rhead->reassert_version; - - req->r_got_reply = 1; - } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { - dout("handle_reply tid %llu dup ack\n", tid); - mutex_unlock(&osdc->request_mutex); - goto done; - } - - dout("handle_reply tid %llu flags %d\n", tid, flags); - - /* either this is a read, or we got the safe response */ - if (result < 0 || - (flags & CEPH_OSD_FLAG_ONDISK) || - ((flags & CEPH_OSD_FLAG_WRITE) == 0)) - __unregister_request(osdc, req); - - mutex_unlock(&osdc->request_mutex); - - if (req->r_callback) - req->r_callback(req, msg); - else - complete(&req->r_completion); - - if (flags & CEPH_OSD_FLAG_ONDISK) { - if (req->r_safe_callback) - req->r_safe_callback(req, msg); - complete(&req->r_safe_completion); /* fsync waiter */ - } - -done: - ceph_osdc_put_request(req); - return; - -bad: - pr_err("corrupt osd_op_reply got %d %d expected %d\n", - (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), - (int)sizeof(*rhead)); - ceph_msg_dump(msg); -} - - -static int __kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd) -{ - struct ceph_osd_request *req; - struct rb_node *p, *n; - int needmap = 0; - int err; - - dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); - if (kickosd) { - err = __reset_osd(osdc, kickosd); - if (err == -EAGAIN) - return 1; - } else { - for (p = rb_first(&osdc->osds); p; p = n) { - struct ceph_osd *osd = - rb_entry(p, struct ceph_osd, o_node); - - n = rb_next(p); - if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || - memcmp(&osd->o_con.peer_addr, - ceph_osd_addr(osdc->osdmap, - osd->o_osd), - sizeof(struct ceph_entity_addr)) != 0) - __reset_osd(osdc, osd); - } - } - - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_osd_request, r_node); - - if (req->r_resend) { - dout(" r_resend set on tid %llu\n", req->r_tid); - __cancel_request(req); - goto kick; - } - if (req->r_osd && kickosd == req->r_osd) { - __cancel_request(req); - goto kick; - } - - err = __map_osds(osdc, req); - if (err == 0) - continue; /* no change */ - if (err < 0) { - /* - * FIXME: really, we should set the request - * error and fail if this isn't a 'nofail' - * request, but that's a fair bit more - * complicated to do. So retry! - */ - dout(" setting r_resend on %llu\n", req->r_tid); - req->r_resend = true; - continue; - } - if (req->r_osd == NULL) { - dout("tid %llu maps to no valid osd\n", req->r_tid); - needmap++; /* request a newer map */ - continue; - } - -kick: - dout("kicking %p tid %llu osd%d\n", req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - err = __send_request(osdc, req); - if (err) { - dout(" setting r_resend on %llu\n", req->r_tid); - req->r_resend = true; - } - } - - return needmap; -} - -/* - * Resubmit osd requests whose osd or osd address has changed. Request - * a new osd map if osds are down, or we are otherwise unable to determine - * how to direct a request. - * - * Close connections to down osds. - * - * If @who is specified, resubmit requests for that specific osd. - * - * Caller should hold map_sem for read and request_mutex. - */ -static void kick_requests(struct ceph_osd_client *osdc, - struct ceph_osd *kickosd) -{ - int needmap; - - mutex_lock(&osdc->request_mutex); - needmap = __kick_requests(osdc, kickosd); - mutex_unlock(&osdc->request_mutex); - - if (needmap) { - dout("%d requests for down osds, need new map\n", needmap); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } - -} -/* - * Process updated osd map. - * - * The message contains any number of incremental and full maps, normally - * indicating some sort of topology change in the cluster. Kick requests - * off to different OSDs as needed. - */ -void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) -{ - void *p, *end, *next; - u32 nr_maps, maplen; - u32 epoch; - struct ceph_osdmap *newmap = NULL, *oldmap; - int err; - struct ceph_fsid fsid; - - dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); - p = msg->front.iov_base; - end = p + msg->front.iov_len; - - /* verify fsid */ - ceph_decode_need(&p, end, sizeof(fsid), bad); - ceph_decode_copy(&p, &fsid, sizeof(fsid)); - if (ceph_check_fsid(osdc->client, &fsid) < 0) - return; - - down_write(&osdc->map_sem); - - /* incremental maps */ - ceph_decode_32_safe(&p, end, nr_maps, bad); - dout(" %d inc maps\n", nr_maps); - while (nr_maps > 0) { - ceph_decode_need(&p, end, 2*sizeof(u32), bad); - epoch = ceph_decode_32(&p); - maplen = ceph_decode_32(&p); - ceph_decode_need(&p, end, maplen, bad); - next = p + maplen; - if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { - dout("applying incremental map %u len %d\n", - epoch, maplen); - newmap = osdmap_apply_incremental(&p, next, - osdc->osdmap, - osdc->client->msgr); - if (IS_ERR(newmap)) { - err = PTR_ERR(newmap); - goto bad; - } - BUG_ON(!newmap); - if (newmap != osdc->osdmap) { - ceph_osdmap_destroy(osdc->osdmap); - osdc->osdmap = newmap; - } - } else { - dout("ignoring incremental map %u len %d\n", - epoch, maplen); - } - p = next; - nr_maps--; - } - if (newmap) - goto done; - - /* full maps */ - ceph_decode_32_safe(&p, end, nr_maps, bad); - dout(" %d full maps\n", nr_maps); - while (nr_maps) { - ceph_decode_need(&p, end, 2*sizeof(u32), bad); - epoch = ceph_decode_32(&p); - maplen = ceph_decode_32(&p); - ceph_decode_need(&p, end, maplen, bad); - if (nr_maps > 1) { - dout("skipping non-latest full map %u len %d\n", - epoch, maplen); - } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { - dout("skipping full map %u len %d, " - "older than our %u\n", epoch, maplen, - osdc->osdmap->epoch); - } else { - dout("taking full map %u len %d\n", epoch, maplen); - newmap = osdmap_decode(&p, p+maplen); - if (IS_ERR(newmap)) { - err = PTR_ERR(newmap); - goto bad; - } - BUG_ON(!newmap); - oldmap = osdc->osdmap; - osdc->osdmap = newmap; - if (oldmap) - ceph_osdmap_destroy(oldmap); - } - p += maplen; - nr_maps--; - } - -done: - downgrade_write(&osdc->map_sem); - ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); - if (newmap) - kick_requests(osdc, NULL); - up_read(&osdc->map_sem); - wake_up(&osdc->client->auth_wq); - return; - -bad: - pr_err("osdc handle_map corrupt msg\n"); - ceph_msg_dump(msg); - up_write(&osdc->map_sem); - return; -} - -/* - * Register request, send initial attempt. - */ -int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail) -{ - int rc = 0; - - req->r_request->pages = req->r_pages; - req->r_request->nr_pages = req->r_num_pages; - - register_request(osdc, req); - - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - /* - * a racing kick_requests() may have sent the message for us - * while we dropped request_mutex above, so only send now if - * the request still han't been touched yet. - */ - if (req->r_sent == 0) { - rc = __send_request(osdc, req); - if (rc) { - if (nofail) { - dout("osdc_start_request failed send, " - " marking %lld\n", req->r_tid); - req->r_resend = true; - rc = 0; - } else { - __unregister_request(osdc, req); - } - } - } - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); - return rc; -} - -/* - * wait for a request to complete - */ -int ceph_osdc_wait_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - int rc; - - rc = wait_for_completion_interruptible(&req->r_completion); - if (rc < 0) { - mutex_lock(&osdc->request_mutex); - __cancel_request(req); - __unregister_request(osdc, req); - mutex_unlock(&osdc->request_mutex); - dout("wait_request tid %llu canceled/timed out\n", req->r_tid); - return rc; - } - - dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); - return req->r_result; -} - -/* - * sync - wait for all in-flight requests to flush. avoid starvation. - */ -void ceph_osdc_sync(struct ceph_osd_client *osdc) -{ - struct ceph_osd_request *req; - u64 last_tid, next_tid = 0; - - mutex_lock(&osdc->request_mutex); - last_tid = osdc->last_tid; - while (1) { - req = __lookup_request_ge(osdc, next_tid); - if (!req) - break; - if (req->r_tid > last_tid) - break; - - next_tid = req->r_tid + 1; - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) - continue; - - ceph_osdc_get_request(req); - mutex_unlock(&osdc->request_mutex); - dout("sync waiting on tid %llu (last is %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_safe_completion); - mutex_lock(&osdc->request_mutex); - ceph_osdc_put_request(req); - } - mutex_unlock(&osdc->request_mutex); - dout("sync done (thru tid %llu)\n", last_tid); -} - -/* - * init, shutdown - */ -int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) -{ - int err; - - dout("init\n"); - osdc->client = client; - osdc->osdmap = NULL; - init_rwsem(&osdc->map_sem); - init_completion(&osdc->map_waiters); - osdc->last_requested_map = 0; - mutex_init(&osdc->request_mutex); - osdc->last_tid = 0; - osdc->osds = RB_ROOT; - INIT_LIST_HEAD(&osdc->osd_lru); - osdc->requests = RB_ROOT; - INIT_LIST_HEAD(&osdc->req_lru); - osdc->num_requests = 0; - INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); - INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); - - schedule_delayed_work(&osdc->osds_timeout_work, - round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ)); - - err = -ENOMEM; - osdc->req_mempool = mempool_create_kmalloc_pool(10, - sizeof(struct ceph_osd_request)); - if (!osdc->req_mempool) - goto out; - - err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, - "osd_op"); - if (err < 0) - goto out_mempool; - err = ceph_msgpool_init(&osdc->msgpool_op_reply, - OSD_OPREPLY_FRONT_LEN, 10, true, - "osd_op_reply"); - if (err < 0) - goto out_msgpool; - return 0; - -out_msgpool: - ceph_msgpool_destroy(&osdc->msgpool_op); -out_mempool: - mempool_destroy(osdc->req_mempool); -out: - return err; -} - -void ceph_osdc_stop(struct ceph_osd_client *osdc) -{ - cancel_delayed_work_sync(&osdc->timeout_work); - cancel_delayed_work_sync(&osdc->osds_timeout_work); - if (osdc->osdmap) { - ceph_osdmap_destroy(osdc->osdmap); - osdc->osdmap = NULL; - } - remove_old_osds(osdc, 1); - mempool_destroy(osdc->req_mempool); - ceph_msgpool_destroy(&osdc->msgpool_op); - ceph_msgpool_destroy(&osdc->msgpool_op_reply); -} - -/* - * Read some contiguous pages. If we cross a stripe boundary, shorten - * *plen. Return number of bytes read, or error. - */ -int ceph_osdc_readpages(struct ceph_osd_client *osdc, - struct ceph_vino vino, struct ceph_file_layout *layout, - u64 off, u64 *plen, - u32 truncate_seq, u64 truncate_size, - struct page **pages, int num_pages) -{ - struct ceph_osd_request *req; - int rc = 0; - - dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, - vino.snap, off, *plen); - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, truncate_seq, truncate_size, NULL, - false, 1); - if (!req) - return -ENOMEM; - - /* it may be a short read due to an object boundary */ - req->r_pages = pages; - num_pages = calc_pages_for(off, *plen); - req->r_num_pages = num_pages; - - dout("readpages final extent is %llu~%llu (%d pages)\n", - off, *plen, req->r_num_pages); - - rc = ceph_osdc_start_request(osdc, req, false); - if (!rc) - rc = ceph_osdc_wait_request(osdc, req); - - ceph_osdc_put_request(req); - dout("readpages result %d\n", rc); - return rc; -} - -/* - * do a synchronous write on N pages - */ -int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, - struct ceph_file_layout *layout, - struct ceph_snap_context *snapc, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - struct timespec *mtime, - struct page **pages, int num_pages, - int flags, int do_sync, bool nofail) -{ - struct ceph_osd_request *req; - int rc = 0; - - BUG_ON(vino.snap != CEPH_NOSNAP); - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, - CEPH_OSD_OP_WRITE, - flags | CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE, - snapc, do_sync, - truncate_seq, truncate_size, mtime, - nofail, 1); - if (!req) - return -ENOMEM; - - /* it may be a short write due to an object boundary */ - req->r_pages = pages; - req->r_num_pages = calc_pages_for(off, len); - dout("writepages %llu~%llu (%d pages)\n", off, len, - req->r_num_pages); - - rc = ceph_osdc_start_request(osdc, req, nofail); - if (!rc) - rc = ceph_osdc_wait_request(osdc, req); - - ceph_osdc_put_request(req); - if (rc == 0) - rc = len; - dout("writepages result %d\n", rc); - return rc; -} - -/* - * handle incoming message - */ -static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) -{ - struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; - int type = le16_to_cpu(msg->hdr.type); - - if (!osd) - goto out; - osdc = osd->o_osdc; - - switch (type) { - case CEPH_MSG_OSD_MAP: - ceph_osdc_handle_map(osdc, msg); - break; - case CEPH_MSG_OSD_OPREPLY: - handle_reply(osdc, msg, con); - break; - - default: - pr_err("received unknown message type %d %s\n", type, - ceph_msg_type_name(type)); - } -out: - ceph_msg_put(msg); -} - -/* - * lookup and return message for incoming reply. set up reply message - * pages. - */ -static struct ceph_msg *get_reply(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) -{ - struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc = osd->o_osdc; - struct ceph_msg *m; - struct ceph_osd_request *req; - int front = le32_to_cpu(hdr->front_len); - int data_len = le32_to_cpu(hdr->data_len); - u64 tid; - - tid = le64_to_cpu(hdr->tid); - mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); - if (!req) { - *skip = 1; - m = NULL; - pr_info("get_reply unknown tid %llu from osd%d\n", tid, - osd->o_osd); - goto out; - } - - if (req->r_con_filling_msg) { - dout("get_reply revoking msg %p from old con %p\n", - req->r_reply, req->r_con_filling_msg); - ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); - ceph_con_put(req->r_con_filling_msg); - req->r_con_filling_msg = NULL; - } - - if (front > req->r_reply->front.iov_len) { - pr_warning("get_reply front %d > preallocated %d\n", - front, (int)req->r_reply->front.iov_len); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); - if (!m) - goto out; - ceph_msg_put(req->r_reply); - req->r_reply = m; - } - m = ceph_msg_get(req->r_reply); - - if (data_len > 0) { - unsigned data_off = le16_to_cpu(hdr->data_off); - int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); - - if (unlikely(req->r_num_pages < want)) { - pr_warning("tid %lld reply %d > expected %d pages\n", - tid, want, m->nr_pages); - *skip = 1; - ceph_msg_put(m); - m = NULL; - goto out; - } - m->pages = req->r_pages; - m->nr_pages = req->r_num_pages; - } - *skip = 0; - req->r_con_filling_msg = ceph_con_get(con); - dout("get_reply tid %lld %p\n", tid, m); - -out: - mutex_unlock(&osdc->request_mutex); - return m; - -} - -static struct ceph_msg *alloc_msg(struct ceph_connection *con, - struct ceph_msg_header *hdr, - int *skip) -{ - struct ceph_osd *osd = con->private; - int type = le16_to_cpu(hdr->type); - int front = le32_to_cpu(hdr->front_len); - - switch (type) { - case CEPH_MSG_OSD_MAP: - return ceph_msg_new(type, front, GFP_NOFS); - case CEPH_MSG_OSD_OPREPLY: - return get_reply(con, hdr, skip); - default: - pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, - osd->o_osd); - *skip = 1; - return NULL; - } -} - -/* - * Wrappers to refcount containing ceph_osd struct - */ -static struct ceph_connection *get_osd_con(struct ceph_connection *con) -{ - struct ceph_osd *osd = con->private; - if (get_osd(osd)) - return con; - return NULL; -} - -static void put_osd_con(struct ceph_connection *con) -{ - struct ceph_osd *osd = con->private; - put_osd(osd); -} - -/* - * authentication - */ -static int get_authorizer(struct ceph_connection *con, - void **buf, int *len, int *proto, - void **reply_buf, int *reply_len, int force_new) -{ - struct ceph_osd *o = con->private; - struct ceph_osd_client *osdc = o->o_osdc; - struct ceph_auth_client *ac = osdc->client->monc.auth; - int ret = 0; - - if (force_new && o->o_authorizer) { - ac->ops->destroy_authorizer(ac, o->o_authorizer); - o->o_authorizer = NULL; - } - if (o->o_authorizer == NULL) { - ret = ac->ops->create_authorizer( - ac, CEPH_ENTITY_TYPE_OSD, - &o->o_authorizer, - &o->o_authorizer_buf, - &o->o_authorizer_buf_len, - &o->o_authorizer_reply_buf, - &o->o_authorizer_reply_buf_len); - if (ret) - return ret; - } - - *proto = ac->protocol; - *buf = o->o_authorizer_buf; - *len = o->o_authorizer_buf_len; - *reply_buf = o->o_authorizer_reply_buf; - *reply_len = o->o_authorizer_reply_buf_len; - return 0; -} - - -static int verify_authorizer_reply(struct ceph_connection *con, int len) -{ - struct ceph_osd *o = con->private; - struct ceph_osd_client *osdc = o->o_osdc; - struct ceph_auth_client *ac = osdc->client->monc.auth; - - return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len); -} - -static int invalidate_authorizer(struct ceph_connection *con) -{ - struct ceph_osd *o = con->private; - struct ceph_osd_client *osdc = o->o_osdc; - struct ceph_auth_client *ac = osdc->client->monc.auth; - - if (ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); - - return ceph_monc_validate_auth(&osdc->client->monc); -} - -static const struct ceph_connection_operations osd_con_ops = { - .get = get_osd_con, - .put = put_osd_con, - .dispatch = dispatch, - .get_authorizer = get_authorizer, - .verify_authorizer_reply = verify_authorizer_reply, - .invalidate_authorizer = invalidate_authorizer, - .alloc_msg = alloc_msg, - .fault = osd_reset, -}; diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h deleted file mode 100644 index ce776989ef6..00000000000 --- a/fs/ceph/osd_client.h +++ /dev/null @@ -1,167 +0,0 @@ -#ifndef _FS_CEPH_OSD_CLIENT_H -#define _FS_CEPH_OSD_CLIENT_H - -#include <linux/completion.h> -#include <linux/kref.h> -#include <linux/mempool.h> -#include <linux/rbtree.h> - -#include "types.h" -#include "osdmap.h" -#include "messenger.h" - -struct ceph_msg; -struct ceph_snap_context; -struct ceph_osd_request; -struct ceph_osd_client; -struct ceph_authorizer; - -/* - * completion callback for async writepages - */ -typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, - struct ceph_msg *); - -/* a given osd we're communicating with */ -struct ceph_osd { - atomic_t o_ref; - struct ceph_osd_client *o_osdc; - int o_osd; - int o_incarnation; - struct rb_node o_node; - struct ceph_connection o_con; - struct list_head o_requests; - struct list_head o_osd_lru; - struct ceph_authorizer *o_authorizer; - void *o_authorizer_buf, *o_authorizer_reply_buf; - size_t o_authorizer_buf_len, o_authorizer_reply_buf_len; - unsigned long lru_ttl; - int o_marked_for_keepalive; - struct list_head o_keepalive_item; -}; - -/* an in-flight request */ -struct ceph_osd_request { - u64 r_tid; /* unique for this client */ - struct rb_node r_node; - struct list_head r_req_lru_item; - struct list_head r_osd_item; - struct ceph_osd *r_osd; - struct ceph_pg r_pgid; - int r_pg_osds[CEPH_PG_MAX_SIZE]; - int r_num_pg_osds; - - struct ceph_connection *r_con_filling_msg; - - struct ceph_msg *r_request, *r_reply; - int r_result; - int r_flags; /* any additional flags for the osd */ - u32 r_sent; /* >0 if r_request is sending/sent */ - int r_got_reply; - - struct ceph_osd_client *r_osdc; - struct kref r_kref; - bool r_mempool; - struct completion r_completion, r_safe_completion; - ceph_osdc_callback_t r_callback, r_safe_callback; - struct ceph_eversion r_reassert_version; - struct list_head r_unsafe_item; - - struct inode *r_inode; /* for use by callbacks */ - - char r_oid[40]; /* object name */ - int r_oid_len; - unsigned long r_stamp; /* send OR check time */ - bool r_resend; /* msg send failed, needs retry */ - - struct ceph_file_layout r_file_layout; - struct ceph_snap_context *r_snapc; /* snap context for writes */ - unsigned r_num_pages; /* size of page array (follows) */ - struct page **r_pages; /* pages for data payload */ - int r_pages_from_pool; - int r_own_pages; /* if true, i own page list */ -}; - -struct ceph_osd_client { - struct ceph_client *client; - - struct ceph_osdmap *osdmap; /* current map */ - struct rw_semaphore map_sem; - struct completion map_waiters; - u64 last_requested_map; - - struct mutex request_mutex; - struct rb_root osds; /* osds */ - struct list_head osd_lru; /* idle osds */ - u64 timeout_tid; /* tid of timeout triggering rq */ - u64 last_tid; /* tid of last request */ - struct rb_root requests; /* pending requests */ - struct list_head req_lru; /* pending requests lru */ - int num_requests; - struct delayed_work timeout_work; - struct delayed_work osds_timeout_work; -#ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_file; -#endif - - mempool_t *req_mempool; - - struct ceph_msgpool msgpool_op; - struct ceph_msgpool msgpool_op_reply; -}; - -extern int ceph_osdc_init(struct ceph_osd_client *osdc, - struct ceph_client *client); -extern void ceph_osdc_stop(struct ceph_osd_client *osdc); - -extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, - struct ceph_msg *msg); -extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, - struct ceph_msg *msg); - -extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, - struct ceph_file_layout *layout, - struct ceph_vino vino, - u64 offset, u64 *len, int op, int flags, - struct ceph_snap_context *snapc, - int do_sync, u32 truncate_seq, - u64 truncate_size, - struct timespec *mtime, - bool use_mempool, int num_reply); - -static inline void ceph_osdc_get_request(struct ceph_osd_request *req) -{ - kref_get(&req->r_kref); -} -extern void ceph_osdc_release_request(struct kref *kref); -static inline void ceph_osdc_put_request(struct ceph_osd_request *req) -{ - kref_put(&req->r_kref, ceph_osdc_release_request); -} - -extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail); -extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -extern void ceph_osdc_sync(struct ceph_osd_client *osdc); - -extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - u32 truncate_seq, u64 truncate_size, - struct page **pages, int nr_pages); - -extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - struct ceph_snap_context *sc, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - struct timespec *mtime, - struct page **pages, int nr_pages, - int flags, int do_sync, bool nofail); - -#endif - diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c deleted file mode 100644 index 277f8b33957..00000000000 --- a/fs/ceph/osdmap.c +++ /dev/null @@ -1,1083 +0,0 @@ - -#include "ceph_debug.h" - -#include <linux/slab.h> -#include <asm/div64.h> - -#include "super.h" -#include "osdmap.h" -#include "crush/hash.h" -#include "crush/mapper.h" -#include "decode.h" - -char *ceph_osdmap_state_str(char *str, int len, int state) -{ - int flag = 0; - - if (!len) - goto done; - - *str = '\0'; - if (state) { - if (state & CEPH_OSD_EXISTS) { - snprintf(str, len, "exists"); - flag = 1; - } - if (state & CEPH_OSD_UP) { - snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), - "up"); - flag = 1; - } - } else { - snprintf(str, len, "doesn't exist"); - } -done: - return str; -} - -/* maps */ - -static int calc_bits_of(unsigned t) -{ - int b = 0; - while (t) { - t = t >> 1; - b++; - } - return b; -} - -/* - * the foo_mask is the smallest value 2^n-1 that is >= foo. - */ -static void calc_pg_masks(struct ceph_pg_pool_info *pi) -{ - pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1; - pi->pgp_num_mask = - (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1; - pi->lpg_num_mask = - (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1; - pi->lpgp_num_mask = - (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1; -} - -/* - * decode crush map - */ -static int crush_decode_uniform_bucket(void **p, void *end, - struct crush_bucket_uniform *b) -{ - dout("crush_decode_uniform_bucket %p to %p\n", *p, end); - ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); - b->item_weight = ceph_decode_32(p); - return 0; -bad: - return -EINVAL; -} - -static int crush_decode_list_bucket(void **p, void *end, - struct crush_bucket_list *b) -{ - int j; - dout("crush_decode_list_bucket %p to %p\n", *p, end); - b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); - if (b->item_weights == NULL) - return -ENOMEM; - b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); - if (b->sum_weights == NULL) - return -ENOMEM; - ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); - for (j = 0; j < b->h.size; j++) { - b->item_weights[j] = ceph_decode_32(p); - b->sum_weights[j] = ceph_decode_32(p); - } - return 0; -bad: - return -EINVAL; -} - -static int crush_decode_tree_bucket(void **p, void *end, - struct crush_bucket_tree *b) -{ - int j; - dout("crush_decode_tree_bucket %p to %p\n", *p, end); - ceph_decode_32_safe(p, end, b->num_nodes, bad); - b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); - if (b->node_weights == NULL) - return -ENOMEM; - ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); - for (j = 0; j < b->num_nodes; j++) - b->node_weights[j] = ceph_decode_32(p); - return 0; -bad: - return -EINVAL; -} - -static int crush_decode_straw_bucket(void **p, void *end, - struct crush_bucket_straw *b) -{ - int j; - dout("crush_decode_straw_bucket %p to %p\n", *p, end); - b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); - if (b->item_weights == NULL) - return -ENOMEM; - b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); - if (b->straws == NULL) - return -ENOMEM; - ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); - for (j = 0; j < b->h.size; j++) { - b->item_weights[j] = ceph_decode_32(p); - b->straws[j] = ceph_decode_32(p); - } - return 0; -bad: - return -EINVAL; -} - -static struct crush_map *crush_decode(void *pbyval, void *end) -{ - struct crush_map *c; - int err = -EINVAL; - int i, j; - void **p = &pbyval; - void *start = pbyval; - u32 magic; - - dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); - - c = kzalloc(sizeof(*c), GFP_NOFS); - if (c == NULL) - return ERR_PTR(-ENOMEM); - - ceph_decode_need(p, end, 4*sizeof(u32), bad); - magic = ceph_decode_32(p); - if (magic != CRUSH_MAGIC) { - pr_err("crush_decode magic %x != current %x\n", - (unsigned)magic, (unsigned)CRUSH_MAGIC); - goto bad; - } - c->max_buckets = ceph_decode_32(p); - c->max_rules = ceph_decode_32(p); - c->max_devices = ceph_decode_32(p); - - c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS); - if (c->device_parents == NULL) - goto badmem; - c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS); - if (c->bucket_parents == NULL) - goto badmem; - - c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); - if (c->buckets == NULL) - goto badmem; - c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); - if (c->rules == NULL) - goto badmem; - - /* buckets */ - for (i = 0; i < c->max_buckets; i++) { - int size = 0; - u32 alg; - struct crush_bucket *b; - - ceph_decode_32_safe(p, end, alg, bad); - if (alg == 0) { - c->buckets[i] = NULL; - continue; - } - dout("crush_decode bucket %d off %x %p to %p\n", - i, (int)(*p-start), *p, end); - - switch (alg) { - case CRUSH_BUCKET_UNIFORM: - size = sizeof(struct crush_bucket_uniform); - break; - case CRUSH_BUCKET_LIST: - size = sizeof(struct crush_bucket_list); - break; - case CRUSH_BUCKET_TREE: - size = sizeof(struct crush_bucket_tree); - break; - case CRUSH_BUCKET_STRAW: - size = sizeof(struct crush_bucket_straw); - break; - default: - err = -EINVAL; - goto bad; - } - BUG_ON(size == 0); - b = c->buckets[i] = kzalloc(size, GFP_NOFS); - if (b == NULL) - goto badmem; - - ceph_decode_need(p, end, 4*sizeof(u32), bad); - b->id = ceph_decode_32(p); - b->type = ceph_decode_16(p); - b->alg = ceph_decode_8(p); - b->hash = ceph_decode_8(p); - b->weight = ceph_decode_32(p); - b->size = ceph_decode_32(p); - - dout("crush_decode bucket size %d off %x %p to %p\n", - b->size, (int)(*p-start), *p, end); - - b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); - if (b->items == NULL) - goto badmem; - b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); - if (b->perm == NULL) - goto badmem; - b->perm_n = 0; - - ceph_decode_need(p, end, b->size*sizeof(u32), bad); - for (j = 0; j < b->size; j++) - b->items[j] = ceph_decode_32(p); - - switch (b->alg) { - case CRUSH_BUCKET_UNIFORM: - err = crush_decode_uniform_bucket(p, end, - (struct crush_bucket_uniform *)b); - if (err < 0) - goto bad; - break; - case CRUSH_BUCKET_LIST: - err = crush_decode_list_bucket(p, end, - (struct crush_bucket_list *)b); - if (err < 0) - goto bad; - break; - case CRUSH_BUCKET_TREE: - err = crush_decode_tree_bucket(p, end, - (struct crush_bucket_tree *)b); - if (err < 0) - goto bad; - break; - case CRUSH_BUCKET_STRAW: - err = crush_decode_straw_bucket(p, end, - (struct crush_bucket_straw *)b); - if (err < 0) - goto bad; - break; - } - } - - /* rules */ - dout("rule vec is %p\n", c->rules); - for (i = 0; i < c->max_rules; i++) { - u32 yes; - struct crush_rule *r; - - ceph_decode_32_safe(p, end, yes, bad); - if (!yes) { - dout("crush_decode NO rule %d off %x %p to %p\n", - i, (int)(*p-start), *p, end); - c->rules[i] = NULL; - continue; - } - - dout("crush_decode rule %d off %x %p to %p\n", - i, (int)(*p-start), *p, end); - - /* len */ - ceph_decode_32_safe(p, end, yes, bad); -#if BITS_PER_LONG == 32 - err = -EINVAL; - if (yes > ULONG_MAX / sizeof(struct crush_rule_step)) - goto bad; -#endif - r = c->rules[i] = kmalloc(sizeof(*r) + - yes*sizeof(struct crush_rule_step), - GFP_NOFS); - if (r == NULL) - goto badmem; - dout(" rule %d is at %p\n", i, r); - r->len = yes; - ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ - ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); - for (j = 0; j < r->len; j++) { - r->steps[j].op = ceph_decode_32(p); - r->steps[j].arg1 = ceph_decode_32(p); - r->steps[j].arg2 = ceph_decode_32(p); - } - } - - /* ignore trailing name maps. */ - - dout("crush_decode success\n"); - return c; - -badmem: - err = -ENOMEM; -bad: - dout("crush_decode fail %d\n", err); - crush_destroy(c); - return ERR_PTR(err); -} - -/* - * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) - */ -static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) -{ - u64 a = *(u64 *)&l; - u64 b = *(u64 *)&r; - - if (a < b) - return -1; - if (a > b) - return 1; - return 0; -} - -static int __insert_pg_mapping(struct ceph_pg_mapping *new, - struct rb_root *root) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct ceph_pg_mapping *pg = NULL; - int c; - - while (*p) { - parent = *p; - pg = rb_entry(parent, struct ceph_pg_mapping, node); - c = pgid_cmp(new->pgid, pg->pgid); - if (c < 0) - p = &(*p)->rb_left; - else if (c > 0) - p = &(*p)->rb_right; - else - return -EEXIST; - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, root); - return 0; -} - -static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, - struct ceph_pg pgid) -{ - struct rb_node *n = root->rb_node; - struct ceph_pg_mapping *pg; - int c; - - while (n) { - pg = rb_entry(n, struct ceph_pg_mapping, node); - c = pgid_cmp(pgid, pg->pgid); - if (c < 0) - n = n->rb_left; - else if (c > 0) - n = n->rb_right; - else - return pg; - } - return NULL; -} - -/* - * rbtree of pg pool info - */ -static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct ceph_pg_pool_info *pi = NULL; - - while (*p) { - parent = *p; - pi = rb_entry(parent, struct ceph_pg_pool_info, node); - if (new->id < pi->id) - p = &(*p)->rb_left; - else if (new->id > pi->id) - p = &(*p)->rb_right; - else - return -EEXIST; - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, root); - return 0; -} - -static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) -{ - struct ceph_pg_pool_info *pi; - struct rb_node *n = root->rb_node; - - while (n) { - pi = rb_entry(n, struct ceph_pg_pool_info, node); - if (id < pi->id) - n = n->rb_left; - else if (id > pi->id) - n = n->rb_right; - else - return pi; - } - return NULL; -} - -static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) -{ - rb_erase(&pi->node, root); - kfree(pi->name); - kfree(pi); -} - -void __decode_pool(void **p, struct ceph_pg_pool_info *pi) -{ - ceph_decode_copy(p, &pi->v, sizeof(pi->v)); - calc_pg_masks(pi); - *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); - *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; -} - -static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) -{ - struct ceph_pg_pool_info *pi; - u32 num, len, pool; - - ceph_decode_32_safe(p, end, num, bad); - dout(" %d pool names\n", num); - while (num--) { - ceph_decode_32_safe(p, end, pool, bad); - ceph_decode_32_safe(p, end, len, bad); - dout(" pool %d len %d\n", pool, len); - pi = __lookup_pg_pool(&map->pg_pools, pool); - if (pi) { - kfree(pi->name); - pi->name = kmalloc(len + 1, GFP_NOFS); - if (pi->name) { - memcpy(pi->name, *p, len); - pi->name[len] = '\0'; - dout(" name is %s\n", pi->name); - } - } - *p += len; - } - return 0; - -bad: - return -EINVAL; -} - -/* - * osd map - */ -void ceph_osdmap_destroy(struct ceph_osdmap *map) -{ - dout("osdmap_destroy %p\n", map); - if (map->crush) - crush_destroy(map->crush); - while (!RB_EMPTY_ROOT(&map->pg_temp)) { - struct ceph_pg_mapping *pg = - rb_entry(rb_first(&map->pg_temp), - struct ceph_pg_mapping, node); - rb_erase(&pg->node, &map->pg_temp); - kfree(pg); - } - while (!RB_EMPTY_ROOT(&map->pg_pools)) { - struct ceph_pg_pool_info *pi = - rb_entry(rb_first(&map->pg_pools), - struct ceph_pg_pool_info, node); - __remove_pg_pool(&map->pg_pools, pi); - } - kfree(map->osd_state); - kfree(map->osd_weight); - kfree(map->osd_addr); - kfree(map); -} - -/* - * adjust max osd value. reallocate arrays. - */ -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) -{ - u8 *state; - struct ceph_entity_addr *addr; - u32 *weight; - - state = kcalloc(max, sizeof(*state), GFP_NOFS); - addr = kcalloc(max, sizeof(*addr), GFP_NOFS); - weight = kcalloc(max, sizeof(*weight), GFP_NOFS); - if (state == NULL || addr == NULL || weight == NULL) { - kfree(state); - kfree(addr); - kfree(weight); - return -ENOMEM; - } - - /* copy old? */ - if (map->osd_state) { - memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); - memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); - memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); - kfree(map->osd_state); - kfree(map->osd_addr); - kfree(map->osd_weight); - } - - map->osd_state = state; - map->osd_weight = weight; - map->osd_addr = addr; - map->max_osd = max; - return 0; -} - -/* - * decode a full map. - */ -struct ceph_osdmap *osdmap_decode(void **p, void *end) -{ - struct ceph_osdmap *map; - u16 version; - u32 len, max, i; - u8 ev; - int err = -EINVAL; - void *start = *p; - struct ceph_pg_pool_info *pi; - - dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); - - map = kzalloc(sizeof(*map), GFP_NOFS); - if (map == NULL) - return ERR_PTR(-ENOMEM); - map->pg_temp = RB_ROOT; - - ceph_decode_16_safe(p, end, version, bad); - if (version > CEPH_OSDMAP_VERSION) { - pr_warning("got unknown v %d > %d of osdmap\n", version, - CEPH_OSDMAP_VERSION); - goto bad; - } - - ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); - ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); - map->epoch = ceph_decode_32(p); - ceph_decode_copy(p, &map->created, sizeof(map->created)); - ceph_decode_copy(p, &map->modified, sizeof(map->modified)); - - ceph_decode_32_safe(p, end, max, bad); - while (max--) { - ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); - pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) - goto bad; - pi->id = ceph_decode_32(p); - ev = ceph_decode_8(p); /* encoding version */ - if (ev > CEPH_PG_POOL_VERSION) { - pr_warning("got unknown v %d > %d of ceph_pg_pool\n", - ev, CEPH_PG_POOL_VERSION); - kfree(pi); - goto bad; - } - __decode_pool(p, pi); - __insert_pg_pool(&map->pg_pools, pi); - } - - if (version >= 5 && __decode_pool_names(p, end, map) < 0) - goto bad; - - ceph_decode_32_safe(p, end, map->pool_max, bad); - - ceph_decode_32_safe(p, end, map->flags, bad); - - max = ceph_decode_32(p); - - /* (re)alloc osd arrays */ - err = osdmap_set_max_osd(map, max); - if (err < 0) - goto bad; - dout("osdmap_decode max_osd = %d\n", map->max_osd); - - /* osds */ - err = -EINVAL; - ceph_decode_need(p, end, 3*sizeof(u32) + - map->max_osd*(1 + sizeof(*map->osd_weight) + - sizeof(*map->osd_addr)), bad); - *p += 4; /* skip length field (should match max) */ - ceph_decode_copy(p, map->osd_state, map->max_osd); - - *p += 4; /* skip length field (should match max) */ - for (i = 0; i < map->max_osd; i++) - map->osd_weight[i] = ceph_decode_32(p); - - *p += 4; /* skip length field (should match max) */ - ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); - for (i = 0; i < map->max_osd; i++) - ceph_decode_addr(&map->osd_addr[i]); - - /* pg_temp */ - ceph_decode_32_safe(p, end, len, bad); - for (i = 0; i < len; i++) { - int n, j; - struct ceph_pg pgid; - struct ceph_pg_mapping *pg; - - ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad); - ceph_decode_copy(p, &pgid, sizeof(pgid)); - n = ceph_decode_32(p); - ceph_decode_need(p, end, n * sizeof(u32), bad); - err = -ENOMEM; - pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS); - if (!pg) - goto bad; - pg->pgid = pgid; - pg->len = n; - for (j = 0; j < n; j++) - pg->osds[j] = ceph_decode_32(p); - - err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) - goto bad; - dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len); - } - - /* crush */ - ceph_decode_32_safe(p, end, len, bad); - dout("osdmap_decode crush len %d from off 0x%x\n", len, - (int)(*p - start)); - ceph_decode_need(p, end, len, bad); - map->crush = crush_decode(*p, end); - *p += len; - if (IS_ERR(map->crush)) { - err = PTR_ERR(map->crush); - map->crush = NULL; - goto bad; - } - - /* ignore the rest of the map */ - *p = end; - - dout("osdmap_decode done %p %p\n", *p, end); - return map; - -bad: - dout("osdmap_decode fail\n"); - ceph_osdmap_destroy(map); - return ERR_PTR(err); -} - -/* - * decode and apply an incremental map update. - */ -struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr) -{ - struct crush_map *newcrush = NULL; - struct ceph_fsid fsid; - u32 epoch = 0; - struct ceph_timespec modified; - u32 len, pool; - __s32 new_pool_max, new_flags, max; - void *start = *p; - int err = -EINVAL; - u16 version; - struct rb_node *rbp; - - ceph_decode_16_safe(p, end, version, bad); - if (version > CEPH_OSDMAP_INC_VERSION) { - pr_warning("got unknown v %d > %d of inc osdmap\n", version, - CEPH_OSDMAP_INC_VERSION); - goto bad; - } - - ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), - bad); - ceph_decode_copy(p, &fsid, sizeof(fsid)); - epoch = ceph_decode_32(p); - BUG_ON(epoch != map->epoch+1); - ceph_decode_copy(p, &modified, sizeof(modified)); - new_pool_max = ceph_decode_32(p); - new_flags = ceph_decode_32(p); - - /* full map? */ - ceph_decode_32_safe(p, end, len, bad); - if (len > 0) { - dout("apply_incremental full map len %d, %p to %p\n", - len, *p, end); - return osdmap_decode(p, min(*p+len, end)); - } - - /* new crush? */ - ceph_decode_32_safe(p, end, len, bad); - if (len > 0) { - dout("apply_incremental new crush map len %d, %p to %p\n", - len, *p, end); - newcrush = crush_decode(*p, min(*p+len, end)); - if (IS_ERR(newcrush)) - return ERR_CAST(newcrush); - *p += len; - } - - /* new flags? */ - if (new_flags >= 0) - map->flags = new_flags; - if (new_pool_max >= 0) - map->pool_max = new_pool_max; - - ceph_decode_need(p, end, 5*sizeof(u32), bad); - - /* new max? */ - max = ceph_decode_32(p); - if (max >= 0) { - err = osdmap_set_max_osd(map, max); - if (err < 0) - goto bad; - } - - map->epoch++; - map->modified = map->modified; - if (newcrush) { - if (map->crush) - crush_destroy(map->crush); - map->crush = newcrush; - newcrush = NULL; - } - - /* new_pool */ - ceph_decode_32_safe(p, end, len, bad); - while (len--) { - __u8 ev; - struct ceph_pg_pool_info *pi; - - ceph_decode_32_safe(p, end, pool, bad); - ceph_decode_need(p, end, 1 + sizeof(pi->v), bad); - ev = ceph_decode_8(p); /* encoding version */ - if (ev > CEPH_PG_POOL_VERSION) { - pr_warning("got unknown v %d > %d of ceph_pg_pool\n", - ev, CEPH_PG_POOL_VERSION); - goto bad; - } - pi = __lookup_pg_pool(&map->pg_pools, pool); - if (!pi) { - pi = kzalloc(sizeof(*pi), GFP_NOFS); - if (!pi) { - err = -ENOMEM; - goto bad; - } - pi->id = pool; - __insert_pg_pool(&map->pg_pools, pi); - } - __decode_pool(p, pi); - } - if (version >= 5 && __decode_pool_names(p, end, map) < 0) - goto bad; - - /* old_pool */ - ceph_decode_32_safe(p, end, len, bad); - while (len--) { - struct ceph_pg_pool_info *pi; - - ceph_decode_32_safe(p, end, pool, bad); - pi = __lookup_pg_pool(&map->pg_pools, pool); - if (pi) - __remove_pg_pool(&map->pg_pools, pi); - } - - /* new_up */ - err = -EINVAL; - ceph_decode_32_safe(p, end, len, bad); - while (len--) { - u32 osd; - struct ceph_entity_addr addr; - ceph_decode_32_safe(p, end, osd, bad); - ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); - ceph_decode_addr(&addr); - pr_info("osd%d up\n", osd); - BUG_ON(osd >= map->max_osd); - map->osd_state[osd] |= CEPH_OSD_UP; - map->osd_addr[osd] = addr; - } - - /* new_down */ - ceph_decode_32_safe(p, end, len, bad); - while (len--) { - u32 osd; - ceph_decode_32_safe(p, end, osd, bad); - (*p)++; /* clean flag */ - pr_info("osd%d down\n", osd); - if (osd < map->max_osd) - map->osd_state[osd] &= ~CEPH_OSD_UP; - } - - /* new_weight */ - ceph_decode_32_safe(p, end, len, bad); - while (len--) { - u32 osd, off; - ceph_decode_need(p, end, sizeof(u32)*2, bad); - osd = ceph_decode_32(p); - off = ceph_decode_32(p); - pr_info("osd%d weight 0x%x %s\n", osd, off, - off == CEPH_OSD_IN ? "(in)" : - (off == CEPH_OSD_OUT ? "(out)" : "")); - if (osd < map->max_osd) - map->osd_weight[osd] = off; - } - - /* new_pg_temp */ - rbp = rb_first(&map->pg_temp); - ceph_decode_32_safe(p, end, len, bad); - while (len--) { - struct ceph_pg_mapping *pg; - int j; - struct ceph_pg pgid; - u32 pglen; - ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); - ceph_decode_copy(p, &pgid, sizeof(pgid)); - pglen = ceph_decode_32(p); - - /* remove any? */ - while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping, - node)->pgid, pgid) <= 0) { - struct rb_node *cur = rbp; - rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); - } - - if (pglen) { - /* insert */ - ceph_decode_need(p, end, pglen*sizeof(u32), bad); - pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS); - if (!pg) { - err = -ENOMEM; - goto bad; - } - pg->pgid = pgid; - pg->len = pglen; - for (j = 0; j < pglen; j++) - pg->osds[j] = ceph_decode_32(p); - err = __insert_pg_mapping(pg, &map->pg_temp); - if (err) - goto bad; - dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, - pglen); - } - } - while (rbp) { - struct rb_node *cur = rbp; - rbp = rb_next(rbp); - dout(" removed pg_temp %llx\n", - *(u64 *)&rb_entry(cur, struct ceph_pg_mapping, - node)->pgid); - rb_erase(cur, &map->pg_temp); - } - - /* ignore the rest */ - *p = end; - return map; - -bad: - pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", - epoch, (int)(*p - start), *p, start, end); - print_hex_dump(KERN_DEBUG, "osdmap: ", - DUMP_PREFIX_OFFSET, 16, 1, - start, end - start, true); - if (newcrush) - crush_destroy(newcrush); - return ERR_PTR(err); -} - - - - -/* - * calculate file layout from given offset, length. - * fill in correct oid, logical length, and object extent - * offset, length. - * - * for now, we write only a single su, until we can - * pass a stride back to the caller. - */ -void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, - u64 *ono, - u64 *oxoff, u64 *oxlen) -{ - u32 osize = le32_to_cpu(layout->fl_object_size); - u32 su = le32_to_cpu(layout->fl_stripe_unit); - u32 sc = le32_to_cpu(layout->fl_stripe_count); - u32 bl, stripeno, stripepos, objsetno; - u32 su_per_object; - u64 t, su_offset; - - dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, - osize, su); - su_per_object = osize / su; - dout("osize %u / su %u = su_per_object %u\n", osize, su, - su_per_object); - - BUG_ON((su & ~PAGE_MASK) != 0); - /* bl = *off / su; */ - t = off; - do_div(t, su); - bl = t; - dout("off %llu / su %u = bl %u\n", off, su, bl); - - stripeno = bl / sc; - stripepos = bl % sc; - objsetno = stripeno / su_per_object; - - *ono = objsetno * sc + stripepos; - dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono); - - /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ - t = off; - su_offset = do_div(t, su); - *oxoff = su_offset + (stripeno % su_per_object) * su; - - /* - * Calculate the length of the extent being written to the selected - * object. This is the minimum of the full length requested (plen) or - * the remainder of the current stripe being written to. - */ - *oxlen = min_t(u64, *plen, su - su_offset); - *plen = *oxlen; - - dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); -} - -/* - * calculate an object layout (i.e. pgid) from an oid, - * file_layout, and osdmap - */ -int ceph_calc_object_layout(struct ceph_object_layout *ol, - const char *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap) -{ - unsigned num, num_mask; - struct ceph_pg pgid; - s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred); - int poolid = le32_to_cpu(fl->fl_pg_pool); - struct ceph_pg_pool_info *pool; - unsigned ps; - - BUG_ON(!osdmap); - - pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); - if (!pool) - return -EIO; - ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid)); - if (preferred >= 0) { - ps += preferred; - num = le32_to_cpu(pool->v.lpg_num); - num_mask = pool->lpg_num_mask; - } else { - num = le32_to_cpu(pool->v.pg_num); - num_mask = pool->pg_num_mask; - } - - pgid.ps = cpu_to_le16(ps); - pgid.preferred = cpu_to_le16(preferred); - pgid.pool = fl->fl_pg_pool; - if (preferred >= 0) - dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps, - (int)preferred); - else - dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps); - - ol->ol_pgid = pgid; - ol->ol_stripe_unit = fl->fl_object_stripe_unit; - return 0; -} - -/* - * Calculate raw osd vector for the given pgid. Return pointer to osd - * array, or NULL on failure. - */ -static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds, int *num) -{ - struct ceph_pg_mapping *pg; - struct ceph_pg_pool_info *pool; - int ruleno; - unsigned poolid, ps, pps; - int preferred; - - /* pg_temp? */ - pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); - if (pg) { - *num = pg->len; - return pg->osds; - } - - /* crush */ - poolid = le32_to_cpu(pgid.pool); - ps = le16_to_cpu(pgid.ps); - preferred = (s16)le16_to_cpu(pgid.preferred); - - /* don't forcefeed bad device ids to crush */ - if (preferred >= osdmap->max_osd || - preferred >= osdmap->crush->max_devices) - preferred = -1; - - pool = __lookup_pg_pool(&osdmap->pg_pools, poolid); - if (!pool) - return NULL; - ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, - pool->v.type, pool->v.size); - if (ruleno < 0) { - pr_err("no crush rule pool %d type %d size %d\n", - poolid, pool->v.type, pool->v.size); - return NULL; - } - - if (preferred >= 0) - pps = ceph_stable_mod(ps, - le32_to_cpu(pool->v.lpgp_num), - pool->lpgp_num_mask); - else - pps = ceph_stable_mod(ps, - le32_to_cpu(pool->v.pgp_num), - pool->pgp_num_mask); - pps += poolid; - *num = crush_do_rule(osdmap->crush, ruleno, pps, osds, - min_t(int, pool->v.size, *num), - preferred, osdmap->osd_weight); - return osds; -} - -/* - * Return acting set for given pgid. - */ -int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *acting) -{ - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, o, num = CEPH_PG_MAX_SIZE; - - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; - - /* primary is first up osd */ - o = 0; - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - acting[o++] = osds[i]; - return o; -} - -/* - * Return primary osd for given pgid, or -1 if none. - */ -int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) -{ - int rawosds[CEPH_PG_MAX_SIZE], *osds; - int i, num = CEPH_PG_MAX_SIZE; - - osds = calc_pg_raw(osdmap, pgid, rawosds, &num); - if (!osds) - return -1; - - /* primary is first up osd */ - for (i = 0; i < num; i++) - if (ceph_osd_is_up(osdmap, osds[i])) - return osds[i]; - return -1; -} diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h deleted file mode 100644 index 970b547e510..00000000000 --- a/fs/ceph/osdmap.h +++ /dev/null @@ -1,128 +0,0 @@ -#ifndef _FS_CEPH_OSDMAP_H -#define _FS_CEPH_OSDMAP_H - -#include <linux/rbtree.h> -#include "types.h" -#include "ceph_fs.h" -#include "crush/crush.h" - -/* - * The osd map describes the current membership of the osd cluster and - * specifies the mapping of objects to placement groups and placement - * groups to (sets of) osds. That is, it completely specifies the - * (desired) distribution of all data objects in the system at some - * point in time. - * - * Each map version is identified by an epoch, which increases monotonically. - * - * The map can be updated either via an incremental map (diff) describing - * the change between two successive epochs, or as a fully encoded map. - */ -struct ceph_pg_pool_info { - struct rb_node node; - int id; - struct ceph_pg_pool v; - int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; - char *name; -}; - -struct ceph_pg_mapping { - struct rb_node node; - struct ceph_pg pgid; - int len; - int osds[]; -}; - -struct ceph_osdmap { - struct ceph_fsid fsid; - u32 epoch; - u32 mkfs_epoch; - struct ceph_timespec created, modified; - - u32 flags; /* CEPH_OSDMAP_* */ - - u32 max_osd; /* size of osd_state, _offload, _addr arrays */ - u8 *osd_state; /* CEPH_OSD_* */ - u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */ - struct ceph_entity_addr *osd_addr; - - struct rb_root pg_temp; - struct rb_root pg_pools; - u32 pool_max; - - /* the CRUSH map specifies the mapping of placement groups to - * the list of osds that store+replicate them. */ - struct crush_map *crush; -}; - -/* - * file layout helpers - */ -#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) -#define ceph_file_layout_stripe_count(l) \ - ((__s32)le32_to_cpu((l).fl_stripe_count)) -#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) -#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) -#define ceph_file_layout_object_su(l) \ - ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) -#define ceph_file_layout_pg_preferred(l) \ - ((__s32)le32_to_cpu((l).fl_pg_preferred)) -#define ceph_file_layout_pg_pool(l) \ - ((__s32)le32_to_cpu((l).fl_pg_pool)) - -static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_stripe_unit) * - le32_to_cpu(l->fl_stripe_count); -} - -/* "period" == bytes before i start on a new set of objects */ -static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_object_size) * - le32_to_cpu(l->fl_stripe_count); -} - - -static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) -{ - return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); -} - -static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) -{ - return map && (map->flags & flag); -} - -extern char *ceph_osdmap_state_str(char *str, int len, int state); - -static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, - int osd) -{ - if (osd >= map->max_osd) - return NULL; - return &map->osd_addr[osd]; -} - -extern struct ceph_osdmap *osdmap_decode(void **p, void *end); -extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr); -extern void ceph_osdmap_destroy(struct ceph_osdmap *map); - -/* calculate mapping of a file extent to an object */ -extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, - u64 *bno, u64 *oxoff, u64 *oxlen); - -/* calculate mapping of object to a placement group */ -extern int ceph_calc_object_layout(struct ceph_object_layout *ol, - const char *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap); -extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *acting); -extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, - struct ceph_pg pgid); - -#endif diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c deleted file mode 100644 index b6859f47d36..00000000000 --- a/fs/ceph/pagelist.c +++ /dev/null @@ -1,55 +0,0 @@ - -#include <linux/gfp.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> - -#include "pagelist.h" - -int ceph_pagelist_release(struct ceph_pagelist *pl) -{ - if (pl->mapped_tail) - kunmap(pl->mapped_tail); - while (!list_empty(&pl->head)) { - struct page *page = list_first_entry(&pl->head, struct page, - lru); - list_del(&page->lru); - __free_page(page); - } - return 0; -} - -static int ceph_pagelist_addpage(struct ceph_pagelist *pl) -{ - struct page *page = __page_cache_alloc(GFP_NOFS); - if (!page) - return -ENOMEM; - pl->room += PAGE_SIZE; - list_add_tail(&page->lru, &pl->head); - if (pl->mapped_tail) - kunmap(pl->mapped_tail); - pl->mapped_tail = kmap(page); - return 0; -} - -int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len) -{ - while (pl->room < len) { - size_t bit = pl->room; - int ret; - - memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), - buf, bit); - pl->length += bit; - pl->room -= bit; - buf += bit; - len -= bit; - ret = ceph_pagelist_addpage(pl); - if (ret) - return ret; - } - - memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); - pl->length += len; - pl->room -= len; - return 0; -} diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h deleted file mode 100644 index e8a4187e108..00000000000 --- a/fs/ceph/pagelist.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef __FS_CEPH_PAGELIST_H -#define __FS_CEPH_PAGELIST_H - -#include <linux/list.h> - -struct ceph_pagelist { - struct list_head head; - void *mapped_tail; - size_t length; - size_t room; -}; - -static inline void ceph_pagelist_init(struct ceph_pagelist *pl) -{ - INIT_LIST_HEAD(&pl->head); - pl->mapped_tail = NULL; - pl->length = 0; - pl->room = 0; -} -extern int ceph_pagelist_release(struct ceph_pagelist *pl); - -extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l); - -static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) -{ - __le64 ev = cpu_to_le64(v); - return ceph_pagelist_append(pl, &ev, sizeof(ev)); -} -static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v) -{ - __le32 ev = cpu_to_le32(v); - return ceph_pagelist_append(pl, &ev, sizeof(ev)); -} -static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v) -{ - __le16 ev = cpu_to_le16(v); - return ceph_pagelist_append(pl, &ev, sizeof(ev)); -} -static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v) -{ - return ceph_pagelist_append(pl, &v, 1); -} -static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl, - char *s, size_t len) -{ - int ret = ceph_pagelist_encode_32(pl, len); - if (ret) - return ret; - if (len) - return ceph_pagelist_append(pl, s, len); - return 0; -} - -#endif diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h deleted file mode 100644 index 8fcc023056c..00000000000 --- a/fs/ceph/rados.h +++ /dev/null @@ -1,396 +0,0 @@ -#ifndef __RADOS_H -#define __RADOS_H - -/* - * Data types for the Ceph distributed object storage layer RADOS - * (Reliable Autonomic Distributed Object Store). - */ - -#include "msgr.h" - -/* - * osdmap encoding versions - */ -#define CEPH_OSDMAP_INC_VERSION 5 -#define CEPH_OSDMAP_INC_VERSION_EXT 5 -#define CEPH_OSDMAP_VERSION 5 -#define CEPH_OSDMAP_VERSION_EXT 5 - -/* - * fs id - */ -struct ceph_fsid { - unsigned char fsid[16]; -}; - -static inline int ceph_fsid_compare(const struct ceph_fsid *a, - const struct ceph_fsid *b) -{ - return memcmp(a, b, sizeof(*a)); -} - -/* - * ino, object, etc. - */ -typedef __le64 ceph_snapid_t; -#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */ -#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */ -#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */ - -struct ceph_timespec { - __le32 tv_sec; - __le32 tv_nsec; -} __attribute__ ((packed)); - - -/* - * object layout - how objects are mapped into PGs - */ -#define CEPH_OBJECT_LAYOUT_HASH 1 -#define CEPH_OBJECT_LAYOUT_LINEAR 2 -#define CEPH_OBJECT_LAYOUT_HASHINO 3 - -/* - * pg layout -- how PGs are mapped onto (sets of) OSDs - */ -#define CEPH_PG_LAYOUT_CRUSH 0 -#define CEPH_PG_LAYOUT_HASH 1 -#define CEPH_PG_LAYOUT_LINEAR 2 -#define CEPH_PG_LAYOUT_HYBRID 3 - -#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ - -/* - * placement group. - * we encode this into one __le64. - */ -struct ceph_pg { - __le16 preferred; /* preferred primary osd */ - __le16 ps; /* placement seed */ - __le32 pool; /* object pool */ -} __attribute__ ((packed)); - -/* - * pg_pool is a set of pgs storing a pool of objects - * - * pg_num -- base number of pseudorandomly placed pgs - * - * pgp_num -- effective number when calculating pg placement. this - * is used for pg_num increases. new pgs result in data being "split" - * into new pgs. for this to proceed smoothly, new pgs are intiially - * colocated with their parents; that is, pgp_num doesn't increase - * until the new pgs have successfully split. only _then_ are the new - * pgs placed independently. - * - * lpg_num -- localized pg count (per device). replicas are randomly - * selected. - * - * lpgp_num -- as above. - */ -#define CEPH_PG_TYPE_REP 1 -#define CEPH_PG_TYPE_RAID4 2 -#define CEPH_PG_POOL_VERSION 2 -struct ceph_pg_pool { - __u8 type; /* CEPH_PG_TYPE_* */ - __u8 size; /* number of osds in each pg */ - __u8 crush_ruleset; /* crush placement rule */ - __u8 object_hash; /* hash mapping object name to ps */ - __le32 pg_num, pgp_num; /* number of pg's */ - __le32 lpg_num, lpgp_num; /* number of localized pg's */ - __le32 last_change; /* most recent epoch changed */ - __le64 snap_seq; /* seq for per-pool snapshot */ - __le32 snap_epoch; /* epoch of last snap */ - __le32 num_snaps; - __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ - __le64 auid; /* who owns the pg */ -} __attribute__ ((packed)); - -/* - * stable_mod func is used to control number of placement groups. - * similar to straight-up modulo, but produces a stable mapping as b - * increases over time. b is the number of bins, and bmask is the - * containing power of 2 minus 1. - * - * b <= bmask and bmask=(2**n)-1 - * e.g., b=12 -> bmask=15, b=123 -> bmask=127 - */ -static inline int ceph_stable_mod(int x, int b, int bmask) -{ - if ((x & bmask) < b) - return x & bmask; - else - return x & (bmask >> 1); -} - -/* - * object layout - how a given object should be stored. - */ -struct ceph_object_layout { - struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ - __le32 ol_stripe_unit; /* for per-object parity, if any */ -} __attribute__ ((packed)); - -/* - * compound epoch+version, used by storage layer to serialize mutations - */ -struct ceph_eversion { - __le32 epoch; - __le64 version; -} __attribute__ ((packed)); - -/* - * osd map bits - */ - -/* status bits */ -#define CEPH_OSD_EXISTS 1 -#define CEPH_OSD_UP 2 - -/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ -#define CEPH_OSD_IN 0x10000 -#define CEPH_OSD_OUT 0 - - -/* - * osd map flag bits - */ -#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ -#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ -#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ -#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ -#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ - -/* - * osd ops - */ -#define CEPH_OSD_OP_MODE 0xf000 -#define CEPH_OSD_OP_MODE_RD 0x1000 -#define CEPH_OSD_OP_MODE_WR 0x2000 -#define CEPH_OSD_OP_MODE_RMW 0x3000 -#define CEPH_OSD_OP_MODE_SUB 0x4000 - -#define CEPH_OSD_OP_TYPE 0x0f00 -#define CEPH_OSD_OP_TYPE_LOCK 0x0100 -#define CEPH_OSD_OP_TYPE_DATA 0x0200 -#define CEPH_OSD_OP_TYPE_ATTR 0x0300 -#define CEPH_OSD_OP_TYPE_EXEC 0x0400 -#define CEPH_OSD_OP_TYPE_PG 0x0500 - -enum { - /** data **/ - /* read */ - CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, - CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, - - /* fancy read */ - CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, - - /* write */ - CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, - CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, - CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, - CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, - CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, - - /* fancy write */ - CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, - CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7, - CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8, - CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9, - - CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10, - CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11, - CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, - - CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, - - /** attrs **/ - /* read */ - CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, - CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, - CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, - - /* write */ - CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, - CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, - CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3, - CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, - - /** subop **/ - CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, - CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, - CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, - CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, - CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, - - /** lock **/ - CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, - CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, - CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, - CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, - CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, - CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, - - /** exec **/ - CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, - - /** pg **/ - CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, -}; - -static inline int ceph_osd_op_type_lock(int op) -{ - return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK; -} -static inline int ceph_osd_op_type_data(int op) -{ - return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; -} -static inline int ceph_osd_op_type_attr(int op) -{ - return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; -} -static inline int ceph_osd_op_type_exec(int op) -{ - return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC; -} -static inline int ceph_osd_op_type_pg(int op) -{ - return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; -} - -static inline int ceph_osd_op_mode_subop(int op) -{ - return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; -} -static inline int ceph_osd_op_mode_read(int op) -{ - return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; -} -static inline int ceph_osd_op_mode_modify(int op) -{ - return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; -} - -#define CEPH_OSD_TMAP_HDR 'h' -#define CEPH_OSD_TMAP_SET 's' -#define CEPH_OSD_TMAP_RM 'r' - -extern const char *ceph_osd_op_name(int op); - - -/* - * osd op flags - * - * An op may be READ, WRITE, or READ|WRITE. - */ -enum { - CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ - CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ - CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ - CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ - CEPH_OSD_FLAG_READ = 16, /* op may read */ - CEPH_OSD_FLAG_WRITE = 32, /* op may write */ - CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ - CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ - CEPH_OSD_FLAG_BALANCE_READS = 256, - CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ - CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ - CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ -}; - -enum { - CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ -}; - -#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ -#define EBLACKLISTED ESHUTDOWN /* blacklisted */ - -/* xattr comparison */ -enum { - CEPH_OSD_CMPXATTR_OP_NOP = 0, - CEPH_OSD_CMPXATTR_OP_EQ = 1, - CEPH_OSD_CMPXATTR_OP_NE = 2, - CEPH_OSD_CMPXATTR_OP_GT = 3, - CEPH_OSD_CMPXATTR_OP_GTE = 4, - CEPH_OSD_CMPXATTR_OP_LT = 5, - CEPH_OSD_CMPXATTR_OP_LTE = 6 -}; - -enum { - CEPH_OSD_CMPXATTR_MODE_STRING = 1, - CEPH_OSD_CMPXATTR_MODE_U64 = 2 -}; - -/* - * an individual object operation. each may be accompanied by some data - * payload - */ -struct ceph_osd_op { - __le16 op; /* CEPH_OSD_OP_* */ - __le32 flags; /* CEPH_OSD_FLAG_* */ - union { - struct { - __le64 offset, length; - __le64 truncate_size; - __le32 truncate_seq; - } __attribute__ ((packed)) extent; - struct { - __le32 name_len; - __le32 value_len; - __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ - __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ - } __attribute__ ((packed)) xattr; - struct { - __u8 class_len; - __u8 method_len; - __u8 argc; - __le32 indata_len; - } __attribute__ ((packed)) cls; - struct { - __le64 cookie, count; - } __attribute__ ((packed)) pgls; - }; - __le32 payload_len; -} __attribute__ ((packed)); - -/* - * osd request message header. each request may include multiple - * ceph_osd_op object operations. - */ -struct ceph_osd_request_head { - __le32 client_inc; /* client incarnation */ - struct ceph_object_layout layout; /* pgid */ - __le32 osdmap_epoch; /* client's osdmap epoch */ - - __le32 flags; - - struct ceph_timespec mtime; /* for mutations only */ - struct ceph_eversion reassert_version; /* if we are replaying op */ - - __le32 object_len; /* length of object name */ - - __le64 snapid; /* snapid to read */ - __le64 snap_seq; /* writer's snap context */ - __le32 num_snaps; - - __le16 num_ops; - struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ -} __attribute__ ((packed)); - -struct ceph_osd_reply_head { - __le32 client_inc; /* client incarnation */ - __le32 flags; - struct ceph_object_layout layout; - __le32 osdmap_epoch; - struct ceph_eversion reassert_version; /* for replaying uncommitted */ - - __le32 result; /* result code */ - - __le32 object_len; /* length of object name */ - __le32 num_ops; - struct ceph_osd_op ops[0]; /* ops[], object */ -} __attribute__ ((packed)); - - -#endif diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index c0b26b6badb..f01645a2775 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1,10 +1,12 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/sort.h> #include <linux/slab.h> #include "super.h" -#include "decode.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> /* * Snapshots in ceph are driven in large part by cooperation from the @@ -119,6 +121,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); INIT_LIST_HEAD(&realm->empty_item); + INIT_LIST_HEAD(&realm->dirty_item); INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); @@ -203,7 +206,7 @@ void ceph_put_snap_realm(struct ceph_mds_client *mdsc, up_write(&mdsc->snap_rwsem); } else { spin_lock(&mdsc->snap_empty_lock); - list_add(&mdsc->snap_empty, &realm->empty_item); + list_add(&realm->empty_item, &mdsc->snap_empty); spin_unlock(&mdsc->snap_empty_lock); } } @@ -293,8 +296,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; int err = 0; - int i; - int num = realm->num_prior_parent_snaps + realm->num_snaps; + u32 num = realm->num_prior_parent_snaps + realm->num_snaps; /* * build parent context, if it hasn't been built. @@ -318,28 +320,29 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { - dout("build_snap_context %llx %p: %p seq %lld (%d snaps)" + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" " (unchanged)\n", realm->ino, realm, realm->cached_context, realm->cached_context->seq, - realm->cached_context->num_snaps); + (unsigned int) realm->cached_context->num_snaps); return 0; } /* alloc new snap context */ err = -ENOMEM; - if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc)) + if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) goto fail; - snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS); + snapc = ceph_create_snap_context(num, GFP_NOFS); if (!snapc) goto fail; - atomic_set(&snapc->nref, 1); /* build (reverse sorted) snap vector */ num = 0; snapc->seq = realm->seq; if (parent) { - /* include any of parent's snaps occuring _after_ my + u32 i; + + /* include any of parent's snaps occurring _after_ my parent became my parent */ for (i = 0; i < parent->cached_context->num_snaps; i++) if (parent->cached_context->snaps[i] >= @@ -358,8 +361,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; - dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n", - realm->ino, realm, snapc, snapc->seq, snapc->num_snaps); + dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", + realm->ino, realm, snapc, snapc->seq, + (unsigned int) snapc->num_snaps); if (realm->cached_context) ceph_put_snap_context(realm->cached_context); @@ -399,9 +403,9 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm) * helper to allocate and decode an array of snapids. free prior * instance, if any. */ -static int dup_array(u64 **dst, __le64 *src, int num) +static int dup_array(u64 **dst, __le64 *src, u32 num) { - int i; + u32 i; kfree(*dst); if (num) { @@ -435,7 +439,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; - int used; + int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); if (!capsnap) { @@ -443,8 +447,18 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) return; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); + + /* + * If there is a write in progress, treat that as a dirty Fw, + * even though it hasn't completed yet; by the time we finish + * up this capsnap it will be. + */ + if (used & CEPH_CAP_FILE_WR) + dirty |= CEPH_CAP_FILE_WR; + if (__ceph_have_pending_cap_snap(ci)) { /* there is no point in queuing multiple "pending" cap_snaps, as no new writes are allowed to start when pending, so any @@ -452,27 +466,43 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); - } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { + } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { struct ceph_snap_context *snapc = ci->i_head_snapc; - igrab(inode); + /* + * if we are a sync write, we may need to go to the snaprealm + * to get the current snapc. + */ + if (!snapc) + snapc = ci->i_snap_realm->cached_context; + + dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", + inode, capsnap, snapc, ceph_cap_string(dirty)); + ihold(inode); atomic_set(&capsnap->nref, 1); capsnap->ci = ci; INIT_LIST_HEAD(&capsnap->ci_item); INIT_LIST_HEAD(&capsnap->flushing_item); - capsnap->follows = snapc->seq - 1; + capsnap->follows = snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); - capsnap->dirty = __ceph_caps_dirty(ci); + capsnap->dirty = dirty; capsnap->mode = inode->i_mode; capsnap->uid = inode->i_uid; capsnap->gid = inode->i_gid; - /* fixme? */ - capsnap->xattr_blob = NULL; - capsnap->xattr_len = 0; + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; + } else { + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; + } /* dirty page count moved from _head to this cap_snap; all subsequent writes page dirties occur _after_ this @@ -480,7 +510,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->dirty_pages = ci->i_wrbuffer_ref_head; ci->i_wrbuffer_ref_head = 0; capsnap->context = snapc; - ci->i_head_snapc = NULL; + ci->i_head_snapc = + ceph_get_snap_context(ci->i_snap_realm->cached_context); + dout(" new snapc is %p\n", ci->i_head_snapc); list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); if (used & CEPH_CAP_FILE_WR) { @@ -497,7 +529,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) kfree(capsnap); } - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); } /* @@ -506,13 +538,13 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) * * If capsnap can now be flushed, add to snap_flush list, and return 1. * - * Caller must hold i_lock. + * Caller must hold i_ceph_lock. */ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; BUG_ON(capsnap->writing); capsnap->size = inode->i_size; @@ -539,6 +571,45 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, return 1; /* caller may want to ceph_flush_snaps */ } +/* + * Queue cap_snaps for snap writeback for this realm and its children. + * Called under snap_rwsem, so realm topology won't change. + */ +static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci; + struct inode *lastinode = NULL; + struct ceph_snap_realm *child; + + dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + + spin_lock(&realm->inodes_with_caps_lock); + list_for_each_entry(ci, &realm->inodes_with_caps, + i_snap_realm_item) { + struct inode *inode = igrab(&ci->vfs_inode); + if (!inode) + continue; + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + lastinode = inode; + ceph_queue_cap_snap(ci); + spin_lock(&realm->inodes_with_caps_lock); + } + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + + list_for_each_entry(child, &realm->children, child_item) { + dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", + realm, realm->ino, child, child->ino); + list_del_init(&child->dirty_item); + list_add(&child->dirty_item, &realm->dirty_item); + } + + list_del_init(&realm->dirty_item); + dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); +} /* * Parse and apply a snapblob "snap trace" from the MDS. This specifies @@ -556,6 +627,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm; int invalidate = 0; int err = -ENOMEM; + LIST_HEAD(dirty_realms); dout("update_snap_trace deletion=%d\n", deletion); more: @@ -578,45 +650,6 @@ more: } } - if (le64_to_cpu(ri->seq) > realm->seq) { - dout("update_snap_trace updating %llx %p %lld -> %lld\n", - realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); - /* - * if the realm seq has changed, queue a cap_snap for every - * inode with open caps. we do this _before_ we update - * the realm info so that we prepare for writeback under the - * _previous_ snap context. - * - * ...unless it's a snap deletion! - */ - if (!deletion) { - struct ceph_inode_info *ci; - struct inode *lastinode = NULL; - - spin_lock(&realm->inodes_with_caps_lock); - list_for_each_entry(ci, &realm->inodes_with_caps, - i_snap_realm_item) { - struct inode *inode = igrab(&ci->vfs_inode); - if (!inode) - continue; - spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); - lastinode = inode; - ceph_queue_cap_snap(ci); - spin_lock(&realm->inodes_with_caps_lock); - } - spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); - dout("update_snap_trace cap_snaps queued\n"); - } - - } else { - dout("update_snap_trace %llx %p seq %lld unchanged\n", - realm->ino, realm, realm->seq); - } - /* ensure the parent is correct */ err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); if (err < 0) @@ -624,6 +657,8 @@ more: invalidate += err; if (le64_to_cpu(ri->seq) > realm->seq) { + dout("update_snap_trace updating %llx %p %lld -> %lld\n", + realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); /* update realm parameters, snap lists */ realm->seq = le64_to_cpu(ri->seq); realm->created = le64_to_cpu(ri->created); @@ -641,9 +676,17 @@ more: if (err < 0) goto fail; + /* queue realm for cap_snap creation */ + list_add(&realm->dirty_item, &dirty_realms); + invalidate = 1; } else if (!realm->cached_context) { + dout("update_snap_trace %llx %p seq %lld new\n", + realm->ino, realm, realm->seq); invalidate = 1; + } else { + dout("update_snap_trace %llx %p seq %lld unchanged\n", + realm->ino, realm, realm->seq); } dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, @@ -656,6 +699,16 @@ more: if (invalidate) rebuild_snap_realms(realm); + /* + * queue cap snaps _after_ we've built the new snap contexts, + * so that i_head_snapc can be set appropriately. + */ + while (!list_empty(&dirty_realms)) { + realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, + dirty_item); + queue_realm_cap_snaps(realm); + } + __cleanup_empty_realms(mdsc); return 0; @@ -685,11 +738,11 @@ static void flush_snaps(struct ceph_mds_client *mdsc) ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); inode = &ci->vfs_inode; - igrab(inode); + ihold(inode); spin_unlock(&mdsc->snap_flush_lock); - spin_lock(&inode->i_lock); - __ceph_flush_snaps(ci, &session); - spin_unlock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); + __ceph_flush_snaps(ci, &session, 0); + spin_unlock(&ci->i_ceph_lock); iput(inode); spin_lock(&mdsc->snap_flush_lock); } @@ -718,7 +771,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg) { - struct super_block *sb = mdsc->client->sb; + struct super_block *sb = mdsc->fsc->sb; int mds = session->s_mds; u64 split; int op; @@ -789,12 +842,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, }; struct inode *inode = ceph_find_inode(sb, vino); struct ceph_inode_info *ci; + struct ceph_snap_realm *oldrealm; if (!inode) continue; ci = ceph_inode(inode); - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (!ci->i_snap_realm) goto skip_inode; /* @@ -814,24 +868,25 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, dout(" will move %p to split realm %llx %p\n", inode, realm->ino, realm); /* - * Remove the inode from the realm's inode - * list, but don't add it to the new realm - * yet. We don't want the cap_snap to be - * queued (again) by ceph_update_snap_trace() - * below. Queue it _now_, under the old context. + * Move the inode to the new realm */ spin_lock(&realm->inodes_with_caps_lock); list_del_init(&ci->i_snap_realm_item); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + oldrealm = ci->i_snap_realm; + ci->i_snap_realm = realm; spin_unlock(&realm->inodes_with_caps_lock); - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); - ceph_queue_cap_snap(ci); + ceph_get_snap_realm(mdsc, realm); + ceph_put_snap_realm(mdsc, oldrealm); iput(inode); continue; skip_inode: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); iput(inode); } @@ -853,43 +908,9 @@ skip_inode: ceph_update_snap_trace(mdsc, p, e, op == CEPH_SNAP_OP_DESTROY); - if (op == CEPH_SNAP_OP_SPLIT) { - /* - * ok, _now_ add the inodes into the new realm. - */ - for (i = 0; i < num_split_inos; i++) { - struct ceph_vino vino = { - .ino = le64_to_cpu(split_inos[i]), - .snap = CEPH_NOSNAP, - }; - struct inode *inode = ceph_find_inode(sb, vino); - struct ceph_inode_info *ci; - - if (!inode) - continue; - ci = ceph_inode(inode); - spin_lock(&inode->i_lock); - if (list_empty(&ci->i_snap_realm_item)) { - struct ceph_snap_realm *oldrealm = - ci->i_snap_realm; - - dout(" moving %p to split realm %llx %p\n", - inode, realm->ino, realm); - spin_lock(&realm->inodes_with_caps_lock); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - ci->i_snap_realm = realm; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_get_snap_realm(mdsc, realm); - ceph_put_snap_realm(mdsc, oldrealm); - } - spin_unlock(&inode->i_lock); - iput(inode); - } - + if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ ceph_put_snap_realm(mdsc, realm); - } __cleanup_empty_realms(mdsc); diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c index 7503aee828c..51cc23e4811 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/strings.c @@ -1,70 +1,9 @@ /* - * Ceph string constants + * Ceph fs string constants */ -#include "types.h" +#include <linux/module.h> +#include <linux/ceph/types.h> -const char *ceph_entity_type_name(int type) -{ - switch (type) { - case CEPH_ENTITY_TYPE_MDS: return "mds"; - case CEPH_ENTITY_TYPE_OSD: return "osd"; - case CEPH_ENTITY_TYPE_MON: return "mon"; - case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_AUTH: return "auth"; - default: return "unknown"; - } -} - -const char *ceph_osd_op_name(int op) -{ - switch (op) { - case CEPH_OSD_OP_READ: return "read"; - case CEPH_OSD_OP_STAT: return "stat"; - - case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; - - case CEPH_OSD_OP_WRITE: return "write"; - case CEPH_OSD_OP_DELETE: return "delete"; - case CEPH_OSD_OP_TRUNCATE: return "truncate"; - case CEPH_OSD_OP_ZERO: return "zero"; - case CEPH_OSD_OP_WRITEFULL: return "writefull"; - - case CEPH_OSD_OP_APPEND: return "append"; - case CEPH_OSD_OP_STARTSYNC: return "startsync"; - case CEPH_OSD_OP_SETTRUNC: return "settrunc"; - case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc"; - - case CEPH_OSD_OP_TMAPUP: return "tmapup"; - case CEPH_OSD_OP_TMAPGET: return "tmapget"; - case CEPH_OSD_OP_TMAPPUT: return "tmapput"; - - case CEPH_OSD_OP_GETXATTR: return "getxattr"; - case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; - case CEPH_OSD_OP_SETXATTR: return "setxattr"; - case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; - case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; - case CEPH_OSD_OP_RMXATTR: return "rmxattr"; - case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; - - case CEPH_OSD_OP_PULL: return "pull"; - case CEPH_OSD_OP_PUSH: return "push"; - case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; - case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; - case CEPH_OSD_OP_SCRUB: return "scrub"; - - case CEPH_OSD_OP_WRLOCK: return "wrlock"; - case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; - case CEPH_OSD_OP_RDLOCK: return "rdlock"; - case CEPH_OSD_OP_RDUNLOCK: return "rdunlock"; - case CEPH_OSD_OP_UPLOCK: return "uplock"; - case CEPH_OSD_OP_DNLOCK: return "dnlock"; - - case CEPH_OSD_OP_CALL: return "call"; - - case CEPH_OSD_OP_PGLS: return "pgls"; - } - return "???"; -} const char *ceph_mds_state_name(int s) { @@ -76,6 +15,7 @@ const char *ceph_mds_state_name(int s) case CEPH_MDS_STATE_BOOT: return "up:boot"; case CEPH_MDS_STATE_STANDBY: return "up:standby"; case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; + case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay"; case CEPH_MDS_STATE_CREATING: return "up:creating"; case CEPH_MDS_STATE_STARTING: return "up:starting"; /* up and in */ @@ -101,6 +41,8 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_RENEWCAPS: return "renewcaps"; case CEPH_SESSION_STALE: return "stale"; case CEPH_SESSION_RECALL_STATE: return "recall_state"; + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; } return "???"; } @@ -111,10 +53,14 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUP: return "lookup"; case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; + case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; + case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; case CEPH_MDS_OP_RMXATTR: return "rmxattr"; + case CEPH_MDS_OP_SETLAYOUT: return "setlayou"; + case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout"; case CEPH_MDS_OP_READDIR: return "readdir"; case CEPH_MDS_OP_MKNOD: return "mknod"; case CEPH_MDS_OP_LINK: return "link"; @@ -129,6 +75,8 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LSSNAP: return "lssnap"; case CEPH_MDS_OP_MKSNAP: return "mksnap"; case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; + case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; } return "???"; } @@ -174,17 +122,3 @@ const char *ceph_snap_op_name(int o) } return "???"; } - -const char *ceph_pool_op_name(int op) -{ - switch (op) { - case POOL_OP_CREATE: return "create"; - case POOL_OP_DELETE: return "delete"; - case POOL_OP_AUID_CHANGE: return "auid change"; - case POOL_OP_CREATE_SNAP: return "create snap"; - case POOL_OP_DELETE_SNAP: return "delete snap"; - case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; - case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; - } - return "???"; -} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index fa87f51e38e..06150fd745a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1,7 +1,8 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <linux/backing-dev.h> +#include <linux/ctype.h> #include <linux/fs.h> #include <linux/inet.h> #include <linux/in6.h> @@ -14,10 +15,15 @@ #include <linux/statfs.h> #include <linux/string.h> -#include "decode.h" #include "super.h" -#include "mon_client.h" -#include "auth.h" +#include "mds_client.h" +#include "cache.h" + +#include <linux/ceph/ceph_features.h> +#include <linux/ceph/decode.h> +#include <linux/ceph/mon_client.h> +#include <linux/ceph/auth.h> +#include <linux/ceph/debugfs.h> /* * Ceph superblock operations @@ -25,36 +31,22 @@ * Handle the basics of mounting, unmounting. */ - -/* - * find filename portion of a path (/foo/bar/baz -> baz) - */ -const char *ceph_file_part(const char *s, int len) -{ - const char *e = s + len; - - while (e != s && *(e-1) != '/') - e--; - return e; -} - - /* * super ops */ static void ceph_put_super(struct super_block *s) { - struct ceph_client *client = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("put_super\n"); - ceph_mdsc_close_sessions(&client->mdsc); + ceph_mdsc_close_sessions(fsc->mdsc); /* * ensure we release the bdi before put_anon_super releases * the device name. */ - if (s->s_bdi == &client->backing_dev_info) { - bdi_unregister(&client->backing_dev_info); + if (s->s_bdi == &fsc->backing_dev_info) { + bdi_unregister(&fsc->backing_dev_info); s->s_bdi = NULL; } @@ -63,14 +55,14 @@ static void ceph_put_super(struct super_block *s) static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); - struct ceph_monmap *monmap = client->monc.monmap; + struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); + struct ceph_monmap *monmap = fsc->client->monc.monmap; struct ceph_statfs st; u64 fsid; int err; dout("statfs\n"); - err = ceph_monc_do_statfs(&client->monc, &st); + err = ceph_monc_do_statfs(&fsc->client->monc, &st); if (err < 0) return err; @@ -80,17 +72,21 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) /* * express utilization in terms of large blocks to avoid * overflow on 32-bit machines. + * + * NOTE: for the time being, we make bsize == frsize to humor + * not-yet-ancient versions of glibc that are broken. + * Someday, we will probably want to report a real block + * size... whatever that may mean for a network file system! */ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); - buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >> - (CEPH_BLOCK_SHIFT-10); + buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_files = le64_to_cpu(st.num_objects); buf->f_ffree = -1; buf->f_namelen = NAME_MAX; - buf->f_frsize = PAGE_CACHE_SIZE; /* leave fsid little-endian, regardless of host endianness */ fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); @@ -101,236 +97,31 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) } -static int ceph_syncfs(struct super_block *sb, int wait) -{ - dout("sync_fs %d\n", wait); - ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); - ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); - dout("sync_fs %d done\n", wait); - return 0; -} - -static int default_congestion_kb(void) -{ - int congestion_kb; - - /* - * Copied from NFS - * - * congestion size, scale with available memory. - * - * 64MB: 8192k - * 128MB: 11585k - * 256MB: 16384k - * 512MB: 23170k - * 1GB: 32768k - * 2GB: 46340k - * 4GB: 65536k - * 8GB: 92681k - * 16GB: 131072k - * - * This allows larger machines to have larger/more transfers. - * Limit the default to 256M - */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); - if (congestion_kb > 256*1024) - congestion_kb = 256*1024; - - return congestion_kb; -} - -/** - * ceph_show_options - Show mount options in /proc/mounts - * @m: seq_file to write to - * @mnt: mount descriptor - */ -static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) +static int ceph_sync_fs(struct super_block *sb, int wait) { - struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb); - struct ceph_mount_args *args = client->mount_args; - - if (args->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsidmajor=%llu,fsidminor%llu", - le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]), - le64_to_cpu(*(__le64 *)&args->fsid.fsid[8])); - if (args->flags & CEPH_OPT_NOSHARE) - seq_puts(m, ",noshare"); - if (args->flags & CEPH_OPT_DIRSTAT) - seq_puts(m, ",dirstat"); - if ((args->flags & CEPH_OPT_RBYTES) == 0) - seq_puts(m, ",norbytes"); - if (args->flags & CEPH_OPT_NOCRC) - seq_puts(m, ",nocrc"); - if (args->flags & CEPH_OPT_NOASYNCREADDIR) - seq_puts(m, ",noasyncreaddir"); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); - if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) - seq_printf(m, ",mount_timeout=%d", args->mount_timeout); - if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) - seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl); - if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) - seq_printf(m, ",osdtimeout=%d", args->osd_timeout); - if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) - seq_printf(m, ",osdkeepalivetimeout=%d", - args->osd_keepalive_timeout); - if (args->wsize) - seq_printf(m, ",wsize=%d", args->wsize); - if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT) - seq_printf(m, ",rsize=%d", args->rsize); - if (args->congestion_kb != default_congestion_kb()) - seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb); - if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) - seq_printf(m, ",caps_wanted_delay_min=%d", - args->caps_wanted_delay_min); - if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) - seq_printf(m, ",caps_wanted_delay_max=%d", - args->caps_wanted_delay_max); - if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) - seq_printf(m, ",cap_release_safety=%d", - args->cap_release_safety); - if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT) - seq_printf(m, ",readdir_max_entries=%d", args->max_readdir); - if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) - seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes); - if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", args->snapdir_name); - if (args->name) - seq_printf(m, ",name=%s", args->name); - if (args->secret) - seq_puts(m, ",secret=<hidden>"); - return 0; -} - -/* - * caches - */ -struct kmem_cache *ceph_inode_cachep; -struct kmem_cache *ceph_cap_cachep; -struct kmem_cache *ceph_dentry_cachep; -struct kmem_cache *ceph_file_cachep; - -static void ceph_inode_init_once(void *foo) -{ - struct ceph_inode_info *ci = foo; - inode_init_once(&ci->vfs_inode); -} - -static int __init init_caches(void) -{ - ceph_inode_cachep = kmem_cache_create("ceph_inode_info", - sizeof(struct ceph_inode_info), - __alignof__(struct ceph_inode_info), - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - ceph_inode_init_once); - if (ceph_inode_cachep == NULL) - return -ENOMEM; - - ceph_cap_cachep = KMEM_CACHE(ceph_cap, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_cap_cachep == NULL) - goto bad_cap; - - ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_dentry_cachep == NULL) - goto bad_dentry; - - ceph_file_cachep = KMEM_CACHE(ceph_file_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_file_cachep == NULL) - goto bad_file; + if (!wait) { + dout("sync_fs (non-blocking)\n"); + ceph_flush_dirty_caps(fsc->mdsc); + dout("sync_fs (non-blocking) done\n"); + return 0; + } + dout("sync_fs (blocking)\n"); + ceph_osdc_sync(&fsc->client->osdc); + ceph_mdsc_sync(fsc->mdsc); + dout("sync_fs (blocking) done\n"); return 0; - -bad_file: - kmem_cache_destroy(ceph_dentry_cachep); -bad_dentry: - kmem_cache_destroy(ceph_cap_cachep); -bad_cap: - kmem_cache_destroy(ceph_inode_cachep); - return -ENOMEM; -} - -static void destroy_caches(void) -{ - kmem_cache_destroy(ceph_inode_cachep); - kmem_cache_destroy(ceph_cap_cachep); - kmem_cache_destroy(ceph_dentry_cachep); - kmem_cache_destroy(ceph_file_cachep); -} - - -/* - * ceph_umount_begin - initiate forced umount. Tear down down the - * mount, skipping steps that may hang while waiting for server(s). - */ -static void ceph_umount_begin(struct super_block *sb) -{ - struct ceph_client *client = ceph_sb_to_client(sb); - - dout("ceph_umount_begin - starting forced umount\n"); - if (!client) - return; - client->mount_state = CEPH_MOUNT_SHUTDOWN; - return; -} - -static const struct super_operations ceph_super_ops = { - .alloc_inode = ceph_alloc_inode, - .destroy_inode = ceph_destroy_inode, - .write_inode = ceph_write_inode, - .sync_fs = ceph_syncfs, - .put_super = ceph_put_super, - .show_options = ceph_show_options, - .statfs = ceph_statfs, - .umount_begin = ceph_umount_begin, -}; - - -const char *ceph_msg_type_name(int type) -{ - switch (type) { - case CEPH_MSG_SHUTDOWN: return "shutdown"; - case CEPH_MSG_PING: return "ping"; - case CEPH_MSG_AUTH: return "auth"; - case CEPH_MSG_AUTH_REPLY: return "auth_reply"; - case CEPH_MSG_MON_MAP: return "mon_map"; - case CEPH_MSG_MON_GET_MAP: return "mon_get_map"; - case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe"; - case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; - case CEPH_MSG_STATFS: return "statfs"; - case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; - case CEPH_MSG_MDS_MAP: return "mds_map"; - case CEPH_MSG_CLIENT_SESSION: return "client_session"; - case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; - case CEPH_MSG_CLIENT_REQUEST: return "client_request"; - case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward"; - case CEPH_MSG_CLIENT_REPLY: return "client_reply"; - case CEPH_MSG_CLIENT_CAPS: return "client_caps"; - case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; - case CEPH_MSG_CLIENT_SNAP: return "client_snap"; - case CEPH_MSG_CLIENT_LEASE: return "client_lease"; - case CEPH_MSG_OSD_MAP: return "osd_map"; - case CEPH_MSG_OSD_OP: return "osd_op"; - case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; - default: return "unknown"; - } } - /* * mount options */ enum { - Opt_fsidmajor, - Opt_fsidminor, - Opt_monport, Opt_wsize, Opt_rsize, - Opt_osdtimeout, - Opt_osdkeepalivetimeout, - Opt_mount_timeout, - Opt_osd_idle_ttl, + Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, Opt_cap_release_safety, @@ -340,30 +131,30 @@ enum { Opt_last_int, /* int args above */ Opt_snapdirname, - Opt_name, - Opt_secret, Opt_last_string, /* string args above */ - Opt_ip, - Opt_noshare, Opt_dirstat, Opt_nodirstat, Opt_rbytes, Opt_norbytes, - Opt_nocrc, + Opt_asyncreaddir, Opt_noasyncreaddir, + Opt_dcache, + Opt_nodcache, + Opt_ino32, + Opt_noino32, + Opt_fscache, + Opt_nofscache, +#ifdef CONFIG_CEPH_FS_POSIX_ACL + Opt_acl, +#endif + Opt_noacl }; -static match_table_t arg_tokens = { - {Opt_fsidmajor, "fsidmajor=%ld"}, - {Opt_fsidminor, "fsidminor=%ld"}, - {Opt_monport, "monport=%d"}, +static match_table_t fsopt_tokens = { {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, - {Opt_osdtimeout, "osdtimeout=%d"}, - {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, - {Opt_mount_timeout, "mount_timeout=%d"}, - {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, + {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, {Opt_cap_release_safety, "cap_release_safety=%d"}, @@ -372,372 +163,569 @@ static match_table_t arg_tokens = { {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, - {Opt_name, "name=%s"}, - {Opt_secret, "secret=%s"}, /* string args above */ - {Opt_ip, "ip=%s"}, - {Opt_noshare, "noshare"}, {Opt_dirstat, "dirstat"}, {Opt_nodirstat, "nodirstat"}, {Opt_rbytes, "rbytes"}, {Opt_norbytes, "norbytes"}, - {Opt_nocrc, "nocrc"}, + {Opt_asyncreaddir, "asyncreaddir"}, {Opt_noasyncreaddir, "noasyncreaddir"}, + {Opt_dcache, "dcache"}, + {Opt_nodcache, "nodcache"}, + {Opt_ino32, "ino32"}, + {Opt_noino32, "noino32"}, + {Opt_fscache, "fsc"}, + {Opt_nofscache, "nofsc"}, +#ifdef CONFIG_CEPH_FS_POSIX_ACL + {Opt_acl, "acl"}, +#endif + {Opt_noacl, "noacl"}, {-1, NULL} }; - -static struct ceph_mount_args *parse_mount_args(int flags, char *options, - const char *dev_name, - const char **path) +static int parse_fsopt_token(char *c, void *private) { - struct ceph_mount_args *args; - const char *c; - int err = -ENOMEM; + struct ceph_mount_options *fsopt = private; substring_t argstr[MAX_OPT_ARGS]; + int token, intval, ret; + + token = match_token((char *)c, fsopt_tokens, argstr); + if (token < 0) + return -EINVAL; + + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + return ret; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); + } - args = kzalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return ERR_PTR(-ENOMEM); - args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), - GFP_KERNEL); - if (!args->mon_addr) - goto out; + switch (token) { + case Opt_snapdirname: + kfree(fsopt->snapdir_name); + fsopt->snapdir_name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + if (!fsopt->snapdir_name) + return -ENOMEM; + break; + + /* misc */ + case Opt_wsize: + fsopt->wsize = intval; + break; + case Opt_rsize: + fsopt->rsize = intval; + break; + case Opt_rasize: + fsopt->rasize = intval; + break; + case Opt_caps_wanted_delay_min: + fsopt->caps_wanted_delay_min = intval; + break; + case Opt_caps_wanted_delay_max: + fsopt->caps_wanted_delay_max = intval; + break; + case Opt_readdir_max_entries: + fsopt->max_readdir = intval; + break; + case Opt_readdir_max_bytes: + fsopt->max_readdir_bytes = intval; + break; + case Opt_congestion_kb: + fsopt->congestion_kb = intval; + break; + case Opt_dirstat: + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_nodirstat: + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + break; + case Opt_rbytes: + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_norbytes: + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + break; + case Opt_asyncreaddir: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; + case Opt_noasyncreaddir: + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + break; + case Opt_dcache: + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + break; + case Opt_nodcache: + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + break; + case Opt_ino32: + fsopt->flags |= CEPH_MOUNT_OPT_INO32; + break; + case Opt_noino32: + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + break; + case Opt_fscache: + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + break; + case Opt_nofscache: + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + break; +#ifdef CONFIG_CEPH_FS_POSIX_ACL + case Opt_acl: + fsopt->sb_flags |= MS_POSIXACL; + break; +#endif + case Opt_noacl: + fsopt->sb_flags &= ~MS_POSIXACL; + break; + default: + BUG_ON(token); + } + return 0; +} - dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); - - /* start with defaults */ - args->sb_flags = flags; - args->flags = CEPH_OPT_DEFAULT; - args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; - args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; - args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ - args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ - args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; - args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; - args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; - args->max_readdir = CEPH_MAX_READDIR_DEFAULT; - args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - args->congestion_kb = default_congestion_kb(); - - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ +static void destroy_mount_options(struct ceph_mount_options *args) +{ + dout("destroy_mount_options %p\n", args); + kfree(args->snapdir_name); + kfree(args); +} + +static int strcmp_null(const char *s1, const char *s2) +{ + if (!s1 && !s2) + return 0; + if (s1 && !s2) + return -1; + if (!s1 && s2) + return 1; + return strcmp(s1, s2); +} + +static int compare_mount_options(struct ceph_mount_options *new_fsopt, + struct ceph_options *new_opt, + struct ceph_fs_client *fsc) +{ + struct ceph_mount_options *fsopt1 = new_fsopt; + struct ceph_mount_options *fsopt2 = fsc->mount_options; + int ofs = offsetof(struct ceph_mount_options, snapdir_name); + int ret; + + ret = memcmp(fsopt1, fsopt2, ofs); + if (ret) + return ret; + + ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name); + if (ret) + return ret; + + return ceph_compare_options(new_opt, fsc->client); +} + +static int parse_mount_options(struct ceph_mount_options **pfsopt, + struct ceph_options **popt, + int flags, char *options, + const char *dev_name, + const char **path) +{ + struct ceph_mount_options *fsopt; + const char *dev_name_end; + int err; + + if (!dev_name || !*dev_name) + return -EINVAL; + + fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); + if (!fsopt) + return -ENOMEM; + + dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); + + fsopt->sb_flags = flags; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + /* + * Distinguish the server list from the path in "dev_name". + * Internally we do not include the leading '/' in the path. + * + * "dev_name" will look like: + * <server_spec>[,<server_spec>...]:[<path>] + * where + * <server_spec> is <ip>[:<port>] + * <path> is optional, but if present must begin with '/' + */ + dev_name_end = strchr(dev_name, '/'); + if (dev_name_end) { + /* skip over leading '/' for path */ + *path = dev_name_end + 1; + } else { + /* path is empty */ + dev_name_end = dev_name + strlen(dev_name); + *path = dev_name_end; + } err = -EINVAL; - if (!dev_name) - goto out; - *path = strstr(dev_name, ":/"); - if (*path == NULL) { - pr_err("device name is missing path (no :/ in %s)\n", - dev_name); + dev_name_end--; /* back up to ':' separator */ + if (dev_name_end < dev_name || *dev_name_end != ':') { + pr_err("device name is missing path (no : separator in %s)\n", + dev_name); goto out; } - - /* get mon ip(s) */ - err = ceph_parse_ips(dev_name, *path, args->mon_addr, - CEPH_MAX_MON, &args->num_mon); - if (err < 0) - goto out; - - /* path on server */ - *path += 2; + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); dout("server path '%s'\n", *path); - /* parse mount options */ - while ((c = strsep(&options, ",")) != NULL) { - int token, intval, ret; - if (!*c) - continue; - err = -EINVAL; - token = match_token((char *)c, arg_tokens, argstr); - if (token < 0) { - pr_err("bad mount option at '%s'\n", c); - goto out; - } - if (token < Opt_last_int) { - ret = match_int(&argstr[0], &intval); - if (ret < 0) { - pr_err("bad mount option arg (not int) " - "at '%s'\n", c); - continue; - } - dout("got int token %d val %d\n", token, intval); - } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); - } else { - dout("got token %d\n", token); - } - switch (token) { - case Opt_fsidmajor: - *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval); - break; - case Opt_fsidminor: - *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval); - break; - case Opt_ip: - err = ceph_parse_ips(argstr[0].from, - argstr[0].to, - &args->my_addr, - 1, NULL); - if (err < 0) - goto out; - args->flags |= CEPH_OPT_MYIP; - break; - - case Opt_snapdirname: - kfree(args->snapdir_name); - args->snapdir_name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - case Opt_name: - args->name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - case Opt_secret: - args->secret = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - break; - - /* misc */ - case Opt_wsize: - args->wsize = intval; - break; - case Opt_rsize: - args->rsize = intval; - break; - case Opt_osdtimeout: - args->osd_timeout = intval; - break; - case Opt_osdkeepalivetimeout: - args->osd_keepalive_timeout = intval; - break; - case Opt_mount_timeout: - args->mount_timeout = intval; - break; - case Opt_caps_wanted_delay_min: - args->caps_wanted_delay_min = intval; - break; - case Opt_caps_wanted_delay_max: - args->caps_wanted_delay_max = intval; - break; - case Opt_readdir_max_entries: - args->max_readdir = intval; - break; - case Opt_readdir_max_bytes: - args->max_readdir_bytes = intval; - break; - case Opt_congestion_kb: - args->congestion_kb = intval; - break; - - case Opt_noshare: - args->flags |= CEPH_OPT_NOSHARE; - break; - - case Opt_dirstat: - args->flags |= CEPH_OPT_DIRSTAT; - break; - case Opt_nodirstat: - args->flags &= ~CEPH_OPT_DIRSTAT; - break; - case Opt_rbytes: - args->flags |= CEPH_OPT_RBYTES; - break; - case Opt_norbytes: - args->flags &= ~CEPH_OPT_RBYTES; - break; - case Opt_nocrc: - args->flags |= CEPH_OPT_NOCRC; - break; - case Opt_noasyncreaddir: - args->flags |= CEPH_OPT_NOASYNCREADDIR; - break; - - default: - BUG_ON(token); - } + *popt = ceph_parse_options(options, dev_name, dev_name_end, + parse_fsopt_token, (void *)fsopt); + if (IS_ERR(*popt)) { + err = PTR_ERR(*popt); + goto out; } - return args; + + /* success */ + *pfsopt = fsopt; + return 0; out: - kfree(args->mon_addr); - kfree(args); - return ERR_PTR(err); + destroy_mount_options(fsopt); + return err; } -static void destroy_mount_args(struct ceph_mount_args *args) +/** + * ceph_show_options - Show mount options in /proc/mounts + * @m: seq_file to write to + * @root: root of that (sub)tree + */ +static int ceph_show_options(struct seq_file *m, struct dentry *root) { - dout("destroy_mount_args %p\n", args); - kfree(args->snapdir_name); - args->snapdir_name = NULL; - kfree(args->name); - args->name = NULL; - kfree(args->secret); - args->secret = NULL; - kfree(args); + struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); + struct ceph_mount_options *fsopt = fsc->mount_options; + struct ceph_options *opt = fsc->client->options; + + if (opt->flags & CEPH_OPT_FSID) + seq_printf(m, ",fsid=%pU", &opt->fsid); + if (opt->flags & CEPH_OPT_NOSHARE) + seq_puts(m, ",noshare"); + if (opt->flags & CEPH_OPT_NOCRC) + seq_puts(m, ",nocrc"); + + if (opt->name) + seq_printf(m, ",name=%s", opt->name); + if (opt->key) + seq_puts(m, ",secret=<hidden>"); + + if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + opt->osd_keepalive_timeout); + + if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) + seq_puts(m, ",dirstat"); + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) + seq_puts(m, ",norbytes"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) + seq_puts(m, ",noasyncreaddir"); + if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) + seq_puts(m, ",dcache"); + else + seq_puts(m, ",nodcache"); + if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) + seq_puts(m, ",fsc"); + else + seq_puts(m, ",nofsc"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + if (fsopt->sb_flags & MS_POSIXACL) + seq_puts(m, ",acl"); + else + seq_puts(m, ",noacl"); +#endif + + if (fsopt->wsize) + seq_printf(m, ",wsize=%d", fsopt->wsize); + if (fsopt->rsize != CEPH_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", fsopt->rsize); + if (fsopt->rasize != CEPH_RASIZE_DEFAULT) + seq_printf(m, ",rasize=%d", fsopt->rasize); + if (fsopt->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%d", + fsopt->caps_wanted_delay_min); + if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%d", + fsopt->caps_wanted_delay_max); + if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + seq_printf(m, ",cap_release_safety=%d", + fsopt->cap_release_safety); + if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) + seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + return 0; } /* - * create a fresh client instance + * handle any mon messages the standard library doesn't understand. + * return error if we don't either. */ -static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) +static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg) { - struct ceph_client *client; + struct ceph_fs_client *fsc = client->private; + int type = le16_to_cpu(msg->hdr.type); + + switch (type) { + case CEPH_MSG_MDS_MAP: + ceph_mdsc_handle_map(fsc->mdsc, msg); + return 0; + + default: + return -1; + } +} + +/* + * create a new fs client + */ +static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, + struct ceph_options *opt) +{ + struct ceph_fs_client *fsc; + const u64 supported_features = + CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; + const u64 required_features = 0; + int page_count; + size_t size; int err = -ENOMEM; - client = kzalloc(sizeof(*client), GFP_KERNEL); - if (client == NULL) + fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); + if (!fsc) return ERR_PTR(-ENOMEM); - mutex_init(&client->mount_mutex); - - init_waitqueue_head(&client->auth_wq); + fsc->client = ceph_create_client(opt, fsc, supported_features, + required_features); + if (IS_ERR(fsc->client)) { + err = PTR_ERR(fsc->client); + goto fail; + } + fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->monc.want_mdsmap = 1; - client->sb = NULL; - client->mount_state = CEPH_MOUNT_MOUNTING; - client->mount_args = args; + fsc->mount_options = fsopt; - client->msgr = NULL; + fsc->sb = NULL; + fsc->mount_state = CEPH_MOUNT_MOUNTING; - client->auth_err = 0; - atomic_long_set(&client->writeback_count, 0); + atomic_long_set(&fsc->writeback_count, 0); - err = bdi_init(&client->backing_dev_info); + err = bdi_init(&fsc->backing_dev_info); if (err < 0) - goto fail; + goto fail_client; err = -ENOMEM; - client->wb_wq = create_workqueue("ceph-writeback"); - if (client->wb_wq == NULL) + /* + * The number of concurrent works can be high but they don't need + * to be processed in parallel, limit concurrency. + */ + fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); + if (fsc->wb_wq == NULL) goto fail_bdi; - client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); - if (client->pg_inv_wq == NULL) + fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); + if (fsc->pg_inv_wq == NULL) goto fail_wb_wq; - client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); - if (client->trunc_wq == NULL) + fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); + if (fsc->trunc_wq == NULL) goto fail_pg_inv_wq; /* set up mempools */ err = -ENOMEM; - client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, - client->mount_args->wsize >> PAGE_CACHE_SHIFT); - if (!client->wb_pagevec_pool) + page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT; + size = sizeof (struct page *) * (page_count ? page_count : 1); + fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); + if (!fsc->wb_pagevec_pool) goto fail_trunc_wq; + /* setup fscache */ + if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) && + (ceph_fscache_register_fs(fsc) != 0)) + goto fail_fscache; + /* caps */ - client->min_caps = args->max_readdir; - ceph_adjust_min_caps(client->min_caps); + fsc->min_caps = fsopt->max_readdir; - /* subsystems */ - err = ceph_monc_init(&client->monc, client); - if (err < 0) - goto fail_mempool; - err = ceph_osdc_init(&client->osdc, client); - if (err < 0) - goto fail_monc; - err = ceph_mdsc_init(&client->mdsc, client); - if (err < 0) - goto fail_osdc; - return client; - -fail_osdc: - ceph_osdc_stop(&client->osdc); -fail_monc: - ceph_monc_stop(&client->monc); -fail_mempool: - mempool_destroy(client->wb_pagevec_pool); + return fsc; + +fail_fscache: + ceph_fscache_unregister_fs(fsc); fail_trunc_wq: - destroy_workqueue(client->trunc_wq); + destroy_workqueue(fsc->trunc_wq); fail_pg_inv_wq: - destroy_workqueue(client->pg_inv_wq); + destroy_workqueue(fsc->pg_inv_wq); fail_wb_wq: - destroy_workqueue(client->wb_wq); + destroy_workqueue(fsc->wb_wq); fail_bdi: - bdi_destroy(&client->backing_dev_info); + bdi_destroy(&fsc->backing_dev_info); +fail_client: + ceph_destroy_client(fsc->client); fail: - kfree(client); + kfree(fsc); return ERR_PTR(err); } -static void ceph_destroy_client(struct ceph_client *client) +static void destroy_fs_client(struct ceph_fs_client *fsc) { - dout("destroy_client %p\n", client); + dout("destroy_fs_client %p\n", fsc); - /* unmount */ - ceph_mdsc_stop(&client->mdsc); - ceph_osdc_stop(&client->osdc); - - /* - * make sure mds and osd connections close out before destroying - * the auth module, which is needed to free those connections' - * ceph_authorizers. - */ - ceph_msgr_flush(); + ceph_fscache_unregister_fs(fsc); - ceph_monc_stop(&client->monc); + destroy_workqueue(fsc->wb_wq); + destroy_workqueue(fsc->pg_inv_wq); + destroy_workqueue(fsc->trunc_wq); - ceph_adjust_min_caps(-client->min_caps); + bdi_destroy(&fsc->backing_dev_info); - ceph_debugfs_client_cleanup(client); - destroy_workqueue(client->wb_wq); - destroy_workqueue(client->pg_inv_wq); - destroy_workqueue(client->trunc_wq); + mempool_destroy(fsc->wb_pagevec_pool); - bdi_destroy(&client->backing_dev_info); + destroy_mount_options(fsc->mount_options); - if (client->msgr) - ceph_messenger_destroy(client->msgr); - mempool_destroy(client->wb_pagevec_pool); + ceph_fs_debugfs_cleanup(fsc); - destroy_mount_args(client->mount_args); + ceph_destroy_client(fsc->client); - kfree(client); - dout("destroy_client %p done\n", client); + kfree(fsc); + dout("destroy_fs_client %p done\n", fsc); } /* - * Initially learn our fsid, or verify an fsid matches. + * caches */ -int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) +struct kmem_cache *ceph_inode_cachep; +struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_dentry_cachep; +struct kmem_cache *ceph_file_cachep; + +static void ceph_inode_init_once(void *foo) { - if (client->have_fsid) { - if (ceph_fsid_compare(&client->fsid, fsid)) { - pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT, - PR_FSID(&client->fsid), PR_FSID(fsid)); - return -1; - } - } else { - pr_info("client%lld fsid " FSID_FORMAT "\n", - client->monc.auth->global_id, PR_FSID(fsid)); - memcpy(&client->fsid, fsid, sizeof(*fsid)); - ceph_debugfs_client_init(client); - client->have_fsid = true; - } + struct ceph_inode_info *ci = foo; + inode_init_once(&ci->vfs_inode); +} + +static int __init init_caches(void) +{ + int error = -ENOMEM; + + ceph_inode_cachep = kmem_cache_create("ceph_inode_info", + sizeof(struct ceph_inode_info), + __alignof__(struct ceph_inode_info), + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + ceph_inode_init_once); + if (ceph_inode_cachep == NULL) + return -ENOMEM; + + ceph_cap_cachep = KMEM_CACHE(ceph_cap, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_cachep == NULL) + goto bad_cap; + + ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_dentry_cachep == NULL) + goto bad_dentry; + + ceph_file_cachep = KMEM_CACHE(ceph_file_info, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_file_cachep == NULL) + goto bad_file; + + if ((error = ceph_fscache_register())) + goto bad_file; + return 0; +bad_file: + kmem_cache_destroy(ceph_dentry_cachep); +bad_dentry: + kmem_cache_destroy(ceph_cap_cachep); +bad_cap: + kmem_cache_destroy(ceph_inode_cachep); + return error; } +static void destroy_caches(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + + kmem_cache_destroy(ceph_inode_cachep); + kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_dentry_cachep); + kmem_cache_destroy(ceph_file_cachep); + + ceph_fscache_unregister(); +} + + /* - * true if we have the mon map (and have thus joined the cluster) + * ceph_umount_begin - initiate forced umount. Tear down down the + * mount, skipping steps that may hang while waiting for server(s). */ -static int have_mon_and_osd_map(struct ceph_client *client) +static void ceph_umount_begin(struct super_block *sb) { - return client->monc.monmap && client->monc.monmap->epoch && - client->osdc.osdmap && client->osdc.osdmap->epoch; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + dout("ceph_umount_begin - starting forced umount\n"); + if (!fsc) + return; + fsc->mount_state = CEPH_MOUNT_SHUTDOWN; + return; } +static const struct super_operations ceph_super_ops = { + .alloc_inode = ceph_alloc_inode, + .destroy_inode = ceph_destroy_inode, + .write_inode = ceph_write_inode, + .drop_inode = ceph_drop_inode, + .sync_fs = ceph_sync_fs, + .put_super = ceph_put_super, + .show_options = ceph_show_options, + .statfs = ceph_statfs, + .umount_begin = ceph_umount_begin, +}; + /* * Bootstrap mount by opening the root directory. Note the mount * @started time from caller, and time out if this takes too long. */ -static struct dentry *open_root_dentry(struct ceph_client *client, +static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, const char *path, unsigned long started) { - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req = NULL; int err; struct dentry *root; @@ -751,125 +739,113 @@ static struct dentry *open_root_dentry(struct ceph_client *client, req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; - req->r_timeout = client->mount_args->mount_timeout * HZ; + req->r_timeout = fsc->client->options->mount_timeout * HZ; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err == 0) { - dout("open_root_inode success\n"); - if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && - client->sb->s_root == NULL) - root = d_alloc_root(req->r_target_inode); - else - root = d_obtain_alias(req->r_target_inode); + struct inode *inode = req->r_target_inode; req->r_target_inode = NULL; + dout("open_root_inode success\n"); + if (ceph_ino(inode) == CEPH_INO_ROOT && + fsc->sb->s_root == NULL) { + root = d_make_root(inode); + if (!root) { + root = ERR_PTR(-ENOMEM); + goto out; + } + } else { + root = d_obtain_alias(inode); + } + ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); } else { root = ERR_PTR(err); } +out: ceph_mdsc_put_request(req); return root; } + + + /* * mount: join the ceph cluster, and open root directory. */ -static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, const char *path) { - struct ceph_entity_addr *myaddr = NULL; int err; - unsigned long timeout = client->mount_args->mount_timeout * HZ; unsigned long started = jiffies; /* note the start time */ struct dentry *root; + int first = 0; /* first vfsmount for this super_block */ dout("mount start\n"); - mutex_lock(&client->mount_mutex); - - /* initialize the messenger */ - if (client->msgr == NULL) { - if (ceph_test_opt(client, MYIP)) - myaddr = &client->mount_args->my_addr; - client->msgr = ceph_messenger_create(myaddr); - if (IS_ERR(client->msgr)) { - err = PTR_ERR(client->msgr); - client->msgr = NULL; - goto out; - } - client->msgr->nocrc = ceph_test_opt(client, NOCRC); - } + mutex_lock(&fsc->client->mount_mutex); - /* open session, and wait for mon, mds, and osd maps */ - err = ceph_monc_open_session(&client->monc); + err = __ceph_open_session(fsc->client, started); if (err < 0) goto out; - while (!have_mon_and_osd_map(client)) { - err = -EIO; - if (timeout && time_after_eq(jiffies, started + timeout)) - goto out; - - /* wait */ - dout("mount waiting for mon_map\n"); - err = wait_event_interruptible_timeout(client->auth_wq, - have_mon_and_osd_map(client) || (client->auth_err < 0), - timeout); - if (err == -EINTR || err == -ERESTARTSYS) - goto out; - if (client->auth_err < 0) { - err = client->auth_err; - goto out; - } - } - dout("mount opening root\n"); - root = open_root_dentry(client, "", started); + root = open_root_dentry(fsc, "", started); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; } - if (client->sb->s_root) + if (fsc->sb->s_root) { dput(root); - else - client->sb->s_root = root; + } else { + fsc->sb->s_root = root; + first = 1; + + err = ceph_fs_debugfs_init(fsc); + if (err < 0) + goto fail; + } if (path[0] == 0) { dget(root); } else { dout("mount opening base mountpoint\n"); - root = open_root_dentry(client, path, started); + root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); - dput(client->sb->s_root); - client->sb->s_root = NULL; - goto out; + goto fail; } } - mnt->mnt_root = root; - mnt->mnt_sb = client->sb; - - client->mount_state = CEPH_MOUNT_MOUNTED; + fsc->mount_state = CEPH_MOUNT_MOUNTED; dout("mount success\n"); - err = 0; + mutex_unlock(&fsc->client->mount_mutex); + return root; out: - mutex_unlock(&client->mount_mutex); - return err; + mutex_unlock(&fsc->client->mount_mutex); + return ERR_PTR(err); + +fail: + if (first) { + dput(fsc->sb->s_root); + fsc->sb->s_root = NULL; + } + goto out; } static int ceph_set_super(struct super_block *s, void *data) { - struct ceph_client *client = data; + struct ceph_fs_client *fsc = data; int ret; dout("set_super %p data %p\n", s, data); - s->s_flags = client->mount_args->sb_flags; + s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ - s->s_fs_info = client; - client->sb = s; + s->s_xattr = ceph_xattr_handlers; + s->s_fs_info = fsc; + fsc->sb = s; s->s_op = &ceph_super_ops; s->s_export_op = &ceph_export_ops; @@ -884,7 +860,7 @@ static int ceph_set_super(struct super_block *s, void *data) fail: s->s_fs_info = NULL; - client->sb = NULL; + fsc->sb = NULL; return ret; } @@ -893,30 +869,23 @@ fail: */ static int ceph_compare_super(struct super_block *sb, void *data) { - struct ceph_client *new = data; - struct ceph_mount_args *args = new->mount_args; - struct ceph_client *other = ceph_sb_to_client(sb); - int i; + struct ceph_fs_client *new = data; + struct ceph_mount_options *fsopt = new->mount_options; + struct ceph_options *opt = new->client->options; + struct ceph_fs_client *other = ceph_sb_to_client(sb); dout("ceph_compare_super %p\n", sb); - if (args->flags & CEPH_OPT_FSID) { - if (ceph_fsid_compare(&args->fsid, &other->fsid)) { - dout("fsid doesn't match\n"); - return 0; - } - } else { - /* do we share (a) monitor? */ - for (i = 0; i < new->monc.monmap->num_mon; i++) - if (ceph_monmap_contains(other->monc.monmap, - &new->monc.monmap->mon_inst[i].addr)) - break; - if (i == new->monc.monmap->num_mon) { - dout("mon ip not part of monmap\n"); - return 0; - } - dout("mon ip matches existing sb %p\n", sb); + + if (compare_mount_options(fsopt, opt, other)) { + dout("monitor(s)/mount options don't match\n"); + return 0; + } + if ((opt->flags & CEPH_OPT_FSID) && + ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { + dout("fsid doesn't match\n"); + return 0; } - if (args->sb_flags != other->mount_args->sb_flags) { + if (fsopt->sb_flags != other->mount_options->sb_flags) { dout("flags differ\n"); return 0; } @@ -928,139 +897,148 @@ static int ceph_compare_super(struct super_block *sb, void *data) */ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) +static int ceph_register_bdi(struct super_block *sb, + struct ceph_fs_client *fsc) { int err; - /* set ra_pages based on rsize mount option? */ - if (client->mount_args->rsize >= PAGE_CACHE_SIZE) - client->backing_dev_info.ra_pages = - (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) + /* set ra_pages based on rasize mount option? */ + if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE) + fsc->backing_dev_info.ra_pages = + (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", + else + fsc->backing_dev_info.ra_pages = + default_backing_dev_info.ra_pages; + + err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); if (!err) - sb->s_bdi = &client->backing_dev_info; + sb->s_bdi = &fsc->backing_dev_info; return err; } -static int ceph_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *ceph_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { struct super_block *sb; - struct ceph_client *client; + struct ceph_fs_client *fsc; + struct dentry *res; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; const char *path = NULL; - struct ceph_mount_args *args; + struct ceph_mount_options *fsopt = NULL; + struct ceph_options *opt = NULL; - dout("ceph_get_sb\n"); - args = parse_mount_args(flags, data, dev_name, &path); - if (IS_ERR(args)) { - err = PTR_ERR(args); + dout("ceph_mount\n"); + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + flags |= MS_POSIXACL; +#endif + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + if (err < 0) { + res = ERR_PTR(err); goto out_final; } /* create client (which we may/may not use) */ - client = ceph_create_client(args); - if (IS_ERR(client)) { - err = PTR_ERR(client); + fsc = create_fs_client(fsopt, opt); + if (IS_ERR(fsc)) { + res = ERR_CAST(fsc); + destroy_mount_options(fsopt); + ceph_destroy_options(opt); goto out_final; } - if (client->mount_args->flags & CEPH_OPT_NOSHARE) + err = ceph_mdsc_init(fsc); + if (err < 0) { + res = ERR_PTR(err); + goto out; + } + + if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, client); + sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc); if (IS_ERR(sb)) { - err = PTR_ERR(sb); + res = ERR_CAST(sb); goto out; } - if (ceph_sb_to_client(sb) != client) { - ceph_destroy_client(client); - client = ceph_sb_to_client(sb); - dout("get_sb got existing client %p\n", client); + if (ceph_sb_to_client(sb) != fsc) { + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); + fsc = ceph_sb_to_client(sb); + dout("get_sb got existing client %p\n", fsc); } else { - dout("get_sb using new client %p\n", client); - err = ceph_register_bdi(sb, client); - if (err < 0) + dout("get_sb using new client %p\n", fsc); + err = ceph_register_bdi(sb, fsc); + if (err < 0) { + res = ERR_PTR(err); goto out_splat; + } } - err = ceph_mount(client, mnt, path); - if (err < 0) + res = ceph_real_mount(fsc, path); + if (IS_ERR(res)) goto out_splat; - dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, - mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode)); - return 0; + dout("root %p inode %p ino %llx.%llx\n", res, + res->d_inode, ceph_vinop(res->d_inode)); + return res; out_splat: - ceph_mdsc_close_sessions(&client->mdsc); + ceph_mdsc_close_sessions(fsc->mdsc); deactivate_locked_super(sb); goto out_final; out: - ceph_destroy_client(client); + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); out_final: - dout("ceph_get_sb fail %d\n", err); - return err; + dout("ceph_mount fail %ld\n", PTR_ERR(res)); + return res; } static void ceph_kill_sb(struct super_block *s) { - struct ceph_client *client = ceph_sb_to_client(s); + struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("kill_sb %p\n", s); - ceph_mdsc_pre_umount(&client->mdsc); + ceph_mdsc_pre_umount(fsc->mdsc); kill_anon_super(s); /* will call put_super after sb is r/o */ - ceph_destroy_client(client); + ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); } static struct file_system_type ceph_fs_type = { .owner = THIS_MODULE, .name = "ceph", - .get_sb = ceph_get_sb, + .mount = ceph_mount, .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE, }; +MODULE_ALIAS_FS("ceph"); #define _STRINGIFY(x) #x #define STRINGIFY(x) _STRINGIFY(x) static int __init init_ceph(void) { - int ret = 0; - - ret = ceph_debugfs_init(); - if (ret < 0) - goto out; - - ret = ceph_msgr_init(); - if (ret < 0) - goto out_debugfs; - - ret = init_caches(); + int ret = init_caches(); if (ret) - goto out_msgr; - - ceph_caps_init(); + goto out; + ceph_flock_init(); + ceph_xattr_init(); ret = register_filesystem(&ceph_fs_type); if (ret) goto out_icache; - pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", - CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, - CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, - CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); + pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); + return 0; out_icache: + ceph_xattr_exit(); destroy_caches(); -out_msgr: - ceph_msgr_exit(); -out_debugfs: - ceph_debugfs_cleanup(); out: return ret; } @@ -1069,10 +1047,8 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); - ceph_caps_finalize(); + ceph_xattr_exit(); destroy_caches(); - ceph_msgr_exit(); - ceph_debugfs_cleanup(); } module_init(init_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 10a4a406e88..12b20744e38 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1,7 +1,7 @@ #ifndef _FS_CEPH_SUPER_H #define _FS_CEPH_SUPER_H -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> #include <asm/unaligned.h> #include <linux/backing-dev.h> @@ -13,136 +13,73 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/slab.h> +#include <linux/posix_acl.h> -#include "types.h" -#include "messenger.h" -#include "msgpool.h" -#include "mon_client.h" -#include "mds_client.h" -#include "osd_client.h" -#include "ceph_fs.h" +#include <linux/ceph/libceph.h> + +#ifdef CONFIG_CEPH_FSCACHE +#include <linux/fscache.h> +#endif /* f_type in struct statfs */ #define CEPH_SUPER_MAGIC 0x00c36400 /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ -#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ +#define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) -/* - * mount options - */ -#define CEPH_OPT_FSID (1<<0) -#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ -#define CEPH_OPT_MYIP (1<<2) /* specified my ip */ -#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */ -#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ -#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */ -#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ +#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ +#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ +#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ +#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ +#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ -#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) -#define ceph_set_opt(client, opt) \ - (client)->mount_args->flags |= CEPH_OPT_##opt; -#define ceph_test_opt(client, opt) \ - (!!((client)->mount_args->flags & CEPH_OPT_##opt)) +#define ceph_set_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; +#define ceph_test_mount_opt(fsc, opt) \ + (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) +#define CEPH_RSIZE_DEFAULT 0 /* max read size */ +#define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) +#define CEPH_SNAPDIRNAME_DEFAULT ".snap" -struct ceph_mount_args { - int sb_flags; +struct ceph_mount_options { int flags; - struct ceph_fsid fsid; - struct ceph_entity_addr my_addr; - int num_mon; - struct ceph_entity_addr *mon_addr; - int mount_timeout; - int osd_idle_ttl; - int osd_timeout; - int osd_keepalive_timeout; - int wsize; - int rsize; /* max readahead */ + int sb_flags; + + int wsize; /* max write size */ + int rsize; /* max read size */ + int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ - char *snapdir_name; /* default ".snap" */ - char *name; - char *secret; -}; - -/* - * defaults - */ -#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 -#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ -#define CEPH_OSD_KEEPALIVE_DEFAULT 5 -#define CEPH_OSD_IDLE_TTL_DEFAULT 60 -#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ -#define CEPH_MAX_READDIR_DEFAULT 1024 -#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) - -#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) -#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) - -#define CEPH_SNAPDIRNAME_DEFAULT ".snap" -#define CEPH_AUTH_NAME_DEFAULT "guest" -/* - * Delay telling the MDS we no longer want caps, in case we reopen - * the file. Delay a minimum amount of time, even if we send a cap - * message for some other reason. Otherwise, take the oppotunity to - * update the mds to avoid sending another message later. - */ -#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ -#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ - -#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) - -/* mount state */ -enum { - CEPH_MOUNT_MOUNTING, - CEPH_MOUNT_MOUNTED, - CEPH_MOUNT_UNMOUNTING, - CEPH_MOUNT_UNMOUNTED, - CEPH_MOUNT_SHUTDOWN, -}; -/* - * subtract jiffies - */ -static inline unsigned long time_sub(unsigned long a, unsigned long b) -{ - BUG_ON(time_after(b, a)); - return (long)a - (long)b; -} - -/* - * per-filesystem client state - * - * possibly shared by multiple mount points, if they are - * mounting the same ceph filesystem/cluster. - */ -struct ceph_client { - struct ceph_fsid fsid; - bool have_fsid; + /* + * everything above this point can be memcmp'd; everything below + * is handled in compare_mount_options() + */ - struct mutex mount_mutex; /* serialize mount attempts */ - struct ceph_mount_args *mount_args; + char *snapdir_name; /* default ".snap" */ +}; +struct ceph_fs_client { struct super_block *sb; - unsigned long mount_state; - wait_queue_head_t auth_wq; - - int auth_err; + struct ceph_mount_options *mount_options; + struct ceph_client *client; + unsigned long mount_state; int min_caps; /* min caps i added */ - struct ceph_messenger *msgr; /* messenger instance */ - struct ceph_mon_client monc; - struct ceph_mds_client mdsc; - struct ceph_osd_client osdc; + struct ceph_mds_client *mdsc; /* writeback */ mempool_t *wb_pagevec_pool; @@ -154,14 +91,19 @@ struct ceph_client { struct backing_dev_info backing_dev_info; #ifdef CONFIG_DEBUG_FS - struct dentry *debugfs_monmap; - struct dentry *debugfs_mdsmap, *debugfs_osdmap; - struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps; + struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; + struct dentry *debugfs_mdsc, *debugfs_mdsmap; +#endif + +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + struct workqueue_struct *revalidate_wq; #endif }; + /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -206,12 +148,11 @@ struct ceph_cap_snap { int issued, dirty; struct ceph_snap_context *context; - mode_t mode; - uid_t uid; - gid_t gid; + umode_t mode; + kuid_t uid; + kgid_t gid; - void *xattr_blob; - int xattr_len; + struct ceph_buffer *xattr_blob; u64 xattr_version; u64 size; @@ -223,8 +164,11 @@ struct ceph_cap_snap { static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) { - if (atomic_dec_and_test(&capsnap->nref)) + if (atomic_dec_and_test(&capsnap->nref)) { + if (capsnap->xattr_blob) + ceph_buffer_put(capsnap->xattr_blob); kfree(capsnap); + } } /* @@ -267,6 +211,20 @@ struct ceph_inode_xattr { int should_free_val; }; +/* + * Ceph dentry state + */ +struct ceph_dentry_info { + struct ceph_mds_session *lease_session; + u32 lease_gen, lease_shared_gen; + u32 lease_seq; + unsigned long lease_renew_after, lease_renew_from; + struct list_head lru; + struct dentry *dentry; + u64 time; + u64 offset; +}; + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -288,20 +246,19 @@ struct ceph_inode_xattrs_info { /* * Ceph inode. */ -#define CEPH_I_COMPLETE 1 /* we have complete directory cached */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ - struct ceph_inode_info { struct ceph_vino i_vino; /* ceph ino + snap */ + spinlock_t i_ceph_lock; + u64 i_version; u32 i_time_warp_seq; unsigned i_ceph_flags; - unsigned long i_release_count; + atomic_t i_release_count; + atomic_t i_complete_count; + struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; char *i_symlink; @@ -309,14 +266,13 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; - /* capabilities. protected _both_ by i_lock and cap->session's + /* capabilities. protected _both_ by i_ceph_lock and cap->session's * s_mutex. */ struct rb_root i_caps; /* cap list */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ @@ -331,16 +287,15 @@ struct ceph_inode_info { unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ - int i_cap_exporting_mds; /* to handle cap migration between */ - unsigned i_cap_exporting_mseq; /* mds's. */ - unsigned i_cap_exporting_issued; struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ - struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */ + struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or + dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ + struct mutex i_truncate_mutex; u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ @@ -352,12 +307,10 @@ struct ceph_inode_info { /* held references to caps */ int i_pin_ref; - int i_rd_ref, i_rdcache_ref, i_wr_ref; + int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; u32 i_shared_gen; /* increment each time we get FILE_SHARED */ - u32 i_rdcache_gen; /* we increment this each time we get - FILE_CACHE. If it's non-zero, we - _may_ have cached pages. */ + u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ struct list_head i_unsafe_writes; /* uncommitted sync writes */ @@ -374,6 +327,11 @@ struct ceph_inode_info { struct work_struct i_vmtruncate_work; +#ifdef CONFIG_CEPH_FSCACHE + struct fscache_cookie *fscache; + u32 i_fscache_gen; /* sequence, for delayed fscache validate */ + struct work_struct i_revalidate_work; +#endif struct inode vfs_inode; /* at end */ }; @@ -382,98 +340,67 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode) return container_of(inode, struct ceph_inode_info, vfs_inode); } -static inline void ceph_i_clear(struct inode *inode, unsigned mask) +static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - - spin_lock(&inode->i_lock); - ci->i_ceph_flags &= ~mask; - spin_unlock(&inode->i_lock); + return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } -static inline void ceph_i_set(struct inode *inode, unsigned mask) +static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) { - struct ceph_inode_info *ci = ceph_inode(inode); - - spin_lock(&inode->i_lock); - ci->i_ceph_flags |= mask; - spin_unlock(&inode->i_lock); + return (struct ceph_fs_client *)sb->s_fs_info; } -static inline bool ceph_i_test(struct inode *inode, unsigned mask) +static inline struct ceph_vino ceph_vino(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - bool r; - - smp_mb(); - r = (ci->i_ceph_flags & mask) == mask; - return r; + return ceph_inode(inode)->i_vino; } - -/* find a specific frag @f */ -extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, - u32 f); - /* - * choose fragment for value @v. copy frag content to pfrag, if leaf - * exists - */ -extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, - struct ceph_inode_frag *pfrag, - int *found); - -/* - * Ceph dentry state + * ino_t is <64 bits on many architectures, blech. + * + * i_ino (kernel inode) st_ino (userspace) + * i386 32 32 + * x86_64+ino32 64 32 + * x86_64 64 64 */ -struct ceph_dentry_info { - struct ceph_mds_session *lease_session; - u32 lease_gen, lease_shared_gen; - u32 lease_seq; - unsigned long lease_renew_after, lease_renew_from; - struct list_head lru; - struct dentry *dentry; - u64 time; - u64 offset; -}; - -static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) -{ - return (struct ceph_dentry_info *)dentry->d_fsdata; -} - -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) +static inline u32 ceph_ino_to_ino32(__u64 vino) { - return ((loff_t)frag << 32) | (loff_t)off; + u32 ino = vino & 0xffffffff; + ino ^= vino >> 32; + if (!ino) + ino = 2; + return ino; } /* - * ino_t is <64 bits on many architectures, blech. - * - * don't include snap in ino hash, at least for now. + * kernel i_ino value */ static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) { - ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ #if BITS_PER_LONG == 32 - ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; - if (!ino) - ino = 1; + return ceph_ino_to_ino32(vino.ino); +#else + return (ino_t)vino.ino; #endif - return ino; } -static inline int ceph_set_ino_cb(struct inode *inode, void *data) +/* + * user-visible ino (stat, filldir) + */ +#if BITS_PER_LONG == 32 +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) { - ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; - inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); - return 0; + return ino; } - -static inline struct ceph_vino ceph_vino(struct inode *inode) +#else +static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) { - return ceph_inode(inode)->i_vino; + if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) + ino = ceph_ino_to_ino32(ino); + return ino; } +#endif + /* for printf-style formatting */ #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap @@ -504,6 +431,63 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* + * Ceph inode. + */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ + +static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, + int release_count) +{ + atomic_set(&ci->i_complete_count, release_count); +} + +static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) +{ + atomic_inc(&ci->i_release_count); +} + +static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) +{ + return atomic_read(&ci->i_complete_count) == + atomic_read(&ci->i_release_count); +} + +static inline void ceph_dir_clear_complete(struct inode *inode) +{ + __ceph_dir_clear_complete(ceph_inode(inode)); +} + +static inline bool ceph_dir_is_complete(struct inode *inode) +{ + return __ceph_dir_is_complete(ceph_inode(inode)); +} + + +/* find a specific frag @f */ +extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, + u32 f); + +/* + * choose fragment for value @v. copy frag content to pfrag, if leaf + * exists + */ +extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, + struct ceph_inode_frag *pfrag, + int *found); + +static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) +{ + return (struct ceph_dentry_info *)dentry->d_fsdata; +} + +static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) +{ + return ((loff_t)frag << 32) | (loff_t)off; +} + +/* * caps helpers */ static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci) @@ -519,9 +503,9 @@ extern int __ceph_caps_issued_other(struct ceph_inode_info *ci, static inline int ceph_caps_issued(struct ceph_inode_info *ci) { int issued; - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); return issued; } @@ -529,9 +513,9 @@ static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) { int r; - spin_lock(&ci->vfs_inode.i_lock); + spin_lock(&ci->i_ceph_lock); r = __ceph_caps_issued_mask(ci, mask, touch); - spin_unlock(&ci->vfs_inode.i_lock); + spin_unlock(&ci->i_ceph_lock); return r; } @@ -539,8 +523,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) { return ci->i_dirty_caps | ci->i_flushing_caps; } -extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, + struct ceph_cap *ocap, int mask); extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); @@ -560,43 +546,39 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) /* what the mds thinks we want */ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); -extern void ceph_caps_init(void); -extern void ceph_caps_finalize(void); -extern void ceph_adjust_min_caps(int delta); -extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need); -extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx); -extern void ceph_reservation_status(struct ceph_client *client, +extern void ceph_caps_init(struct ceph_mds_client *mdsc); +extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); +extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); +extern void ceph_reserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx, int need); +extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, int *reserved, int *min); -static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) -{ - return (struct ceph_client *)inode->i_sb->s_fs_info; -} - -static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) -{ - return (struct ceph_client *)sb->s_fs_info; -} /* * we keep buffered readdir results attached to file->private_data */ +#define CEPH_F_SYNC 1 +#define CEPH_F_ATEND 2 + struct ceph_file_info { - int fmode; /* initialized on open */ + short fmode; /* initialized on open */ + short flags; /* CEPH_F_* */ /* readdir: position within the dir */ u32 frag; struct ceph_mds_request *last_readdir; - int at_end; /* readdir: position within a frag */ unsigned offset; /* offset of last chunk, adjusted for . and .. */ - u64 next_offset; /* offset of next chunk (last_name's + 1) */ + unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ - unsigned long dir_release_count; + int dir_release_count; /* used for -o dirstat read() on directory thing */ char *dir_info; @@ -606,51 +588,6 @@ struct ceph_file_info { /* - * snapshots - */ - -/* - * A "snap context" is the set of existing snapshots when we - * write data. It is used by the OSD to guide its COW behavior. - * - * The ceph_snap_context is refcounted, and attached to each dirty - * page, indicating which context the dirty data belonged when it was - * dirtied. - */ -struct ceph_snap_context { - atomic_t nref; - u64 seq; - int num_snaps; - u64 snaps[]; -}; - -static inline struct ceph_snap_context * -ceph_get_snap_context(struct ceph_snap_context *sc) -{ - /* - printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)+1); - */ - if (sc) - atomic_inc(&sc->nref); - return sc; -} - -static inline void ceph_put_snap_context(struct ceph_snap_context *sc) -{ - if (!sc) - return; - /* - printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)-1); - */ - if (atomic_dec_and_test(&sc->nref)) { - /*printk(" deleting snap_context %p\n", sc);*/ - kfree(sc); - } -} - -/* * A "snap realm" describes a subset of the file hierarchy sharing * the same set of snapshots that apply to it. The realms themselves * are organized into a hierarchy, such that children inherit (some of) @@ -669,9 +606,9 @@ struct ceph_snap_realm { u64 parent_since; /* snapid when our current parent became so */ u64 *prior_parent_snaps; /* snaps inherited from any parents we */ - int num_prior_parent_snaps; /* had prior to parent_since */ + u32 num_prior_parent_snaps; /* had prior to parent_since */ u64 *snaps; /* snaps specific to this realm */ - int num_snaps; + u32 num_snaps; struct ceph_snap_realm *parent; struct list_head children; /* list of child realms */ @@ -679,6 +616,8 @@ struct ceph_snap_realm { struct list_head empty_item; /* if i have ref==0 */ + struct list_head dirty_item; /* if realm needs new context */ + /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; @@ -686,16 +625,33 @@ struct ceph_snap_realm { spinlock_t inodes_with_caps_lock; }; - - -/* - * calculate the number of pages a given length and offset map onto, - * if we align the data. - */ -static inline int calc_pages_for(u64 off, u64 len) +static inline int default_congestion_kb(void) { - return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - - (off >> PAGE_CACHE_SHIFT); + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; } @@ -728,28 +684,12 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) ci_item)->writing; } - -/* super.c */ -extern struct kmem_cache *ceph_inode_cachep; -extern struct kmem_cache *ceph_cap_cachep; -extern struct kmem_cache *ceph_dentry_cachep; -extern struct kmem_cache *ceph_file_cachep; - -extern const char *ceph_msg_type_name(int type); -extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); - -#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \ - "%02x%02x%02x%02x%02x%02x" -#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \ - (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \ - (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \ - (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15] - /* inode.c */ extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_destroy_inode(struct inode *inode); +extern int ceph_drop_inode(struct inode *inode); extern struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino); @@ -783,43 +723,87 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, /* xattr.c */ extern int ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int); +ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); +int __ceph_removexattr(struct dentry *, const char *); extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern int ceph_removexattr(struct dentry *, const char *); extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); +extern void __init ceph_xattr_init(void); +extern void ceph_xattr_exit(void); + +/* acl.c */ +extern const struct xattr_handler *ceph_xattr_handlers[]; + +#ifdef CONFIG_CEPH_FS_POSIX_ACL + +struct posix_acl *ceph_get_acl(struct inode *, int); +int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ceph_init_acl(struct dentry *, struct inode *, struct inode *); + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ + forget_all_cached_acls(inode); +} + +#else + +#define ceph_get_acl NULL +#define ceph_set_acl NULL + +static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, + struct inode *dir) +{ + return 0; +} + +static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) +{ + return 0; +} + +static inline void ceph_forget_all_cached_acls(struct inode *inode) +{ +} + +#endif /* caps.c */ extern const char *ceph_cap_string(int c); extern void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg); -extern int ceph_add_cap(struct inode *inode, - struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, - unsigned cap, unsigned seq, u64 realmino, int flags, - struct ceph_cap_reservation *caps_reservation); -extern void __ceph_remove_cap(struct ceph_cap *cap); -static inline void ceph_remove_cap(struct ceph_cap *cap) -{ - struct inode *inode = &cap->ci->vfs_inode; - spin_lock(&inode->i_lock); - __ceph_remove_cap(cap); - spin_unlock(&inode->i_lock); -} -extern void ceph_put_cap(struct ceph_cap *cap); - +extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, + struct ceph_cap_reservation *ctx); +extern void ceph_add_cap(struct inode *inode, + struct ceph_mds_session *session, u64 cap_id, + int fmode, unsigned issued, unsigned wanted, + unsigned cap, unsigned seq, u64 realmino, int flags, + struct ceph_cap **new_cap); +extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); +extern void ceph_put_cap(struct ceph_mds_client *mdsc, + struct ceph_cap *cap); +extern int ceph_is_any_caps(struct inode *inode); + +extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, + u64 cap_id, u32 migrate_seq, u32 issue_seq); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); -extern int ceph_fsync(struct file *file, int datasync); +extern int ceph_fsync(struct file *file, loff_t start, loff_t end, + int datasync); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); +extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, + int mds); extern int ceph_get_cap_mds(struct inode *inode); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); extern void __ceph_flush_snaps(struct ceph_inode_info *ci, - struct ceph_mds_session **psession); + struct ceph_mds_session **psession, + int again); extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session); extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); @@ -847,20 +831,22 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); /* file.c */ extern const struct file_operations ceph_file_fops; extern const struct address_space_operations ceph_aops; + extern int ceph_open(struct inode *inode, struct file *file); -extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, - struct nameidata *nd, int mode, - int locked_dir); +extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned flags, umode_t mode, + int *opened); extern int ceph_release(struct inode *inode, struct file *filp); -extern void ceph_release_page_vector(struct page **pages, int num_pages); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct inode_operations ceph_dir_iops; -extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, +extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); @@ -868,6 +854,8 @@ extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); +extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); /* * our d_ops vary depending on whether the inode is live, @@ -882,18 +870,22 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* export.c */ extern const struct export_operations ceph_export_ops; -/* debugfs.c */ -extern int ceph_debugfs_init(void); -extern void ceph_debugfs_cleanup(void); -extern int ceph_debugfs_client_init(struct ceph_client *client); -extern void ceph_debugfs_client_cleanup(struct ceph_client *client); +/* locks.c */ +extern __init void ceph_flock_init(void); +extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); +extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); +extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); +extern int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, + int num_flock_locks); +extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks); +extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); -static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) -{ - if (dentry && dentry->d_parent) - return dentry->d_parent->d_inode; - - return NULL; -} +/* debugfs.c */ +extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); +extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); #endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/types.h b/fs/ceph/types.h deleted file mode 100644 index 28b35a005ec..00000000000 --- a/fs/ceph/types.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _FS_CEPH_TYPES_H -#define _FS_CEPH_TYPES_H - -/* needed before including ceph_fs.h */ -#include <linux/in.h> -#include <linux/types.h> -#include <linux/fcntl.h> -#include <linux/string.h> - -#include "ceph_fs.h" -#include "ceph_frag.h" -#include "ceph_hash.h" - -/* - * Identify inodes by both their ino AND snapshot id (a u64). - */ -struct ceph_vino { - u64 ino; - u64 snap; -}; - - -/* context for the caps reservation mechanism */ -struct ceph_cap_reservation { - int count; -}; - - -#endif diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 68aeebc6968..c9c2b887381 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1,15 +1,38 @@ -#include "ceph_debug.h" +#include <linux/ceph/ceph_debug.h> + #include "super.h" -#include "decode.h" +#include "mds_client.h" + +#include <linux/ceph/decode.h> #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> #include <linux/slab.h> +#define XATTR_CEPH_PREFIX "ceph." +#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) + +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr); + +/* + * List of handlers for synthetic system.* attributes. Other + * attributes are handled directly. + */ +const struct xattr_handler *ceph_xattr_handlers[] = { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL, +}; + static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, "ceph.", 5) || + return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -18,101 +41,235 @@ static bool ceph_is_valid_xattr(const char *name) * These define virtual xattrs exposing the recursive directory * statistics and layout metadata. */ -struct ceph_vxattr_cb { - bool readonly; +struct ceph_vxattr { char *name; + size_t name_size; /* strlen(name) + 1 (for '\0') */ size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t size); + bool readonly, hidden; + bool (*exists_cb)(struct ceph_inode_info *ci); }; +/* layouts */ + +static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci) +{ + size_t s; + char *p = (char *)&ci->i_layout; + + for (s = 0; s < sizeof(ci->i_layout); s++, p++) + if (*p) + return true; + return false; +} + +static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + const char *pool_name; + char buf[128]; + + dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); + down_read(&osdc->map_sem); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) { + size_t len = strlen(pool_name); + ret = snprintf(buf, sizeof(buf), + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); + if (!size) { + ret += len; + } else if (ret + len > size) { + ret = -ERANGE; + } else { + memcpy(val, buf, ret); + memcpy(val + ret, pool_name, len); + ret += len; + } + } else { + ret = snprintf(buf, sizeof(buf), + "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", + (unsigned long long)ceph_file_layout_su(ci->i_layout), + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), + (unsigned long long)ceph_file_layout_object_size(ci->i_layout), + (unsigned long long)pool); + if (size) { + if (ret <= size) + memcpy(val, buf, ret); + else + ret = -ERANGE; + } + } + up_read(&osdc->map_sem); + return ret; +} + +static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_su(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci, + char *val, size_t size) +{ + return snprintf(val, size, "%lld", + (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); +} + +static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, + char *val, size_t size) +{ + int ret; + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + struct ceph_osd_client *osdc = &fsc->client->osdc; + s64 pool = ceph_file_layout_pg_pool(ci->i_layout); + const char *pool_name; + + down_read(&osdc->map_sem); + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); + if (pool_name) + ret = snprintf(val, size, "%s", pool_name); + else + ret = snprintf(val, size, "%lld", (unsigned long long)pool); + up_read(&osdc->map_sem); + return ret; +} + /* directories */ -static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs); } -static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_files(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_files); } -static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_subdirs(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_subdirs); } -static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rentries(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs); } -static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rfiles(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rfiles); } -static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rsubdirs); } -static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, size_t size) { return snprintf(val, size, "%lld", ci->i_rbytes); } -static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, +static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec, + return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec, (long)ci->i_rctime.tv_nsec); } -static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { - { true, "ceph.dir.entries", ceph_vxattrcb_entries}, - { true, "ceph.dir.files", ceph_vxattrcb_files}, - { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, - { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, - { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, - { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, - { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, - { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, - { true, NULL, NULL } -}; -/* files */ +#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name +#define CEPH_XATTR_NAME2(_type, _name, _name2) \ + XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 -static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, - size_t size) -{ - int ret; +#define XATTR_NAME_CEPH(_type, _name) \ + { \ + .name = CEPH_XATTR_NAME(_type, _name), \ + .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ + .readonly = true, \ + .hidden = false, \ + .exists_cb = NULL, \ + } +#define XATTR_LAYOUT_FIELD(_type, _name, _field) \ + { \ + .name = CEPH_XATTR_NAME2(_type, _name, _field), \ + .name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \ + .getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \ + .readonly = false, \ + .hidden = true, \ + .exists_cb = ceph_vxattrcb_layout_exists, \ + } - ret = snprintf(val, size, - "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n", - (unsigned long long)ceph_file_layout_su(ci->i_layout), - (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), - (unsigned long long)ceph_file_layout_object_size(ci->i_layout)); - if (ceph_file_layout_pg_preferred(ci->i_layout)) - ret += snprintf(val + ret, size, "preferred_osd=%lld\n", - (unsigned long long)ceph_file_layout_pg_preferred( - ci->i_layout)); - return ret; -} +static struct ceph_vxattr ceph_dir_vxattrs[] = { + { + .name = "ceph.dir.layout", + .name_size = sizeof("ceph.dir.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = false, + .hidden = true, + .exists_cb = ceph_vxattrcb_layout_exists, + }, + XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), + XATTR_LAYOUT_FIELD(dir, layout, stripe_count), + XATTR_LAYOUT_FIELD(dir, layout, object_size), + XATTR_LAYOUT_FIELD(dir, layout, pool), + XATTR_NAME_CEPH(dir, entries), + XATTR_NAME_CEPH(dir, files), + XATTR_NAME_CEPH(dir, subdirs), + XATTR_NAME_CEPH(dir, rentries), + XATTR_NAME_CEPH(dir, rfiles), + XATTR_NAME_CEPH(dir, rsubdirs), + XATTR_NAME_CEPH(dir, rbytes), + XATTR_NAME_CEPH(dir, rctime), + { .name = NULL, 0 } /* Required table terminator */ +}; +static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ + +/* files */ -static struct ceph_vxattr_cb ceph_file_vxattrs[] = { - { true, "ceph.layout", ceph_vxattrcb_layout}, - { NULL, NULL } +static struct ceph_vxattr ceph_file_vxattrs[] = { + { + .name = "ceph.file.layout", + .name_size = sizeof("ceph.file.layout"), + .getxattr_cb = ceph_vxattrcb_layout, + .readonly = false, + .hidden = true, + .exists_cb = ceph_vxattrcb_layout_exists, + }, + XATTR_LAYOUT_FIELD(file, layout, stripe_unit), + XATTR_LAYOUT_FIELD(file, layout, stripe_count), + XATTR_LAYOUT_FIELD(file, layout, object_size), + XATTR_LAYOUT_FIELD(file, layout, pool), + { .name = NULL, 0 } /* Required table terminator */ }; +static size_t ceph_file_vxattrs_name_size; /* total size of all names */ -static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) +static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode) { if (S_ISDIR(inode->i_mode)) return ceph_dir_vxattrs; @@ -121,22 +278,67 @@ static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) return NULL; } -static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr, +static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + if (vxattrs == ceph_dir_vxattrs) + return ceph_dir_vxattrs_name_size; + if (vxattrs == ceph_file_vxattrs) + return ceph_file_vxattrs_name_size; + BUG(); + + return 0; +} + +/* + * Compute the aggregate size (including terminating '\0') of all + * virtual extended attribute names in the given vxattr table. + */ +static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) +{ + struct ceph_vxattr *vxattr; + size_t size = 0; + + for (vxattr = vxattrs; vxattr->name; vxattr++) + if (!vxattr->hidden) + size += vxattr->name_size; + + return size; +} + +/* Routines called at initialization and exit time */ + +void __init ceph_xattr_init(void) +{ + ceph_dir_vxattrs_name_size = vxattrs_name_size(ceph_dir_vxattrs); + ceph_file_vxattrs_name_size = vxattrs_name_size(ceph_file_vxattrs); +} + +void ceph_xattr_exit(void) +{ + ceph_dir_vxattrs_name_size = 0; + ceph_file_vxattrs_name_size = 0; +} + +static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, const char *name) { - do { - if (strcmp(vxattr->name, name) == 0) - return vxattr; - vxattr++; - } while (vxattr->name); + struct ceph_vxattr *vxattr = ceph_inode_vxattrs(inode); + + if (vxattr) { + while (vxattr->name) { + if (!strcmp(vxattr->name, name)) + return vxattr; + vxattr++; + } + } + return NULL; } static int __set_xattr(struct ceph_inode_info *ci, const char *name, int name_len, const char *val, int val_len, - int dirty, - int should_free_name, int should_free_val, + int flags, int update_xattr, struct ceph_inode_xattr **newxattr) { struct rb_node **p; @@ -165,12 +367,31 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr = NULL; } + if (update_xattr) { + int err = 0; + if (xattr && (flags & XATTR_CREATE)) + err = -EEXIST; + else if (!xattr && (flags & XATTR_REPLACE)) + err = -ENODATA; + if (err) { + kfree(name); + kfree(val); + return err; + } + if (update_xattr < 0) { + if (xattr) + __remove_xattr(ci, xattr); + kfree(name); + return 0; + } + } + if (!xattr) { new = 1; xattr = *newxattr; xattr->name = name; xattr->name_len = name_len; - xattr->should_free_name = should_free_name; + xattr->should_free_name = update_xattr; ci->i_xattrs.count++; dout("__set_xattr count=%d\n", ci->i_xattrs.count); @@ -180,7 +401,7 @@ static int __set_xattr(struct ceph_inode_info *ci, if (xattr->should_free_val) kfree((void *)xattr->val); - if (should_free_name) { + if (update_xattr) { kfree((void *)name); name = xattr->name; } @@ -195,8 +416,8 @@ static int __set_xattr(struct ceph_inode_info *ci, xattr->val = ""; xattr->val_len = val_len; - xattr->dirty = dirty; - xattr->should_free_val = (val && should_free_val); + xattr->dirty = update_xattr; + xattr->should_free_val = (val && update_xattr); if (new) { rb_link_node(&xattr->node, parent, p); @@ -216,6 +437,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, struct rb_node **p; struct rb_node *parent = NULL; struct ceph_inode_xattr *xattr = NULL; + int name_len = strlen(name); int c; p = &ci->i_xattrs.index.rb_node; @@ -223,6 +445,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci, parent = *p; xattr = rb_entry(parent, struct ceph_inode_xattr, node); c = strncmp(name, xattr->name, xattr->name_len); + if (c == 0 && name_len > xattr->name_len) + c = 1; if (c < 0) p = &(*p)->rb_left; else if (c > 0) @@ -255,7 +479,7 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr) { if (!xattr) - return -EOPNOTSUPP; + return -ENODATA; rb_erase(&xattr->node, &ci->i_xattrs.index); @@ -337,6 +561,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci) } static int __build_xattrs(struct inode *inode) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { u32 namelen; u32 numattr = 0; @@ -364,7 +590,7 @@ start: end = p + ci->i_xattrs.blob->vec.iov_len; ceph_decode_32_safe(&p, end, numattr, bad); xattr_version = ci->i_xattrs.version; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), GFP_NOFS); @@ -379,12 +605,13 @@ start: goto bad_lock; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.version != xattr_version) { /* lost a race, retry */ for (i = 0; i < numattr; i++) kfree(xattrs[i]); kfree(xattrs); + xattrs = NULL; goto start; } err = -EIO; @@ -398,7 +625,7 @@ start: p += len; err = __set_xattr(ci, name, namelen, val, len, - 0, 0, 0, &xattrs[numattr]); + 0, 0, &xattrs[numattr]); if (err < 0) goto bad; @@ -410,7 +637,7 @@ start: return err; bad_lock: - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); bad: if (xattrs) { for (i = 0; i < numattr; i++) @@ -483,27 +710,29 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = NULL; ci->i_xattrs.dirty = false; + ci->i_xattrs.version++; } } -ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, +ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int err; struct ceph_inode_xattr *xattr; - struct ceph_vxattr_cb *vxattr = NULL; + struct ceph_vxattr *vxattr = NULL; if (!ceph_is_valid_xattr(name)) return -ENODATA; /* let's see if a virtual xattr was requested */ - if (vxattrs) - vxattr = ceph_match_vxattr(vxattrs, name); + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { + err = vxattr->getxattr_cb(ci, value, size); + return err; + } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("getxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); @@ -511,19 +740,14 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto get_xattr; } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); /* get xattrs from mds (if we don't already have them) */ err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); if (err) return err; } - spin_lock(&inode->i_lock); - - if (vxattr && vxattr->readonly) { - err = vxattr->getxattr_cb(ci, value, size); - goto out; - } + spin_lock(&ci->i_ceph_lock); err = __build_xattrs(inode); if (err < 0) @@ -532,11 +756,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, get_xattr: err = -ENODATA; /* == ENOATTR */ xattr = __get_xattr(ci, name); - if (!xattr) { - if (vxattr) - err = vxattr->getxattr_cb(ci, value, size); + if (!xattr) goto out; - } err = -ERANGE; if (size && size < xattr->val_len) @@ -549,22 +770,31 @@ get_xattr: memcpy(value, xattr->val, xattr->val_len); out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return err; } +ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, value, size); + + return __ceph_getxattr(dentry->d_inode, name, value, size); +} + ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); u32 vir_namelen = 0; u32 namelen; int err; u32 len; int i; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); dout("listxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); @@ -572,57 +802,64 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto list_xattr; } else { - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); if (err) return err; } - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); err = __build_xattrs(inode); if (err < 0) goto out; list_xattr: - vir_namelen = 0; - /* include virtual dir xattrs */ - if (vxattrs) - for (i = 0; vxattrs[i].name; i++) - vir_namelen += strlen(vxattrs[i].name) + 1; + /* + * Start with virtual dir xattr names (if any) (including + * terminating '\0' characters for each). + */ + vir_namelen = ceph_vxattrs_name_size(vxattrs); + /* adding 1 byte per each variable due to the null termination */ - namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; + namelen = ci->i_xattrs.names_size + ci->i_xattrs.count; err = -ERANGE; - if (size && namelen > size) + if (size && vir_namelen + namelen > size) goto out; - err = namelen; + err = namelen + vir_namelen; if (size == 0) goto out; names = __copy_xattr_names(ci, names); /* virtual xattr names, too */ - if (vxattrs) + err = namelen; + if (vxattrs) { for (i = 0; vxattrs[i].name; i++) { - len = sprintf(names, "%s", vxattrs[i].name); - names += len + 1; + if (!vxattrs[i].hidden && + !(vxattrs[i].exists_cb && + !vxattrs[i].exists_cb(ci))) { + len = sprintf(names, "%s", vxattrs[i].name); + names += len + 1; + err += len + 1; + } } + } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); return err; } static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; int err; int i, nr_pages; struct page **pages = NULL; @@ -649,6 +886,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, dout("setxattr value=%.*s\n", (int)size, value); + if (!value) + flags |= CEPH_XATTR_REMOVE; + /* do request */ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, USE_AUTH_MDS); @@ -656,7 +896,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = PTR_ERR(req); goto out; } - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; req->r_args.setxattr.flags = cpu_to_le32(flags); @@ -667,7 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_data_len = size; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -680,56 +921,53 @@ out: return err; } -int ceph_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int __ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; + struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); + int issued; int err; + int dirty = 0; int name_len = strlen(name); int val_len = size; char *newname = NULL; char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; - int issued; int required_blob_size; - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; - if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - if (vxattrs) { - struct ceph_vxattr_cb *vxattr = - ceph_match_vxattr(vxattrs, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - } + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; + + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; /* preallocate memory for xattr name, value, index node */ err = -ENOMEM; - newname = kmalloc(name_len + 1, GFP_NOFS); + newname = kmemdup(name, name_len + 1, GFP_NOFS); if (!newname) goto out; - memcpy(newname, name, name_len + 1); if (val_len) { - newval = kmalloc(val_len + 1, GFP_NOFS); + newval = kmemdup(value, val_len, GFP_NOFS); if (!newval) goto out; - memcpy(newval, value, val_len); - newval[val_len] = '\0'; } xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS); if (!xattr) goto out; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; __build_xattrs(inode); @@ -738,32 +976,37 @@ retry: if (!ci->i_xattrs.prealloc_blob || required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { - struct ceph_buffer *blob = NULL; + struct ceph_buffer *blob; - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) goto out; - spin_lock(&inode->i_lock); + spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); ci->i_xattrs.prealloc_blob = blob; goto retry; } - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); - err = __set_xattr(ci, newname, name_len, newval, - val_len, 1, 1, 1, &xattr); - __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); - ci->i_xattrs.dirty = true; - inode->i_ctime = CURRENT_TIME; - spin_unlock(&inode->i_lock); + err = __set_xattr(ci, newname, name_len, newval, val_len, + flags, value ? 1 : -1, &xattr); + + if (!err) { + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + ci->i_xattrs.dirty = true; + inode->i_ctime = CURRENT_TIME; + } + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); return err; do_sync: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: err = ceph_sync_setxattr(dentry, name, value, size, flags); out: kfree(newname); @@ -772,12 +1015,23 @@ out: return err; } +int ceph_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + return __ceph_setxattr(dentry, name, value, size, flags); +} + static int ceph_send_removexattr(struct dentry *dentry, const char *name) { - struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); - struct ceph_mds_client *mdsc = &client->mdsc; + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; - struct inode *parent_inode = dentry->d_parent->d_inode; struct ceph_mds_request *req; int err; @@ -785,56 +1039,90 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); - err = ceph_mdsc_do_request(mdsc, parent_inode, req); + err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; } -int ceph_removexattr(struct dentry *dentry, const char *name) +int __ceph_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; + struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode); int issued; int err; - - if (ceph_snap(inode) != CEPH_NOSNAP) - return -EROFS; + int required_blob_size; + int dirty; if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - if (vxattrs) { - struct ceph_vxattr_cb *vxattr = - ceph_match_vxattr(vxattrs, name); - if (vxattr && vxattr->readonly) - return -EOPNOTSUPP; - } + vxattr = ceph_match_vxattr(inode, name); + if (vxattr && vxattr->readonly) + return -EOPNOTSUPP; - spin_lock(&inode->i_lock); - __build_xattrs(inode); + /* pass any unhandled ceph.* xattrs through to the MDS */ + if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto do_sync_unlocked; + + err = -ENOMEM; + spin_lock(&ci->i_ceph_lock); +retry: issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); if (!(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + __build_xattrs(inode); + + required_blob_size = __get_required_blob_size(ci, 0, 0); + + if (!ci->i_xattrs.prealloc_blob || + required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) { + struct ceph_buffer *blob; + + spin_unlock(&ci->i_ceph_lock); + dout(" preaallocating new blob size=%d\n", required_blob_size); + blob = ceph_buffer_new(required_blob_size, GFP_NOFS); + if (!blob) + goto out; + spin_lock(&ci->i_ceph_lock); + if (ci->i_xattrs.prealloc_blob) + ceph_buffer_put(ci->i_xattrs.prealloc_blob); + ci->i_xattrs.prealloc_blob = blob; + goto retry; + } err = __remove_xattr_by_name(ceph_inode(inode), name); - __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; - - spin_unlock(&inode->i_lock); - + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); return err; do_sync: - spin_unlock(&inode->i_lock); + spin_unlock(&ci->i_ceph_lock); +do_sync_unlocked: err = ceph_send_removexattr(dentry, name); +out: return err; } +int ceph_removexattr(struct dentry *dentry, const char *name) +{ + if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + return -EROFS; + + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + return __ceph_removexattr(dentry, name); +} |
