diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-03-19 09:43:06 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-03-19 09:43:06 -0700 |
commit | fc7f99cf36ebae853639dabb43bc2f0098c59aef (patch) | |
tree | 3ca7050397f515f91ef98f8b6293f9f7fd84ef02 /fs | |
parent | 0a492fdef8aa241f6139e6455e852cc710ae8ed1 (diff) | |
parent | f1a3d57213fe264b4cf584e78bac36aaf9998729 (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (205 commits)
ceph: update for write_inode API change
ceph: reset osd after relevant messages timed out
ceph: fix flush_dirty_caps race with caps migration
ceph: include migrating caps in issued set
ceph: fix osdmap decoding when pools include (removed) snaps
ceph: return EBADF if waiting for caps on closed file
ceph: set osd request message front length correctly
ceph: reset front len on return to msgpool; BUG on mismatched front iov
ceph: fix snaptrace decoding on cap migration between mds
ceph: use single osd op reply msg
ceph: reset bits on connection close
ceph: remove bogus mds forward warning
ceph: remove fragile __map_osds optimization
ceph: fix connection fault STANDBY check
ceph: invalidate_authorizer without con->mutex held
ceph: don't clobber write return value when using O_SYNC
ceph: fix client_request_forward decoding
ceph: drop messages on unregistered mds sessions; cleanup
ceph: fix comments, locking in destroy_inode
ceph: move dereference after NULL test
...
Fix trivial conflicts in Documentation/ioctl/ioctl-number.txt
Diffstat (limited to 'fs')
64 files changed, 27917 insertions, 0 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 7405f071be6..5f85b594761 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -235,6 +235,7 @@ config NFS_COMMON source "net/sunrpc/Kconfig" source "fs/smbfs/Kconfig" +source "fs/ceph/Kconfig" source "fs/cifs/Kconfig" source "fs/ncpfs/Kconfig" source "fs/coda/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index c3633aa4691..97f340f14ba 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_EXOFS_FS) += exofs/ +obj-$(CONFIG_CEPH_FS) += ceph/ diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig new file mode 100644 index 00000000000..04b8280582a --- /dev/null +++ b/fs/ceph/Kconfig @@ -0,0 +1,27 @@ +config CEPH_FS + tristate "Ceph distributed file system (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + select LIBCRC32C + select CONFIG_CRYPTO_AES + help + Choose Y or M here to include support for mounting the + experimental Ceph distributed file system. Ceph is an extremely + scalable file system designed to provide high performance, + reliable access to petabytes of storage. + + More information at http://ceph.newdream.net/. + + If unsure, say N. + +config CEPH_FS_PRETTYDEBUG + bool "Include file:line in ceph debug output" + depends on CEPH_FS + default n + help + If you say Y here, debug output will include a filename and + line to aid debugging. This increases kernel size and slows + execution slightly when debug call sites are enabled (e.g., + via CONFIG_DYNAMIC_DEBUG). + + If unsure, say N. + diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile new file mode 100644 index 00000000000..6a660e610be --- /dev/null +++ b/fs/ceph/Makefile @@ -0,0 +1,39 @@ +# +# Makefile for CEPH filesystem. 
+# + +ifneq ($(KERNELRELEASE),) + +obj-$(CONFIG_CEPH_FS) += ceph.o + +ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ + export.o caps.o snap.o xattr.o \ + messenger.o msgpool.o buffer.o pagelist.o \ + mds_client.o mdsmap.o \ + mon_client.o \ + osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ + debugfs.o \ + auth.o auth_none.o \ + crypto.o armor.o \ + auth_x.o \ + ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o + +else +#Otherwise we were called directly from the command +# line; invoke the kernel build system. + +KERNELDIR ?= /lib/modules/$(shell uname -r)/build +PWD := $(shell pwd) + +default: all + +all: + $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules + +modules_install: + $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install + +clean: + $(MAKE) -C $(KERNELDIR) M=$(PWD) clean + +endif diff --git a/fs/ceph/README b/fs/ceph/README new file mode 100644 index 00000000000..18352fab37c --- /dev/null +++ b/fs/ceph/README @@ -0,0 +1,20 @@ +# +# The following files are shared by (and manually synchronized +# between) the Ceph userland and kernel client. 
+# +# userland kernel +src/include/ceph_fs.h fs/ceph/ceph_fs.h +src/include/ceph_fs.cc fs/ceph/ceph_fs.c +src/include/msgr.h fs/ceph/msgr.h +src/include/rados.h fs/ceph/rados.h +src/include/ceph_strings.cc fs/ceph/ceph_strings.c +src/include/ceph_frag.h fs/ceph/ceph_frag.h +src/include/ceph_frag.cc fs/ceph/ceph_frag.c +src/include/ceph_hash.h fs/ceph/ceph_hash.h +src/include/ceph_hash.cc fs/ceph/ceph_hash.c +src/crush/crush.c fs/ceph/crush/crush.c +src/crush/crush.h fs/ceph/crush/crush.h +src/crush/mapper.c fs/ceph/crush/mapper.c +src/crush/mapper.h fs/ceph/crush/mapper.h +src/crush/hash.h fs/ceph/crush/hash.h +src/crush/hash.c fs/ceph/crush/hash.c diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c new file mode 100644 index 00000000000..23bb0ceabe3 --- /dev/null +++ b/fs/ceph/addr.c @@ -0,0 +1,1188 @@ +#include "ceph_debug.h" + +#include <linux/backing-dev.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> /* generic_writepages */ +#include <linux/pagevec.h> +#include <linux/task_io_accounting_ops.h> + +#include "super.h" +#include "osd_client.h" + +/* + * Ceph address space ops. + * + * There are a few funny things going on here. + * + * The page->private field is used to reference a struct + * ceph_snap_context for _every_ dirty page. This indicates which + * snapshot the page was logically dirtied in, and thus which snap + * context needs to be associated with the osd write during writeback. + * + * Similarly, struct ceph_inode_info maintains a set of counters to + * count dirty pages on the inode. In the absence of snapshots, + * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. + * + * When a snapshot is taken (that is, when the client receives + * notification that a snapshot was taken), each inode with caps and + * with dirty pages (dirty pages implies there is a cap) gets a new + * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending + * order, new snaps go to the tail). 
The i_wrbuffer_ref_head count is + * moved to capsnap->dirty. (Unless a sync write is currently in + * progress. In that case, the capsnap is said to be "pending", new + * writes cannot start, and the capsnap isn't "finalized" until the + * write completes (or fails) and a final size/mtime for the inode for + * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. + * + * On writeback, we must submit writes to the osd IN SNAP ORDER. So, + * we look for the first capsnap in i_cap_snaps and write out pages in + * that snap context _only_. Then we move on to the next capsnap, + * eventually reaching the "live" or "head" context (i.e., pages that + * are not yet snapped) and are writing the most recently dirtied + * pages. + * + * Invalidate and so forth must take care to ensure the dirty page + * accounting is preserved. + */ + +#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) +#define CONGESTION_OFF_THRESH(congestion_kb) \ + (CONGESTION_ON_THRESH(congestion_kb) - \ + (CONGESTION_ON_THRESH(congestion_kb) >> 2)) + + + +/* + * Dirty a page. Optimistically adjust accounting, on the assumption + * that we won't race with invalidate. If we do, readjust. + */ +static int ceph_set_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct ceph_inode_info *ci; + int undo = 0; + struct ceph_snap_context *snapc; + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + if (TestSetPageDirty(page)) { + dout("%p set_page_dirty %p idx %lu -- already dirty\n", + mapping->host, page, page->index); + return 0; + } + + inode = mapping->host; + ci = ceph_inode(inode); + + /* + * Note that we're grabbing a snapc ref here without holding + * any locks! 
+ */ + snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); + + /* dirty the head */ + spin_lock(&inode->i_lock); + if (ci->i_wrbuffer_ref_head == 0) + ci->i_head_snapc = ceph_get_snap_context(snapc); + ++ci->i_wrbuffer_ref_head; + if (ci->i_wrbuffer_ref == 0) + igrab(inode); + ++ci->i_wrbuffer_ref; + dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " + "snapc %p seq %lld (%d snaps)\n", + mapping->host, page, page->index, + ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, + ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, + snapc, snapc->seq, snapc->num_snaps); + spin_unlock(&inode->i_lock); + + /* now adjust page */ + spin_lock_irq(&mapping->tree_lock); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(!PageUptodate(page)); + + if (mapping_cap_account_dirty(mapping)) { + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); + task_io_account_write(PAGE_CACHE_SIZE); + } + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + + /* + * Reference snap context in page->private. Also set + * PagePrivate so that we get invalidatepage callback. + */ + page->private = (unsigned long)snapc; + SetPagePrivate(page); + } else { + dout("ANON set_page_dirty %p (raced truncate?)\n", page); + undo = 1; + } + + spin_unlock_irq(&mapping->tree_lock); + + if (undo) + /* whoops, we failed to dirty the page */ + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + BUG_ON(!PageDirty(page)); + return 1; +} + +/* + * If we are truncating the full page (i.e. offset == 0), adjust the + * dirty page counters appropriately. Only called if there is private + * data on the page. 
+ */ +static void ceph_invalidatepage(struct page *page, unsigned long offset) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_snap_context *snapc = (void *)page->private; + + BUG_ON(!PageLocked(page)); + BUG_ON(!page->private); + BUG_ON(!PagePrivate(page)); + BUG_ON(!page->mapping); + + inode = page->mapping->host; + + /* + * We can get non-dirty pages here due to races between + * set_page_dirty and truncate_complete_page; just spit out a + * warning, in case we end up with accounting problems later. + */ + if (!PageDirty(page)) + pr_err("%p invalidatepage %p page not dirty\n", inode, page); + + if (offset == 0) + ClearPageChecked(page); + + ci = ceph_inode(inode); + if (offset == 0) { + dout("%p invalidatepage %p idx %lu full dirty page %lu\n", + inode, page, page->index, offset); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); + page->private = 0; + ClearPagePrivate(page); + } else { + dout("%p invalidatepage %p idx %lu partial dirty page\n", + inode, page, page->index); + } +} + +/* just a sanity check */ +static int ceph_releasepage(struct page *page, gfp_t g) +{ + struct inode *inode = page->mapping ? page->mapping->host : NULL; + dout("%p releasepage %p idx %lu\n", inode, page, page->index); + WARN_ON(PageDirty(page)); + WARN_ON(page->private); + WARN_ON(PagePrivate(page)); + return 0; +} + +/* + * read a single page, without unlocking it. 
+ */ +static int readpage_nounlock(struct file *filp, struct page *page) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + int err = 0; + u64 len = PAGE_CACHE_SIZE; + + dout("readpage inode %p file %p page %p index %lu\n", + inode, filp, page, page->index); + err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, + page->index << PAGE_CACHE_SHIFT, &len, + ci->i_truncate_seq, ci->i_truncate_size, + &page, 1); + if (err == -ENOENT) + err = 0; + if (err < 0) { + SetPageError(page); + goto out; + } else if (err < PAGE_CACHE_SIZE) { + /* zero fill remainder of page */ + zero_user_segment(page, err, PAGE_CACHE_SIZE); + } + SetPageUptodate(page); + +out: + return err < 0 ? err : 0; +} + +static int ceph_readpage(struct file *filp, struct page *page) +{ + int r = readpage_nounlock(filp, page); + unlock_page(page); + return r; +} + +/* + * Build a vector of contiguous pages from the provided page list. + */ +static struct page **page_vector_from_list(struct list_head *page_list, + unsigned *nr_pages) +{ + struct page **pages; + struct page *page; + int next_index, contig_pages = 0; + + /* build page vector */ + pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + BUG_ON(list_empty(page_list)); + next_index = list_entry(page_list->prev, struct page, lru)->index; + list_for_each_entry_reverse(page, page_list, lru) { + if (page->index == next_index) { + dout("readpages page %d %p\n", contig_pages, page); + pages[contig_pages] = page; + contig_pages++; + next_index++; + } else { + break; + } + } + *nr_pages = contig_pages; + return pages; +} + +/* + * Read multiple pages. Leave pages we don't read + unlock in page_list; + * the caller (VM) cleans them up. 
+ */ +static int ceph_readpages(struct file *file, struct address_space *mapping, + struct list_head *page_list, unsigned nr_pages) +{ + struct inode *inode = file->f_dentry->d_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; + int rc = 0; + struct page **pages; + struct pagevec pvec; + loff_t offset; + u64 len; + + dout("readpages %p file %p nr_pages %d\n", + inode, file, nr_pages); + + pages = page_vector_from_list(page_list, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + /* guess read extent */ + offset = pages[0]->index << PAGE_CACHE_SHIFT; + len = nr_pages << PAGE_CACHE_SHIFT; + rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, + offset, &len, + ci->i_truncate_seq, ci->i_truncate_size, + pages, nr_pages); + if (rc == -ENOENT) + rc = 0; + if (rc < 0) + goto out; + + /* set uptodate and add to lru in pagevec-sized chunks */ + pagevec_init(&pvec, 0); + for (; !list_empty(page_list) && len > 0; + rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { + struct page *page = + list_entry(page_list->prev, struct page, lru); + + list_del(&page->lru); + + if (rc < (int)PAGE_CACHE_SIZE) { + /* zero (remainder of) page */ + int s = rc < 0 ? 0 : rc; + zero_user_segment(page, s, PAGE_CACHE_SIZE); + } + + if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { + page_cache_release(page); + dout("readpages %p add_to_page_cache failed %p\n", + inode, page); + continue; + } + dout("readpages %p adding %p idx %lu\n", inode, page, + page->index); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + if (pagevec_add(&pvec, page) == 0) + pagevec_lru_add_file(&pvec); /* add to lru */ + } + pagevec_lru_add_file(&pvec); + rc = 0; + +out: + kfree(pages); + return rc; +} + +/* + * Get ref for the oldest snapc for an inode with dirty data... that is, the + * only snap context we are allowed to write back. + * + * Caller holds i_lock. 
+ */ +static struct ceph_snap_context *__get_oldest_context(struct inode *inode, + u64 *snap_size) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = NULL; + struct ceph_cap_snap *capsnap = NULL; + + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, + capsnap->context, capsnap->dirty_pages); + if (capsnap->dirty_pages) { + snapc = ceph_get_snap_context(capsnap->context); + if (snap_size) + *snap_size = capsnap->size; + break; + } + } + if (!snapc && ci->i_snap_realm) { + snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); + dout(" head snapc %p has %d dirty pages\n", + snapc, ci->i_wrbuffer_ref_head); + } + return snapc; +} + +static struct ceph_snap_context *get_oldest_context(struct inode *inode, + u64 *snap_size) +{ + struct ceph_snap_context *snapc = NULL; + + spin_lock(&inode->i_lock); + snapc = __get_oldest_context(inode, snap_size); + spin_unlock(&inode->i_lock); + return snapc; +} + +/* + * Write a single page, but leave the page locked. + * + * If we get a write error, set the page error bit, but still adjust the + * dirty page accounting (i.e., page is no longer dirty). 
+ */ +static int writepage_nounlock(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode; + struct ceph_inode_info *ci; + struct ceph_client *client; + struct ceph_osd_client *osdc; + loff_t page_off = page->index << PAGE_CACHE_SHIFT; + int len = PAGE_CACHE_SIZE; + loff_t i_size; + int err = 0; + struct ceph_snap_context *snapc; + u64 snap_size = 0; + long writeback_stat; + + dout("writepage %p idx %lu\n", page, page->index); + + if (!page->mapping || !page->mapping->host) { + dout("writepage %p - no mapping\n", page); + return -EFAULT; + } + inode = page->mapping->host; + ci = ceph_inode(inode); + client = ceph_inode_to_client(inode); + osdc = &client->osdc; + + /* verify this is a writeable snap context */ + snapc = (void *)page->private; + if (snapc == NULL) { + dout("writepage %p page %p not dirty?\n", inode, page); + goto out; + } + if (snapc != get_oldest_context(inode, &snap_size)) { + dout("writepage %p page %p snapc %p not writeable - noop\n", + inode, page, (void *)page->private); + /* we should only noop if called by kswapd */ + WARN_ON((current->flags & PF_MEMALLOC) == 0); + goto out; + } + + /* is this a partial page at end of file? 
*/ + if (snap_size) + i_size = snap_size; + else + i_size = i_size_read(inode); + if (i_size < page_off + len) + len = i_size - page_off; + + dout("writepage %p page %p index %lu on %llu~%u\n", + inode, page, page->index, page_off, len); + + writeback_stat = atomic_long_inc_return(&client->writeback_count); + if (writeback_stat > + CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) + set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); + + set_page_writeback(page); + err = ceph_osdc_writepages(osdc, ceph_vino(inode), + &ci->i_layout, snapc, + page_off, len, + ci->i_truncate_seq, ci->i_truncate_size, + &inode->i_mtime, + &page, 1, 0, 0, true); + if (err < 0) { + dout("writepage setting page/mapping error %d %p\n", err, page); + SetPageError(page); + mapping_set_error(&inode->i_data, err); + if (wbc) + wbc->pages_skipped++; + } else { + dout("writepage cleaned page %p\n", page); + err = 0; /* vfs expects us to return 0 */ + } + page->private = 0; + ClearPagePrivate(page); + end_page_writeback(page); + ceph_put_wrbuffer_cap_refs(ci, 1, snapc); + ceph_put_snap_context(snapc); +out: + return err; +} + +static int ceph_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + struct inode *inode = page->mapping->host; + BUG_ON(!inode); + igrab(inode); + err = writepage_ |