aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-03-19 09:43:06 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2010-03-19 09:43:06 -0700
commitfc7f99cf36ebae853639dabb43bc2f0098c59aef (patch)
tree3ca7050397f515f91ef98f8b6293f9f7fd84ef02
parent0a492fdef8aa241f6139e6455e852cc710ae8ed1 (diff)
parentf1a3d57213fe264b4cf584e78bac36aaf9998729 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (205 commits) ceph: update for write_inode API change ceph: reset osd after relevant messages timed out ceph: fix flush_dirty_caps race with caps migration ceph: include migrating caps in issued set ceph: fix osdmap decoding when pools include (removed) snaps ceph: return EBADF if waiting for caps on closed file ceph: set osd request message front length correctly ceph: reset front len on return to msgpool; BUG on mismatched front iov ceph: fix snaptrace decoding on cap migration between mds ceph: use single osd op reply msg ceph: reset bits on connection close ceph: remove bogus mds forward warning ceph: remove fragile __map_osds optimization ceph: fix connection fault STANDBY check ceph: invalidate_authorizer without con->mutex held ceph: don't clobber write return value when using O_SYNC ceph: fix client_request_forward decoding ceph: drop messages on unregistered mds sessions; cleanup ceph: fix comments, locking in destroy_inode ceph: move dereference after NULL test ... Fix trivial conflicts in Documentation/ioctl/ioctl-number.txt
-rw-r--r--Documentation/filesystems/ceph.txt139
-rw-r--r--Documentation/ioctl/ioctl-number.txt1
-rw-r--r--MAINTAINERS9
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/ceph/Kconfig27
-rw-r--r--fs/ceph/Makefile39
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c1188
-rw-r--r--fs/ceph/armor.c99
-rw-r--r--fs/ceph/auth.c257
-rw-r--r--fs/ceph/auth.h84
-rw-r--r--fs/ceph/auth_none.c121
-rw-r--r--fs/ceph/auth_none.h28
-rw-r--r--fs/ceph/auth_x.c656
-rw-r--r--fs/ceph/auth_x.h49
-rw-r--r--fs/ceph/auth_x_protocol.h90
-rw-r--r--fs/ceph/buffer.c78
-rw-r--r--fs/ceph/buffer.h39
-rw-r--r--fs/ceph/caps.c2927
-rw-r--r--fs/ceph/ceph_debug.h37
-rw-r--r--fs/ceph/ceph_frag.c21
-rw-r--r--fs/ceph/ceph_frag.h109
-rw-r--r--fs/ceph/ceph_fs.c74
-rw-r--r--fs/ceph/ceph_fs.h650
-rw-r--r--fs/ceph/ceph_hash.c118
-rw-r--r--fs/ceph/ceph_hash.h13
-rw-r--r--fs/ceph/ceph_strings.c176
-rw-r--r--fs/ceph/crush/crush.c151
-rw-r--r--fs/ceph/crush/crush.h180
-rw-r--r--fs/ceph/crush/hash.c149
-rw-r--r--fs/ceph/crush/hash.h17
-rw-r--r--fs/ceph/crush/mapper.c596
-rw-r--r--fs/ceph/crush/mapper.h20
-rw-r--r--fs/ceph/crypto.c408
-rw-r--r--fs/ceph/crypto.h48
-rw-r--r--fs/ceph/debugfs.c483
-rw-r--r--fs/ceph/decode.h194
-rw-r--r--fs/ceph/dir.c1220
-rw-r--r--fs/ceph/export.c223
-rw-r--r--fs/ceph/file.c937
-rw-r--r--fs/ceph/inode.c1750
-rw-r--r--fs/ceph/ioctl.c160
-rw-r--r--fs/ceph/ioctl.h40
-rw-r--r--fs/ceph/mds_client.c3021
-rw-r--r--fs/ceph/mds_client.h335
-rw-r--r--fs/ceph/mdsmap.c174
-rw-r--r--fs/ceph/mdsmap.h54
-rw-r--r--fs/ceph/messenger.c2240
-rw-r--r--fs/ceph/messenger.h254
-rw-r--r--fs/ceph/mon_client.c834
-rw-r--r--fs/ceph/mon_client.h119
-rw-r--r--fs/ceph/msgpool.c186
-rw-r--r--fs/ceph/msgpool.h27
-rw-r--r--fs/ceph/msgr.h158
-rw-r--r--fs/ceph/osd_client.c1537
-rw-r--r--fs/ceph/osd_client.h166
-rw-r--r--fs/ceph/osdmap.c1019
-rw-r--r--fs/ceph/osdmap.h125
-rw-r--r--fs/ceph/pagelist.c54
-rw-r--r--fs/ceph/pagelist.h54
-rw-r--r--fs/ceph/rados.h374
-rw-r--r--fs/ceph/snap.c904
-rw-r--r--fs/ceph/super.c1030
-rw-r--r--fs/ceph/super.h901
-rw-r--r--fs/ceph/types.h29
-rw-r--r--fs/ceph/xattr.c844
67 files changed, 28066 insertions, 0 deletions
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
new file mode 100644
index 00000000000..6e03917316b
--- /dev/null
+++ b/Documentation/filesystems/ceph.txt
@@ -0,0 +1,139 @@
+Ceph Distributed File System
+============================
+
+Ceph is a distributed network file system designed to provide good
+performance, reliability, and scalability.
+
+Basic features include:
+
+ * POSIX semantics
+ * Seamless scaling from 1 to many thousands of nodes
+ * High availability and reliability. No single points of failure.
+ * N-way replication of data across storage nodes
+ * Fast recovery from node failures
+ * Automatic rebalancing of data on node addition/removal
+ * Easy deployment: most FS components are userspace daemons
+
+Also,
+ * Flexible snapshots (on any directory)
+ * Recursive accounting (nested files, directories, bytes)
+
+In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
+on symmetric access by all clients to shared block devices, Ceph
+separates data and metadata management into independent server
+clusters, similar to Lustre. Unlike Lustre, however, metadata and
+storage nodes run entirely as user space daemons. Storage nodes
+utilize btrfs to store data objects, leveraging its advanced features
+(checksumming, metadata replication, etc.). File data is striped
+across storage nodes in large chunks to distribute workload and
+facilitate high throughputs. When storage nodes fail, data is
+re-replicated in a distributed fashion by the storage nodes themselves
+(with some minimal coordination from a cluster monitor), making the
+system extremely efficient and scalable.
+
+Metadata servers effectively form a large, consistent, distributed
+in-memory cache above the file namespace that is extremely scalable,
+dynamically redistributes metadata in response to workload changes,
+and can tolerate arbitrary (well, non-Byzantine) node failures. The
+metadata server takes a somewhat unconventional approach to metadata
+storage to significantly improve performance for common workloads. In
+particular, inodes with only a single link are embedded in
+directories, allowing entire directories of dentries and inodes to be
+loaded into its cache with a single I/O operation. The contents of
+extremely large directories can be fragmented and managed by
+independent metadata servers, allowing scalable concurrent access.
+
+The system offers automatic data rebalancing/migration when scaling
+from a small cluster of just a few nodes to many hundreds, without
+requiring an administrator carve the data set into static volumes or
+go through the tedious process of migrating data between servers.
+When the file system approaches full, new nodes can be easily added
+and things will "just work."
+
+Ceph includes flexible snapshot mechanism that allows a user to create
+a snapshot on any subdirectory (and its nested contents) in the
+system. Snapshot creation and deletion are as simple as 'mkdir
+.snap/foo' and 'rmdir .snap/foo'.
+
+Ceph also provides some recursive accounting on directories for nested
+files and bytes. That is, a 'getfattr -d foo' on any directory in the
+system will reveal the total number of nested regular files and
+subdirectories, and a summation of all nested file sizes. This makes
+the identification of large disk space consumers relatively quick, as
+no 'du' or similar recursive scan of the file system is required.
+
+
+Mount Syntax
+============
+
+The basic mount syntax is:
+
+ # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
+
+You only need to specify a single monitor, as the client will get the
+full list when it connects. (However, if the monitor you specify
+happens to be down, the mount won't succeed.) The port can be left
+off if the monitor is using the default. So if the monitor is at
+1.2.3.4,
+
+ # mount -t ceph 1.2.3.4:/ /mnt/ceph
+
+is sufficient. If /sbin/mount.ceph is installed, a hostname can be
+used instead of an IP address.
+
+
+
+Mount Options
+=============
+
+ ip=A.B.C.D[:N]
+ Specify the IP and/or port the client should bind to locally.
+ There is normally not much reason to do this. If the IP is not
+ specified, the client's IP address is determined by looking at the
+ address it's connection to the monitor originates from.
+
+ wsize=X
+ Specify the maximum write size in bytes. By default there is no
+ maximu. Ceph will normally size writes based on the file stripe
+ size.
+
+ rsize=X
+ Specify the maximum readahead.
+
+ mount_timeout=X
+ Specify the timeout value for mount (in seconds), in the case
+ of a non-responsive Ceph file system. The default is 30
+ seconds.
+
+ rbytes
+ When stat() is called on a directory, set st_size to 'rbytes',
+ the summation of file sizes over all files nested beneath that
+ directory. This is the default.
+
+ norbytes
+ When stat() is called on a directory, set st_size to the
+ number of entries in that directory.
+
+ nocrc
+ Disable CRC32C calculation for data writes. If set, the OSD
+ must rely on TCP's error correction to detect data corruption
+ in the data payload.
+
+ noasyncreaddir
+ Disable client's use its local cache to satisfy readdir
+ requests. (This does not change correctness; the client uses
+ cached metadata only when a lease or capability ensures it is
+ valid.)
+
+
+More Information
+================
+
+For more information on Ceph, see the home page at
+ http://ceph.newdream.net/
+
+The Linux kernel client source tree is available at
+ git://ceph.newdream.net/linux-ceph-client.git
+
+and the source for the full system is at
+ git://ceph.newdream.net/ceph.git
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 35c9b51d20e..dd5806f4fcc 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -291,6 +291,7 @@ Code Seq#(hex) Include File Comments
0x92 00-0F drivers/usb/mon/mon_bin.c
0x93 60-7F linux/auto_fs.h
0x94 all fs/btrfs/ioctl.h
+0x97 00-7F fs/ceph/ioctl.h Ceph file system
0x99 00-0F 537-Addinboard driver
<mailto:buk@buks.ipn.de>
0xA0 all linux/sdp/sdp.h Industrial Device Project
diff --git a/MAINTAINERS b/MAINTAINERS
index 382eaa4d006..449d4440208 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1441,6 +1441,15 @@ F: arch/powerpc/include/asm/spu*.h
F: arch/powerpc/oprofile/*cell*
F: arch/powerpc/platforms/cell/
+CEPH DISTRIBUTED FILE SYSTEM CLIENT
+M: Sage Weil <sage@newdream.net>
+L: ceph-devel@lists.sourceforge.net
+W: http://ceph.newdream.net/
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+S: Supported
+F: Documentation/filesystems/ceph.txt
+F: fs/ceph
+
CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
M: David Vrabel <david.vrabel@csr.com>
L: linux-usb@vger.kernel.org
diff --git a/fs/Kconfig b/fs/Kconfig
index 7405f071be6..5f85b594761 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config NFS_COMMON
source "net/sunrpc/Kconfig"
source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
source "fs/cifs/Kconfig"
source "fs/ncpfs/Kconfig"
source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c3633aa4691..97f340f14ba 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_EXOFS_FS) += exofs/
+obj-$(CONFIG_CEPH_FS) += ceph/
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 00000000000..04b8280582a
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
+config CEPH_FS
+ tristate "Ceph distributed file system (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
+ select LIBCRC32C
+ select CONFIG_CRYPTO_AES
+ help
+ Choose Y or M here to include support for mounting the
+ experimental Ceph distributed file system. Ceph is an extremely
+ scalable file system designed to provide high performance,
+ reliable access to petabytes of storage.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
+config CEPH_FS_PRETTYDEBUG
+ bool "Include file:line in ceph debug output"
+ depends on CEPH_FS
+ default n
+ help
+ If you say Y here, debug output will include a filename and
+ line to aid debugging. This icnreases kernel size and slows
+ execution slightly when debug call sites are enabled (e.g.,
+ via CONFIG_DYNAMIC_DEBUG).
+
+ If unsure, say N.
+
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 00000000000..6a660e610be
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_FS) += ceph.o
+
+ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
+ export.o caps.o snap.o xattr.o \
+ messenger.o msgpool.o buffer.o pagelist.o \
+ mds_client.o mdsmap.o \
+ mon_client.o \
+ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+ debugfs.o \
+ auth.o auth_none.o \
+ crypto.o armor.o \
+ auth_x.o \
+ ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
+
+modules_install:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
+
+clean:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 00000000000..18352fab37c
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
+#
+# The following files are shared by (and manually synchronized
+# between) the Ceph userland and kernel client.
+#
+# userland kernel
+src/include/ceph_fs.h fs/ceph/ceph_fs.h
+src/include/ceph_fs.cc fs/ceph/ceph_fs.c
+src/include/msgr.h fs/ceph/msgr.h
+src/include/rados.h fs/ceph/rados.h
+src/include/ceph_strings.cc fs/ceph/ceph_strings.c
+src/include/ceph_frag.h fs/ceph/ceph_frag.h
+src/include/ceph_frag.cc fs/ceph/ceph_frag.c
+src/include/ceph_hash.h fs/ceph/ceph_hash.h
+src/include/ceph_hash.cc fs/ceph/ceph_hash.c
+src/crush/crush.c fs/ceph/crush/crush.c
+src/crush/crush.h fs/ceph/crush/crush.h
+src/crush/mapper.c fs/ceph/crush/mapper.c
+src/crush/mapper.h fs/ceph/crush/mapper.h
+src/crush/hash.h fs/ceph/crush/hash.h
+src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 00000000000..23bb0ceabe3
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1188 @@
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h> /* generic_writepages */
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "super.h"
+#include "osd_client.h"
+
+/*
+ * Ceph address space ops.
+ *
+ * There are a few funny things going on here.
+ *
+ * The page->private field is used to reference a struct
+ * ceph_snap_context for _every_ dirty page. This indicates which
+ * snapshot the page was logically dirtied in, and thus which snap
+ * context needs to be associated with the osd write during writeback.
+ *
+ * Similarly, struct ceph_inode_info maintains a set of counters to
+ * count dirty pages on the inode. In the absense of snapshots,
+ * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
+ *
+ * When a snapshot is taken (that is, when the client receives
+ * notification that a snapshot was taken), each inode with caps and
+ * with dirty pages (dirty pages implies there is a cap) gets a new
+ * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
+ * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
+ * moved to capsnap->dirty. (Unless a sync write is currently in
+ * progress. In that case, the capsnap is said to be "pending", new
+ * writes cannot start, and the capsnap isn't "finalized" until the
+ * write completes (or fails) and a final size/mtime for the inode for
+ * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
+ *
+ * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
+ * we look for the first capsnap in i_cap_snaps and write out pages in
+ * that snap context _only_. Then we move on to the next capsnap,
+ * eventually reaching the "live" or "head" context (i.e., pages that
+ * are not yet snapped) and are writing the most recently dirtied
+ * pages.
+ *
+ * Invalidate and so forth must take care to ensure the dirty page
+ * accounting is preserved.
+ */
+
+#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb) \
+ (CONGESTION_ON_THRESH(congestion_kb) - \
+ (CONGESTION_ON_THRESH(congestion_kb) >> 2))
+
+
+
+/*
+ * Dirty a page. Optimistically adjust accounting, on the assumption
+ * that we won't race with invalidate. If we do, readjust.
+ */
+static int ceph_set_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ int undo = 0;
+ struct ceph_snap_context *snapc;
+
+ if (unlikely(!mapping))
+ return !TestSetPageDirty(page);
+
+ if (TestSetPageDirty(page)) {
+ dout("%p set_page_dirty %p idx %lu -- already dirty\n",
+ mapping->host, page, page->index);
+ return 0;
+ }
+
+ inode = mapping->host;
+ ci = ceph_inode(inode);
+
+ /*
+ * Note that we're grabbing a snapc ref here without holding
+ * any locks!
+ */
+ snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+
+ /* dirty the head */
+ spin_lock(&inode->i_lock);
+ if (ci->i_wrbuffer_ref_head == 0)
+ ci->i_head_snapc = ceph_get_snap_context(snapc);
+ ++ci->i_wrbuffer_ref_head;
+ if (ci->i_wrbuffer_ref == 0)
+ igrab(inode);
+ ++ci->i_wrbuffer_ref;
+ dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+ "snapc %p seq %lld (%d snaps)\n",
+ mapping->host, page, page->index,
+ ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ snapc, snapc->seq, snapc->num_snaps);
+ spin_unlock(&inode->i_lock);
+
+ /* now adjust page */
+ spin_lock_irq(&mapping->tree_lock);
+ if (page->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(!PageUptodate(page));
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
+ task_io_account_write(PAGE_CACHE_SIZE);
+ }
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+
+ /*
+ * Reference snap context in page->private. Also set
+ * PagePrivate so that we get invalidatepage callback.
+ */
+ page->private = (unsigned long)snapc;
+ SetPagePrivate(page);
+ } else {
+ dout("ANON set_page_dirty %p (raced truncate?)\n", page);
+ undo = 1;
+ }
+
+ spin_unlock_irq(&mapping->tree_lock);
+
+ if (undo)
+ /* whoops, we failed to dirty the page */
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+ BUG_ON(!PageDirty(page));
+ return 1;
+}
+
+/*
+ * If we are truncating the full page (i.e. offset == 0), adjust the
+ * dirty page counters appropriately. Only called if there is private
+ * data on the page.
+ */
+static void ceph_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_context *snapc = (void *)page->private;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!page->private);
+ BUG_ON(!PagePrivate(page));
+ BUG_ON(!page->mapping);
+
+ inode = page->mapping->host;
+
+ /*
+ * We can get non-dirty pages here due to races between
+ * set_page_dirty and truncate_complete_page; just spit out a
+ * warning, in case we end up with accounting problems later.
+ */
+ if (!PageDirty(page))
+ pr_err("%p invalidatepage %p page not dirty\n", inode, page);
+
+ if (offset == 0)
+ ClearPageChecked(page);
+
+ ci = ceph_inode(inode);
+ if (offset == 0) {
+ dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
+ inode, page, page->index, offset);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ page->private = 0;
+ ClearPagePrivate(page);
+ } else {
+ dout("%p invalidatepage %p idx %lu partial dirty page\n",
+ inode, page, page->index);
+ }
+}
+
+/* just a sanity check */
+static int ceph_releasepage(struct page *page, gfp_t g)
+{
+ struct inode *inode = page->mapping ? page->mapping->host : NULL;
+ dout("%p releasepage %p idx %lu\n", inode, page, page->index);
+ WARN_ON(PageDirty(page));
+ WARN_ON(page->private);
+ WARN_ON(PagePrivate(page));
+ return 0;
+}
+
+/*
+ * read a single page, without unlocking it.
+ */
+static int readpage_nounlock(struct file *filp, struct page *page)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ int err = 0;
+ u64 len = PAGE_CACHE_SIZE;
+
+ dout("readpage inode %p file %p page %p index %lu\n",
+ inode, filp, page, page->index);
+ err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+ page->index << PAGE_CACHE_SHIFT, &len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &page, 1);
+ if (err == -ENOENT)
+ err = 0;
+ if (err < 0) {
+ SetPageError(page);
+ goto out;
+ } else if (err < PAGE_CACHE_SIZE) {
+ /* zero fill remainder of page */
+ zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ }
+ SetPageUptodate(page);
+
+out:
+ return err < 0 ? err : 0;
+}
+
+static int ceph_readpage(struct file *filp, struct page *page)
+{
+ int r = readpage_nounlock(filp, page);
+ unlock_page(page);
+ return r;
+}
+
+/*
+ * Build a vector of contiguous pages from the provided page list.
+ */
+static struct page **page_vector_from_list(struct list_head *page_list,
+ unsigned *nr_pages)
+{
+ struct page **pages;
+ struct page *page;
+ int next_index, contig_pages = 0;
+
+ /* build page vector */
+ pages = kmalloc(sizeof(