aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/Makefile2
-rw-r--r--fs/afs/dir.c6
-rw-r--r--fs/afs/file.c64
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/mntpt.c6
-rw-r--r--fs/anon_inodes.c2
-rw-r--r--fs/bfs/dir.c4
-rw-r--r--fs/block_dev.c330
-rw-r--r--fs/btrfs/acl.c4
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/btrfs/inode.c11
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/btrfs/xattr.h6
-rw-r--r--fs/buffer.c26
-rw-r--r--fs/ceph/addr.c11
-rw-r--r--fs/ceph/auth.c9
-rw-r--r--fs/ceph/auth.h2
-rw-r--r--fs/ceph/auth_none.c1
-rw-r--r--fs/ceph/auth_x.c19
-rw-r--r--fs/ceph/caps.c24
-rw-r--r--fs/ceph/ceph_fs.h62
-rw-r--r--fs/ceph/ceph_strings.c16
-rw-r--r--fs/ceph/debugfs.c13
-rw-r--r--fs/ceph/dir.c45
-rw-r--r--fs/ceph/export.c14
-rw-r--r--fs/ceph/file.c19
-rw-r--r--fs/ceph/inode.c97
-rw-r--r--fs/ceph/ioctl.c2
-rw-r--r--fs/ceph/mds_client.c385
-rw-r--r--fs/ceph/mds_client.h6
-rw-r--r--fs/ceph/messenger.c91
-rw-r--r--fs/ceph/messenger.h10
-rw-r--r--fs/ceph/mon_client.c257
-rw-r--r--fs/ceph/mon_client.h27
-rw-r--r--fs/ceph/msgpool.c180
-rw-r--r--fs/ceph/msgpool.h12
-rw-r--r--fs/ceph/msgr.h21
-rw-r--r--fs/ceph/osd_client.c98
-rw-r--r--fs/ceph/pagelist.c2
-rw-r--r--fs/ceph/rados.h23
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c128
-rw-r--r--fs/ceph/super.h30
-rw-r--r--fs/ceph/xattr.c35
-rw-r--r--fs/coda/file.c2
-rw-r--r--fs/dcache.c20
-rw-r--r--fs/devpts/inode.c9
-rw-r--r--fs/drop_caches.c24
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h5
-rw-r--r--fs/ecryptfs/file.c4
-rw-r--r--fs/ecryptfs/inode.c48
-rw-r--r--fs/ecryptfs/main.c166
-rw-r--r--fs/ecryptfs/mmap.c19
-rw-r--r--fs/ecryptfs/read_write.c13
-rw-r--r--fs/ecryptfs/super.c22
-rw-r--r--fs/exofs/inode.c11
-rw-r--r--fs/ext2/acl.c4
-rw-r--r--fs/ext2/ialloc.c12
-rw-r--r--fs/ext2/xattr.c10
-rw-r--r--fs/ext2/xattr.h12
-rw-r--r--fs/ext2/xattr_security.c2
-rw-r--r--fs/ext2/xattr_trusted.c2
-rw-r--r--fs/ext2/xattr_user.c2
-rw-r--r--fs/ext3/acl.c4
-rw-r--r--fs/ext3/fsync.c3
-rw-r--r--fs/ext3/ialloc.c13
-rw-r--r--fs/ext3/xattr.c10
-rw-r--r--fs/ext3/xattr.h12
-rw-r--r--fs/ext3/xattr_security.c2
-rw-r--r--fs/ext3/xattr_trusted.c2
-rw-r--r--fs/ext3/xattr_user.c2
-rw-r--r--fs/ext4/acl.c4
-rw-r--r--fs/ext4/fsync.c6
-rw-r--r--fs/ext4/ialloc.c12
-rw-r--r--fs/ext4/xattr.c10
-rw-r--r--fs/ext4/xattr.h12
-rw-r--r--fs/ext4/xattr_security.c2
-rw-r--r--fs/ext4/xattr_trusted.c2
-rw-r--r--fs/ext4/xattr_user.c2
-rw-r--r--fs/fcntl.c5
-rw-r--r--fs/fs-writeback.c104
-rw-r--r--fs/generic_acl.c4
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/rgrp.c5
-rw-r--r--fs/gfs2/super.h2
-rw-r--r--fs/gfs2/xattr.c6
-rw-r--r--fs/inode.c26
-rw-r--r--fs/internal.h2
-rw-r--r--fs/ioctl.c15
-rw-r--r--fs/jbd2/checkpoint.c3
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jffs2/acl.c4
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/security.c2
-rw-r--r--fs/jffs2/xattr.c8
-rw-r--r--fs/jffs2/xattr.h8
-rw-r--r--fs/jffs2/xattr_trusted.c2
-rw-r--r--fs/jffs2/xattr_user.c2
-rw-r--r--fs/jfs/jfs_inode.c12
-rw-r--r--fs/logfs/inode.c9
-rw-r--r--fs/minix/bitmap.c5
-rw-r--r--fs/minix/minix.h2
-rw-r--r--fs/minix/namei.c11
-rw-r--r--fs/namei.c5
-rw-r--r--fs/nfsd/nfs4recover.c87
-rw-r--r--fs/nfsd/vfs.c5
-rw-r--r--fs/nilfs2/inode.c11
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/notify/inotify/inotify.c88
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/ocfs2/namei.c9
-rw-r--r--fs/ocfs2/xattr.c12
-rw-r--r--fs/ocfs2/xattr.h12
-rw-r--r--fs/omfs/inode.c4
-rw-r--r--fs/open.c166
-rw-r--r--fs/partitions/acorn.c68
-rw-r--r--fs/partitions/acorn.h10
-rw-r--r--fs/partitions/amiga.c13
-rw-r--r--fs/partitions/amiga.h2
-rw-r--r--fs/partitions/atari.c8
-rw-r--r--fs/partitions/atari.h2
-rw-r--r--fs/partitions/check.c84
-rw-r--r--fs/partitions/check.h12
-rw-r--r--fs/partitions/efi.c93
-rw-r--r--fs/partitions/efi.h2
-rw-r--r--fs/partitions/ibm.c21
-rw-r--r--fs/partitions/ibm.h2
-rw-r--r--fs/partitions/karma.c4
-rw-r--r--fs/partitions/karma.h2
-rw-r--r--fs/partitions/ldm.c89
-rw-r--r--fs/partitions/ldm.h2
-rw-r--r--fs/partitions/mac.c13
-rw-r--r--fs/partitions/mac.h2
-rw-r--r--fs/partitions/msdos.c87
-rw-r--r--fs/partitions/msdos.h2
-rw-r--r--fs/partitions/osf.c4
-rw-r--r--fs/partitions/osf.h2
-rw-r--r--fs/partitions/sgi.c6
-rw-r--r--fs/partitions/sgi.h2
-rw-r--r--fs/partitions/sun.c6
-rw-r--r--fs/partitions/sun.h2
-rw-r--r--fs/partitions/sysv68.c6
-rw-r--r--fs/partitions/sysv68.h2
-rw-r--r--fs/partitions/ultrix.c4
-rw-r--r--fs/partitions/ultrix.h2
-rw-r--r--fs/pipe.c122
-rw-r--r--fs/quota/quota.c32
-rw-r--r--fs/ramfs/inode.c20
-rw-r--r--fs/reiserfs/file.c3
-rw-r--r--fs/reiserfs/namei.c18
-rw-r--r--fs/reiserfs/xattr.c16
-rw-r--r--fs/reiserfs/xattr_acl.c4
-rw-r--r--fs/reiserfs/xattr_security.c2
-rw-r--r--fs/reiserfs/xattr_trusted.c2
-rw-r--r--fs/reiserfs/xattr_user.c2
-rw-r--r--fs/splice.c151
-rw-r--r--fs/statfs.c196
-rw-r--r--fs/super.c321
-rw-r--r--fs/sync.c88
-rw-r--r--fs/sysv/ialloc.c11
-rw-r--r--fs/ubifs/dir.c9
-rw-r--r--fs/udf/ialloc.c11
-rw-r--r--fs/udf/namei.c10
-rw-r--r--fs/ufs/ialloc.c10
-rw-r--r--fs/xattr.c14
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c8
-rw-r--r--fs/xfs/xfs_acl.h4
172 files changed, 2550 insertions, 2321 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 0ba2db44e0b..4331b3b5ee1 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -256,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
return ERR_PTR(-ENOMEM);
}
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode_init_owner(inode, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/Makefile b/fs/Makefile
index 97f340f14ba..e6ec1d309b1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
- stack.o fs_struct.o
+ stack.o fs_struct.o statfs.o
ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index adc1cb771b5..b42d5cc1d6d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -189,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
struct key *key)
{
struct page *page;
- struct file file = {
- .private_data = key,
- };
-
_enter("{%lu},%lu", dir->i_ino, index);
- page = read_mapping_page(dir->i_mapping, index, &file);
+ page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
if (!IS_ERR(page)) {
kmap(page);
if (!PageChecked(page))
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0df9bc2b724..14d89fa58fe 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page,
#endif
/*
- * AFS read page from file, directory or symlink
+ * read page from file, directory or symlink, given a key to use
*/
-static int afs_readpage(struct file *file, struct page *page)
+int afs_page_filler(void *data, struct page *page)
{
- struct afs_vnode *vnode;
- struct inode *inode;
- struct key *key;
+ struct inode *inode = page->mapping->host;
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ struct key *key = data;
size_t len;
off_t offset;
int ret;
- inode = page->mapping->host;
-
- if (file) {
- key = file->private_data;
- ASSERT(key != NULL);
- } else {
- key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
- if (IS_ERR(key)) {
- ret = PTR_ERR(key);
- goto error_nokey;
- }
- }
-
_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
- vnode = AFS_FS_I(inode);
-
BUG_ON(!PageLocked(page));
ret = -ESTALE;
@@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page)
unlock_page(page);
}
- if (!file)
- key_put(key);
_leave(" = 0");
return 0;
error:
SetPageError(page);
unlock_page(page);
- if (!file)
- key_put(key);
-error_nokey:
_leave(" = %d", ret);
return ret;
}
/*
+ * read page from file, directory or symlink, given a file to nominate the key
+ * to be used
+ */
+static int afs_readpage(struct file *file, struct page *page)
+{
+ struct key *key;
+ int ret;
+
+ if (file) {
+ key = file->private_data;
+ ASSERT(key != NULL);
+ ret = afs_page_filler(key, page);
+ } else {
+ struct inode *inode = page->mapping->host;
+ key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ } else {
+ ret = afs_page_filler(key, page);
+ key_put(key);
+ }
+ }
+ return ret;
+}
+
+/*
* read a set of pages
*/
static int afs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
+ struct key *key = file->private_data;
struct afs_vnode *vnode;
int ret = 0;
- _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages);
+ _enter("{%d},{%lu},,%d",
+ key_serial(key), mapping->host->i_ino, nr_pages);
+
+ ASSERT(key != NULL);
vnode = AFS_FS_I(mapping->host);
if (vnode->flags & AFS_VNODE_DELETED) {
@@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
}
/* load the missing pages from the network */
- ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file);
+ ret = read_cache_pages(mapping, pages, afs_page_filler, key);
_leave(" = %d [netting]", ret);
return ret;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a10f2582844..807f284cc75 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -494,6 +494,7 @@ extern const struct file_operations afs_file_operations;
extern int afs_open(struct inode *, struct file *);
extern int afs_release(struct inode *, struct file *);
+extern int afs_page_filler(void *, struct page *);
/*
* flock.c
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b3feddc4f7d..a9e23039ea3 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
*/
int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
{
- struct file file = {
- .private_data = key,
- };
struct page *page;
size_t size;
char *buf;
@@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
/* read the contents of the symlink into the pagecache */
- page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
+ page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
+ afs_page_filler, key);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index e4b75d6eda8..9bd4b3876c9 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void)
* that it already _is_ on the dirty list.
*/
inode->i_state = I_DIRTY;
- inode->i_mode = S_IRUSR | S_IWUSR;
+ inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aadb106..8f73841fc97 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
}
set_bit(ino, info->si_imap);
info->si_freei--;
- inode->i_uid = current_fsuid();
- inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
+ inode_init_owner(inode, dir, mode);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
inode->i_blocks = 0;
inode->i_op = &bfs_file_inops;
inode->i_fop = &bfs_file_operations;
inode->i_mapping->a_ops = &bfs_aops;
- inode->i_mode = mode;
inode->i_ino = ino;
BFS_I(inode)->i_dsk_ino = ino;
BFS_I(inode)->i_sblock = 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88c2e5..26e5f502662 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -245,37 +245,14 @@ struct super_block *freeze_bdev(struct block_device *bdev)
sb = get_active_super(bdev);
if (!sb)
goto out;
- if (sb->s_flags & MS_RDONLY) {
- sb->s_frozen = SB_FREEZE_TRANS;
- up_write(&sb->s_umount);
+ error = freeze_super(sb);
+ if (error) {
+ deactivate_super(sb);
+ bdev->bd_fsfreeze_count--;
mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return sb;
- }
-
- sb->s_frozen = SB_FREEZE_WRITE;
- smp_wmb();
-
- sync_filesystem(sb);
-
- sb->s_frozen = SB_FREEZE_TRANS;
- smp_wmb();
-
- sync_blockdev(sb->s_bdev);
-
- if (sb->s_op->freeze_fs) {
- error = sb->s_op->freeze_fs(sb);
- if (error) {
- printk(KERN_ERR
- "VFS:Filesystem freeze failed\n");
- sb->s_frozen = SB_UNFROZEN;
- deactivate_locked_super(sb);
- bdev->bd_fsfreeze_count--;
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return ERR_PTR(error);
- }
+ return ERR_PTR(error);
}
- up_write(&sb->s_umount);
-
+ deactivate_super(sb);
out:
sync_blockdev(bdev);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -296,40 +273,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
mutex_lock(&bdev->bd_fsfreeze_mutex);
if (!bdev->bd_fsfreeze_count)
- goto out_unlock;
+ goto out;
error = 0;
if (--bdev->bd_fsfreeze_count > 0)
- goto out_unlock;
+ goto out;
if (!sb)
- goto out_unlock;
-
- BUG_ON(sb->s_bdev != bdev);
- down_write(&sb->s_umount);
- if (sb->s_flags & MS_RDONLY)
- goto out_unfrozen;
-
- if (sb->s_op->unfreeze_fs) {
- error = sb->s_op->unfreeze_fs(sb);
- if (error) {
- printk(KERN_ERR
- "VFS:Filesystem thaw failed\n");
- sb->s_frozen = SB_FREEZE_TRANS;
- bdev->bd_fsfreeze_count++;
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- return error;
- }
- }
-
-out_unfrozen:
- sb->s_frozen = SB_UNFROZEN;
- smp_wmb();
- wake_up(&sb->s_wait_unfrozen);
+ goto out;
- if (sb)
- deactivate_locked_super(sb);
-out_unlock:
+ error = thaw_super(sb);
+ if (error) {
+ bdev->bd_fsfreeze_count++;
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return error;
+ }
+out:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return 0;
}
@@ -417,7 +376,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
*/
mutex_unlock(&bd_inode->i_mutex);
- error = blkdev_issue_flush(bdev, NULL);
+ error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
if (error == -EOPNOTSUPP)
error = 0;
@@ -668,41 +627,209 @@ void bd_forget(struct inode *inode)
iput(bdev->bd_inode);
}
-int bd_claim(struct block_device *bdev, void *holder)
+/**
+ * bd_may_claim - test whether a block device can be claimed
+ * @bdev: block device of interest
+ * @whole: whole block device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Test whther @bdev can be claimed by @holder.
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).
+ *
+ * RETURNS:
+ * %true if @bdev can be claimed, %false otherwise.
+ */
+static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
+ void *holder)
{
- int res;
- spin_lock(&bdev_lock);
-
- /* first decide result */
if (bdev->bd_holder == holder)
- res = 0; /* already a holder */
+ return true; /* already a holder */
else if (bdev->bd_holder != NULL)
- res = -EBUSY; /* held by someone else */
+ return false; /* held by someone else */
else if (bdev->bd_contains == bdev)
- res = 0; /* is a whole device which isn't held */
+ return true; /* is a whole device which isn't held */
- else if (bdev->bd_contains->bd_holder == bd_claim)
- res = 0; /* is a partition of a device that is being partitioned */
- else if (bdev->bd_contains->bd_holder != NULL)
- res = -EBUSY; /* is a partition of a held device */
+ else if (whole->bd_holder == bd_claim)
+ return true; /* is a partition of a device that is being partitioned */
+ else if (whole->bd_holder != NULL)
+ return false; /* is a partition of a held device */
else
- res = 0; /* is a partition of an un-held device */
+ return true; /* is a partition of an un-held device */
+}
+
+/**
+ * bd_prepare_to_claim - prepare to claim a block device
+ * @bdev: block device of interest
+ * @whole: the whole device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Prepare to claim @bdev. This function fails if @bdev is already
+ * claimed by another holder and waits if another claiming is in
+ * progress. This function doesn't actually claim. On successful
+ * return, the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
+ * it multiple times.
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+static int bd_prepare_to_claim(struct block_device *bdev,
+ struct block_device *whole, void *holder)
+{
+retry:
+ /* if someone else claimed, fail */
+ if (!bd_may_claim(bdev, whole, holder))
+ return -EBUSY;
+
+ /* if someone else is claiming, wait for it to finish */
+ if (whole->bd_claiming && whole->bd_claiming != holder) {
+ wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&bdev_lock);
+ schedule();
+ finish_wait(wq, &wait);
+ spin_lock(&bdev_lock);
+ goto retry;
+ }
+
+ /* yay, all mine */
+ return 0;
+}
- /* now impose change */
- if (res==0) {
+/**
+ * bd_start_claiming - start claiming a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ *
+ * @bdev is about to be opened exclusively. Check @bdev can be opened
+ * exclusively and mark that an exclusive open is in progress. Each
+ * successful call to this function must be matched with a call to
+ * either bd_claim() or bd_abort_claiming(). If this function
+ * succeeds, the matching bd_claim() is guaranteed to succeed.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to the block device containing @bdev on success, ERR_PTR()
+ * value on failure.
+ */
+static struct block_device *bd_start_claiming(struct block_device *bdev,
+ void *holder)
+{
+ struct gendisk *disk;
+ struct block_device *whole;
+ int partno, err;
+
+ might_sleep();
+
+ /*
+ * @bdev might not have been initialized properly yet, look up
+ * and grab the outer block device the hard way.
+ */
+ disk = get_gendisk(bdev->bd_dev, &partno);
+ if (!disk)
+ return ERR_PTR(-ENXIO);
+
+ whole = bdget_disk(disk, 0);
+ put_disk(disk);
+ if (!whole)
+ return ERR_PTR(-ENOMEM);
+
+ /* prepare to claim, if successful, mark claiming in progress */
+ spin_lock(&bdev_lock);
+
+ err = bd_prepare_to_claim(bdev, whole, holder);
+ if (err == 0) {
+ whole->bd_claiming = holder;
+ spin_unlock(&bdev_lock);
+ return whole;
+ } else {
+ spin_unlock(&bdev_lock);
+ bdput(whole);
+ return ERR_PTR(err);
+ }
+}
+
+/* releases bdev_lock */
+static void __bd_abort_claiming(struct block_device *whole, void *holder)
+{
+ BUG_ON(whole->bd_claiming != holder);
+ whole->bd_claiming = NULL;
+ wake_up_bit(&whole->bd_claiming, 0);
+
+ spin_unlock(&bdev_lock);
+ bdput(whole);
+}
+
+/**
+ * bd_abort_claiming - abort claiming a block device
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Abort a claiming block started by bd_start_claiming(). Note that
+ * @whole is not the block device to be claimed but the whole device
+ * returned by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_abort_claiming(struct block_device *whole, void *holder)
+{
+ spin_lock(&bdev_lock);
+ __bd_abort_claiming(whole, holder); /* releases bdev_lock */
+}
+
+/**
+ * bd_claim - claim a block device
+ * @bdev: block device to claim
+ * @holder: holder trying to claim @bdev
+ *
+ * Try to claim @bdev which must have been opened successfully. This
+ * function may be called with or without preceding
+ * blk_start_claiming(). In the former case, this function is always
+ * successful and terminates the claiming block.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 if successful, -EBUSY if @bdev is already claimed.
+ */
+int bd_claim(struct block_device *bdev, void *holder)
+{
+ struct block_device *whole = bdev->bd_contains;
+ int res;
+
+ might_sleep();
+
+ spin_lock(&bdev_lock);
+
+ res = bd_prepare_to_claim(bdev, whole, holder);
+ if (res == 0) {
/* note that for a whole device bd_holders
* will be incremented twice, and bd_holder will
* be set to bd_claim before being set to holder
*/
- bdev->bd_contains->bd_holders ++;
- bdev->bd_contains->bd_holder = bd_claim;
+ whole->bd_holders++;
+ whole->bd_holder = bd_claim;
bdev->bd_holders++;
bdev->bd_holder = holder;
}
- spin_unlock(&bdev_lock);
+
+ if (whole->bd_claiming)
+ __bd_abort_claiming(whole, holder); /* releases bdev_lock */
+ else
+ spin_unlock(&bdev_lock);
+
return res;
}
-
EXPORT_SYMBOL(bd_claim);
void bd_release(struct block_device *bdev)
@@ -1316,6 +1443,7 @@ EXPORT_SYMBOL(blkdev_get);
static int blkdev_open(struct inode * inode, struct file * filp)
{
+ struct block_device *whole = NULL;
struct block_device *bdev;
int res;
@@ -1338,22 +1466,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
if (bdev == NULL)
return -ENOMEM;
+ if (filp->f_mode & FMODE_EXCL) {
+ whole = bd_start_claiming(bdev, filp);
+ if (IS_ERR(whole)) {
+ bdput(bdev);
+ return PTR_ERR(whole);
+ }
+ }
+
filp->f_mapping = bdev->bd_inode->i_mapping;
res = blkdev_get(bdev, filp->f_mode);
- if (res)
- return res;
- if (filp->f_mode & FMODE_EXCL) {
- res = bd_claim(bdev, filp);
- if (res)
- goto out_blkdev_put;
+ if (whole) {
+ if (res == 0)
+ BUG_ON(bd_claim(bdev, filp) != 0);
+ else
+ bd_abort_claiming(whole, filp);
}
- return 0;
-
- out_blkdev_put:
- blkdev_put(bdev, filp->f_mode);
return res;
}
@@ -1564,27 +1695,34 @@ EXPORT_SYMBOL(lookup_bdev);
*/
struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
{
- struct block_device *bdev;
- int error = 0;
+ struct block_device *bdev, *whole;
+ int error;
bdev = lookup_bdev(path);
if (IS_ERR(bdev))
return bdev;
+ whole = bd_start_claiming(bdev, holder);
+ if (IS_ERR(whole)) {
+ bdput(bdev);
+ return whole;
+ }
+
error = blkdev_get(bdev, mode);
if (error)
- return ERR_PTR(error);
+ goto out_abort_claiming;
+
error = -EACCES;
if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
- goto blkdev_put;
- error = bd_claim(bdev, holder);
- if (error)
- goto blkdev_put;
+ goto out_blkdev_put;
+ BUG_ON(bd_claim(bdev, holder) != 0);
return bdev;
-
-blkdev_put:
+
+out_blkdev_put:
blkdev_put(bdev, mode);
+out_abort_claiming:
+ bd_abort_claiming(whole, holder);
return ERR_PTR(error);
}
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ef7b26724e..8d432cd9d58 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -282,14 +282,14 @@ int btrfs_acl_chmod(struct inode *inode)
return ret;
}
-struct xattr_handler btrfs_xattr_acl_default_handler = {
+const struct xattr_handler btrfs_xattr_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.get = btrfs_xattr_acl_get,
.set = btrfs_xattr_acl_set,
};
-struct xattr_handler btrfs_xattr_acl_access_handler = {
+const struct xattr_handler btrfs_xattr_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.flags = ACL_TYPE_ACCESS,
.get = btrfs_xattr_acl_get,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaae..c6a4f459ad7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
- DISCARD_FL_BARRIER);
+ BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
}
static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc641d4e..d601629b85d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4121,16 +4121,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (ret != 0)
goto fail;
- inode->i_uid = current_fsuid();
-
- if (dir && (dir->i_mode & S_ISGID)) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current_fsgid();
-
- inode->i_mode = mode;
+ inode_init_owner(inode, dir, mode);
inode->i_ino = objectid;
inode_set_bytes(inode, 0);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f..59acd3eb288 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -282,7 +282,7 @@ err:
* List of handlers for synthetic system.* attributes. All real ondisk
* attributes are handled directly.
*/
-struct xattr_handler *btrfs_xattr_handlers[] = {
+const struct xattr_handler *btrfs_xattr_handlers[] = {
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
&btrfs_xattr_acl_access_handler,
&btrfs_xattr_acl_default_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e..7a43fd640bb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
#include <linux/xattr.h>
-extern struct xattr_handler btrfs_xattr_acl_access_handler;
-extern struct xattr_handler btrfs_xattr_acl_default_handler;
-extern struct xattr_handler *btrfs_xattr_handlers[];
+extern const struct xattr_handler btrfs_xattr_acl_access_handler;
+extern const struct xattr_handler btrfs_xattr_acl_default_handler;
+extern const struct xattr_handler *btrfs_xattr_handlers[];
extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
void *buffer, size_t size);
diff --git a/fs/buffer.c b/fs/buffer.c
index c9c266db062..e8aa7081d25 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
return;
invalidate_bh_lrus();
+ lru_add_drain_all(); /* make sure all lru add caches are flushed */
invalidate_mapping_pages(mapping, 0, -1);
}
EXPORT_SYMBOL(invalidate_bdev);
@@ -560,26 +561,17 @@ repeat:
return err;
}
-static void do_thaw_all(struct work_struct *work)
+static void do_thaw_one(struct super_block *sb, void *unused)
{
- struct super_block *sb;
char b[BDEVNAME_SIZE];
+ while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+ printk(KERN_WARNING "Emergency Thaw on %s\n",
+ bdevname(sb->s_bdev, b));
+}
- spin_lock(&sb_lock);
-restart:
- list_for_each_entry(sb, &super_blocks, s_list) {
- sb->s_count++;
- spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
- printk(KERN_WARNING "Emergency Thaw on %s\n",
- bdevname(sb->s_bdev, b));
- up_read(&sb->s_umount);
- spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
- }
- spin_unlock(&sb_lock);
+static void do_thaw_all(struct work_struct *work)
+{
+ iterate_supers(do_thaw_one, NULL);
kfree(work);
printk(KERN_WARNING "Emergency Thaw complete\n");
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed..d9c60b84949 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
int rc = 0;
struct page **pages;
- struct pagevec pvec;
loff_t offset;
u64 len;
@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
if (rc < 0)
goto out;
- /* set uptodate and add to lru in pagevec-sized chunks */
- pagevec_init(&pvec, 0);
for (; !list_empty(page_list) && len > 0;
rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
struct page *page =
@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
zero_user_segment(page, s, PAGE_CACHE_SIZE);
}
- if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+ if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
page_cache_release(page);
dout("readpages %p add_to_page_cache failed %p\n",
inode, page);
@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
flush_dcache_page(page);
SetPageUptodate(page);
unlock_page(page);
- if (pagevec_add(&pvec, page) == 0)
- pagevec_lru_add_file(&pvec); /* add to lru */
+ page_cache_release(page);
}
- pagevec_lru_add_file(&pvec);
rc = 0;
out:
@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
ceph_release_pages(req->r_pages, req->r_num_pages);
if (req->r_pages_from_pool)
mempool_free(req->r_pages,
- ceph_client(inode->i_sb)->wb_pagevec_pool);
+ ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
else
kfree(req->r_pages);
ceph_osdc_put_request(req);
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c..9f46de2ba7a 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
ret = ac->ops->build_request(ac, p + sizeof(u32), end);
if (ret < 0) {
- pr_err("error %d building request\n", ret);
+ pr_err("error %d building auth method %s request\n", ret,
+ ac->ops->name);
return ret;
}
dout(" built request %d bytes\n", ret);
@@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
if (ac->protocol != protocol) {
ret = ceph_auth_init_protocol(ac, protocol);
if (ret) {
- pr_err("error %d on auth protocol %d init\n",
- ret, protocol);
+ pr_err("error %d on auth method %s init\n",
+ ret, ac->ops->name);
goto out;
}
}
@@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
if (ret == -EAGAIN) {
return ceph_build_auth_request(ac, reply_buf, reply_len);
} else if (ret) {
- pr_err("authentication error %d\n", ret);
+ pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
return ret;
}
return 0;
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb26..4429a707c02 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
struct ceph_authorizer;
struct ceph_auth_client_ops {
+ const char *name;
+
/*
* true if we are authenticated and can connect to
* services.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f..24407c11929 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
}
static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+ .name = "none",
.reset = reset,
.destroy = destroy,
.is_authenticated = is_authenticated,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da88..7b206231566 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
int ret;
char *dbuf;
char *ticket_buf;
- u8 struct_v;
+ u8 reply_struct_v;
dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
if (!dbuf)
@@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
goto out_dbuf;
ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
- struct_v = ceph_decode_8(&p);
- if (struct_v != 1)
+ reply_struct_v = ceph_decode_8(&p);
+ if (reply_struct_v != 1)
goto bad;
num = ceph_decode_32(&p);
dout("%d tickets\n", num);
while (num--) {
int type;
- u8 struct_v;
+ u8 tkt_struct_v, blob_struct_v;
struct ceph_x_ticket_handler *th;
void *dp, *dend;
int dlen;
@@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
type = ceph_decode_32(&p);
dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
- struct_v = ceph_decode_8(&p);
- if (struct_v != 1)
+ tkt_struct_v = ceph_decode_8(&p);
+ if (tkt_struct_v != 1)
goto bad;
th = get_ticket_handler(ac, type);
@@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
dend = dbuf + dlen;
dp = dbuf;
- struct_v = ceph_decode_8(&dp);
- if (struct_v != 1)
+ tkt_struct_v = ceph_decode_8(&dp);
+ if (tkt_struct_v != 1)
goto bad;
memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
tpend = tp + dlen;
dout(" ticket blob is %d bytes\n", dlen);
ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
- struct_v = ceph_decode_8(&tp);
+ blob_struct_v = ceph_decode_8(&tp);
new_secret_id = ceph_decode_64(&tp);
ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
if (ret)
@@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
static const struct ceph_auth_client_ops ceph_x_ops = {
+ .name = "x",
.is_authenticated = ceph_x_is_authenticated,
.build_request = ceph_x_build_request,
.handle_reply = ceph_x_handle_reply,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b27..0dd0b81e64f 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
{
struct ceph_mds_session *session = cap->session;
struct ceph_inode_info *ci = cap->ci;
- struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+ struct ceph_mds_client *mdsc =
+ &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
int removed = 0;
dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
seq, issue_seq, mseq, follows, size, max_size,
xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
- if (IS_ERR(msg))
- return PTR_ERR(msg);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
+ if (!msg)
+ return -ENOMEM;
msg->hdr.tid = cpu_to_le64(flush_tid);
@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
*/
void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
{
- struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+ struct ceph_mds_client *mdsc =
+ &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
struct inode *inode = &ci->vfs_inode;
int was = ci->i_dirty_caps;
int dirty = 0;
@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
static int __mark_caps_flushing(struct inode *inode,
struct ceph_mds_session *session)
{
- struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int flushing;
@@ -1663,7 +1665,7 @@ ack:
static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
unsigned *flush_tid)
{
- struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int unlock_session = session ? 0 : 1;
int flushing = 0;
@@ -1716,10 +1718,9 @@ out_unlocked:
static int caps_are_flushed(struct inode *inode, unsigned tid)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- int dirty, i, ret = 1;
+ int i, ret = 1;
spin_lock(&inode->i_lock);
- dirty = __ceph_caps_dirty(ci);
for (i = 0; i < CEPH_CAP_BITS; i++)
if ((ci->i_flushing_caps & (1 << i)) &&
ci->i_cap_flush_tid[i] <= tid) {
@@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
err = wait_event_interruptible(ci->i_cap_wq,
caps_are_flushed(inode, flush_tid));
} else {
- struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc =
+ &ceph_sb_to_client(inode->i_sb)->mdsc;
spin_lock(&inode->i_lock);
if (__ceph_caps_dirty(ci))
@@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
__releases(inode->i_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef365..3b9eeed097b 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
* Ceph release version
*/
#define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 19
+#define CEPH_VERSION_MINOR 20
#define CEPH_VERSION_PATCH 0
#define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
* client-facing protocol.
*/
#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
+#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
#define CEPH_MON_PROTOCOL 5 /* cluster internal */
#define CEPH_OSDC_PROTOCOL 24 /* server/client */
#define CEPH_MDSC_PROTOCOL 32 /* server/client */
@@ -53,8 +53,18 @@
/*
* feature bits
*/
-#define CEPH_FEATURE_SUPPORTED 0
-#define CEPH_FEATURE_REQUIRED 0
+#define CEPH_FEATURE_UID 1
+#define CEPH_FEATURE_NOSRCADDR 2
+#define CEPH_FEATURE_FLOCK 4
+
+#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
+#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
/*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
#define CEPH_AUTH_NONE 0x1
#define CEPH_AUTH_CEPHX 0x2
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
+
/*********************************************
* message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
#define CEPH_MSG_CLIENT_SNAP 0x312
#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY 48
+#define CEPH_MSG_POOLOP 49
+
+
/* osd */
#define CEPH_MSG_OSD_MAP 41
#define CEPH_MSG_OSD_OP 42
#define CEPH_MSG_OSD_OPREPLY 43
+/* pool operations */
+enum {
+ POOL_OP_CREATE = 0x01,
+ POOL_OP_DELETE = 0x02,
+ POOL_OP_AUID_CHANGE = 0x03,
+ POOL_OP_CREATE_SNAP = 0x11,
+ POOL_OP_DELETE_SNAP = 0x12,
+ POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
+ POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
+};
+
struct ceph_mon_request_header {
__le64 have_version;
__le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
struct ceph_statfs st;
} __attribute__ ((packed));
+const char *ceph_pool_op_name(int op);
+
+struct ceph_mon_poolop {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 pool;
+ __le32 op;
+ __le64 auid;
+ __le64 snapid;
+ __le32 name_len;
+} __attribute__ ((packed));
+
+struct ceph_mon_poolop_reply {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 reply_code;
+ __le32 epoch;
+ char has_data;
+ char data[0];
+} __attribute__ ((packed));
+
+struct ceph_mon_unmanaged_snap {
+ __le64 snapid;
+} __attribute__ ((packed));
+
struct ceph_osd_getmap {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
@@ -308,6 +361,7 @@ union ceph_mds_request_args {
struct {
__le32 frag; /* which dir fragment */
__le32 max_entries; /* how many dentries to grab */
+ __le32 max_bytes;
} __attribute__ ((packed)) readdir;
struct {
__le32 mode;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c6..7503aee828c 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
case CEPH_ENTITY_TYPE_OSD: return "osd";
case CEPH_ENTITY_TYPE_MON: return "mon";
case CEPH_ENTITY_TYPE_CLIENT: return "client";
- case CEPH_ENTITY_TYPE_ADMIN: return "admin";
case CEPH_ENTITY_TYPE_AUTH: return "auth";
default: return "unknown";
}
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+ case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
case CEPH_OSD_OP_PULL: return "pull";
case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
}
return "???";
}
+
+const char *ceph_pool_op_name(int op)
+{
+ switch (op) {
+ case POOL_OP_CREATE: return "create";
+ case POOL_OP_DELETE: return "delete";
+ case POOL_OP_AUID_CHANGE: return "auid change";
+ case POOL_OP_CREATE_SNAP: return "create snap";
+ case POOL_OP_DELETE_SNAP: return "delete snap";
+ case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
+ case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
+ }
+ return "???";
+}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92ac..3be33fb066c 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
static int monc_show(struct seq_file *s, void *p)
{
struct ceph_client *client = s->private;
- struct ceph_mon_statfs_request *req;
+ struct ceph_mon_generic_request *req;
struct ceph_mon_client *monc = &client->monc;
struct rb_node *rp;
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
if (monc->want_next_osdmap)
seq_printf(s, "want next osdmap\n");
- for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
- req = rb_entry(rp, struct ceph_mon_statfs_request, node);
- seq_printf(s, "%lld statfs\n", req->tid);
+ for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
+ __u16 op;
+ req = rb_entry(rp, struct ceph_mon_generic_request, node);
+ op = le16_to_cpu(req->request->hdr.type);
+ if (op == CEPH_MSG_STATFS)
+ seq_printf(s, "%lld statfs\n", req->tid);
+ else
+ seq_printf(s, "%lld unknown\n", req->tid);
}
mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed2..4fd30900eff 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
return -ENOMEM; /* oh well */
spin_lock(&dentry->d_lock);
- if (dentry->d_fsdata) /* lost a race */
+ if (dentry->d_fsdata) {
+ /* lost a race */
+ kmem_cache_free(ceph_dentry_cachep, di);
goto out_unlock;
+ }
di->dentry = dentry;
di->lease_session = NULL;
dentry->d_fsdata = di;
@@ -125,7 +128,8 @@ more:
dentry = list_entry(p, struct dentry, d_u.d_child);
di = ceph_dentry(dentry);
while (1) {
- dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+ dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
+ d_unhashed(dentry) ? "!hashed" : "hashed",
parent->d_subdirs.prev, parent->d_subdirs.next);
if (p == &parent->d_subdirs) {
fi->at_end = 1;
@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
const int max_entries = client->mount_args->max_readdir;
+ const int max_bytes = client->mount_args->max_readdir_bytes;
dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
if (fi->at_end)
@@ -312,6 +317,7 @@ more:
req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
+ req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
req->r_num_caps = max_entries + 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0) {
@@ -335,7 +341,7 @@ more:
if (req->r_reply_info.dir_end) {
kfree(fi->last_name);
fi->last_name = NULL;
- fi->next_offset = 0;
+ fi->next_offset = 2;
} else {
rinfo = &req->r_reply_info;
err = note_last_dentry(fi,
@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
struct dentry *dentry, int err)
{
- struct ceph_client *client = ceph_client(dentry->d_sb);
+ struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
struct inode *parent = dentry->d_parent->d_inode;
/* .snap dir? */
@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
!is_root_ceph_dentry(dir, dentry) &&
(ci->i_ceph_flags & CEPH_I_COMPLETE) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
- di->offset = ci->i_max_offset++;
spin_unlock(&dir->i_lock);
dout(" dir %p complete, -ENOENT\n", dir);
d_add(dentry, NULL);
@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
- new_dentry->d_time = jiffies;
- ceph_dentry(new_dentry)->lease_shared_gen = 0;
+ ceph_invalidate_dentry_lease(new_dentry);
}
ceph_mdsc_put_request(req);
return err;
}
+/*
+ * Ensure a dentry lease will no longer revalidate.
+ */
+void ceph_invalidate_dentry_lease(struct dentry *dentry)
+{
+ spin_lock(&dentry->d_lock);
+ dentry->d_time = jiffies;
+ ceph_dentry(dentry)->lease_shared_gen = 0;
+ spin_unlock(&dentry->d_lock);
+}
/*
* Check if dentry lease is valid. If not, delete the lease. Try to
@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
struct inode *dir = dentry->d_parent->d_inode;
- dout("d_revalidate %p '%.*s' inode %p\n", dentry,
- dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+ dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
+ ceph_dentry(dentry)->offset);
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
struct ceph_inode_info *ci = ceph_inode(inode);
int left;
- if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+ if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
return -EISDIR;
if (!cf->dir_info) {
@@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
if (di) {
- mdsc = &ceph_client(dn->d_sb)->mdsc;
+ mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_add_tail(&di->lru, &mdsc->dentry_lru);
mdsc->num_dentry++;
@@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
struct ceph_dentry_info *di = ceph_dentry(dn);
struct ceph_mds_client *mdsc;
- dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
- dn->d_name.len, dn->d_name.name);
+ dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
+ dn->d_name.len, dn->d_name.name, di->offset);
if (di) {
- mdsc = &ceph_client(dn->d_sb)->mdsc;
+ mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_move_tail(&di->lru, &mdsc->dentry_lru);
spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
dn->d_name.len, dn->d_name.name);
if (di) {
- mdsc = &ceph_client(dn->d_sb)->mdsc;
+ mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
spin_lock(&mdsc->dentry_lru_lock);
list_del_init(&di->lru);
mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb32..17447644d67 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
return ERR_PTR(-ESTALE);
dentry = d_obtain_alias(inode);
- if (!dentry) {
+ if (IS_ERR(dentry)) {
pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
fh->ino, inode);
iput(inode);
- return ERR_PTR(-ENOMEM);
+ return dentry;
}
err = ceph_init_dentry(dentry);
@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
static struct dentry *__cfh_to_dentry(struct super_block *sb,
struct ceph_nfs_confh *cfh)
{
- struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
+ struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
struct dentry *dentry;
struct ceph_vino vino;
@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
}
dentry = d_obtain_alias(inode);
- if (!dentry) {
+ if (IS_ERR(dentry)) {
pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
cfh->ino, inode);
iput(inode);
- return ERR_PTR(-ENOMEM);
+ return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
return ERR_PTR(-ESTALE);
dentry = d_obtain_alias(inode);
- if (!dentry) {
+ if (IS_ERR(dentry)) {
pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
cfh->ino, inode);
iput(inode);
- return ERR_PTR(-ENOMEM);
+ return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ed6f19721d6..6512b6701b9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
/*
* allocate a vector new pages
*/
-static struct page **alloc_page_vector(int num_pages)
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
{
struct page **pages;
int i;
- pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+ pages = kmalloc(sizeof(*pages) * num_pages, flags);
if (!pages)
return ERR_PTR(-ENOMEM);
for (i = 0; i < num_pages; i++) {
- pages[i] = alloc_page(GFP_NOFS);
+ pages[i] = __page_cache_alloc(flags);
if (pages[i] == NULL) {
ceph_release_page_vector(pages, i);
return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
* in sequence.
*/
} else {
- pages = alloc_page_vector(num_pages);
+ pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
}
if (IS_ERR(pages))
return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
do_sync,
ci->i_truncate_seq, ci->i_truncate_size,
&mtime, false, 2);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ if (!req)
+ return -ENOMEM;
num_pages = calc_pages_for(pos, len);
@@ -668,7 +668,7 @@ more:
truncate_inode_pages_range(inode->i_mapping, pos,
(pos+len) | (PAGE_CACHE_SIZE-1));
} else {
- pages = alloc_page_vector(num_pages);
+ pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+ struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
loff_t endoff = pos + iov->iov_len;
int got = 0;
int ret, err;
@@ -844,8 +844,7 @@ retry_snap:
if ((ret >= 0 || ret == -EIOCBQUEUED) &&
((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
|| ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
- err = vfs_fsync_range(file, file->f_path.dentry,
- pos, pos + ret - 1, 1);
+ err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
if (err < 0)
ret = err;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeb..a81b8b662c7 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
- &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+ &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
struct ceph_snap_realm *realm = ci->i_snap_realm;
dout(" dropping residual ref to snap realm %p\n", realm);
@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode,
memcpy(ci->i_xattrs.blob->vec.iov_base,
iinfo->xattr_data, iinfo->xattr_len);
ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+ xattr_blob = NULL;
}
inode->i_mapping->a_ops = &ceph_aops;
inode->i_mapping->backing_dev_info =
- &ceph_client(inode->i_sb)->backing_dev_info;
+ &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode,
/* set dir completion flag? */
if (ci->i_files == 0 && ci->i_subdirs == 0 &&
ceph_snap(inode) == CEPH_NOSNAP &&
- (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
+ (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+ (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
dout(" marking %p complete (empty)\n", inode);
ci->i_ceph_flags |= CEPH_I_COMPLETE;
ci->i_max_offset = 2;
}
/* it may be better to set st_size in getattr instead? */
- if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
+ if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
inode->i_size = ci->i_rbytes;
break;
default:
@@ -802,6 +804,37 @@ out_unlock:
}
/*
+ * Set dentry's directory position based on the current dir's max, and
+ * order it in d_subdirs, so that dcache_readdir behaves.
+ */
+static void ceph_set_dentry_offset(struct dentry *dn)
+{
+ struct dentry *dir = dn->d_parent;
+ struct inode *inode = dn->d_parent->d_inode;
+ struct ceph_dentry_info *di;
+
+ BUG_ON(!inode);
+
+ di = ceph_dentry(dn);
+
+ spin_lock(&inode->i_lock);
+ if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }
+ di->offset = ceph_inode(inode)->i_max_offset++;
+ spin_unlock(&inode->i_lock);
+
+ spin_lock(&dcache_lock);
+ spin_lock(&dn->d_lock);
+ list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
+ dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
+ dn->d_u.d_child.prev, dn->d_u.d_child.next);
+ spin_unlock(&dn->d_lock);
+ spin_unlock(&dcache_lock);
+}
+
+/*
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
*
@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
{
struct dentry *realdn;
+ BUG_ON(dn->d_inode);
+
/* dn must be unhashed */
if (!d_unhashed(dn))
d_drop(dn);
@@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
dn = realdn;
} else {
BUG_ON(!ceph_dentry(dn));
-
dout("dn %p attached to %p ino %llx.%llx\n",
dn, dn->d_inode, ceph_vinop(dn->d_inode));
}
if ((!prehash || *prehash) && d_unhashed(dn))
d_rehash(dn);
+ ceph_set_dentry_offset(dn);
out:
return dn;
}
/*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- */
-static void ceph_set_dentry_offset(struct dentry *dn)
-{
- struct dentry *dir = dn->d_parent;
- struct inode *inode = dn->d_parent->d_inode;
- struct ceph_dentry_info *di;
-
- BUG_ON(!inode);
-
- di = ceph_dentry(dn);
-
- spin_lock(&inode->i_lock);
- di->offset = ceph_inode(inode)->i_max_offset++;
- spin_unlock(&inode->i_lock);
-
- spin_lock(&dcache_lock);
- spin_lock(&dn->d_lock);
- list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
- dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
- dn->d_u.d_child.prev, dn->d_u.d_child.next);
- spin_unlock(&dn->d_lock);
- spin_unlock(&dcache_lock);
-}
-
-/*
* Incorporate results into the local cache. This is either just
* one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
* after a lookup).
@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
dout("fill_trace reply is empty!\n");
- if (rinfo->head->result == 0 && req->r_locked_dir) {
- struct ceph_inode_info *ci =
- ceph_inode(req->r_locked_dir);
- dout(" clearing %p complete (empty trace)\n",
- req->r_locked_dir);
- ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
- ci->i_release_count++;
- }
+ if (rinfo->head->result == 0 && req->r_locked_dir)
+ ceph_invalidate_dir_request(req);
return 0;
}
@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
req->r_old_dentry->d_name.len,
req->r_old_dentry->d_name.name,
dn, dn->d_name.len, dn->d_name.name);
+
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
- dn->d_time = jiffies;
- ceph_dentry(dn)->lease_shared_gen = 0;
+ ceph_invalidate_dentry_lease(dn);
+
/* take overwritten dentry's readdir offset */
+ dout("dn %p gets %p offset %lld (old offset %lld)\n",
+ req->r_old_dentry, dn, ceph_dentry(dn)->offset,
+ ceph_dentry(req->r_old_dentry)->offset);
ceph_dentry(req->r_old_dentry)->offset =
ceph_dentry(dn)->offset;
+
dn = req->r_old_dentry; /* use old_dentry */
in = dn->d_inode;
}
@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
goto done;
}
req->r_dentry = dn; /* may have spliced */
- ceph_set_dentry_offset(dn);
igrab(in);
} else if (ceph_ino(in) == vino.ino &&
ceph_snap(in) == vino.snap) {
@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
err = PTR_ERR(dn);
goto done;
}
- ceph_set_dentry_offset(dn);
req->r_dentry = dn; /* may have spliced */
igrab(in);
rinfo->head->is_dentry = 1; /* fool notrace handlers */
@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+ if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
&ci->i_vmtruncate_work)) {
dout("ceph_queue_vmtruncate %p\n", inode);
igrab(inode);
@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
struct inode *parent_inode = dentry->d_parent->d_inode;
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
- struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
+ struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
int issued;
int release = 0, dirtied = 0;
int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae6284..d085f07756b 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct ceph_ioctl_dataloc dl;
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+ struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
u64 len = 1, olen;
u64 tmp;
struct ceph_object_layout ol;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e0..885aa5710cf 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -40,7 +40,7 @@
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head);
-const static struct ceph_connection_operations mds_con_ops;
+static const struct ceph_connection_operations mds_con_ops;
/*
@@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
- if (IS_ERR(msg)) {
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
+ if (!msg) {
pr_err("create_session_msg ENOMEM creating msg\n");
- return ERR_PTR(PTR_ERR(msg));
+ return NULL;
}
h = msg->front.iov_base;
h->op = cpu_to_le32(op);
@@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
struct ceph_msg *msg;
int mstate;
int mds = session->s_mds;
- int err = 0;
/* wait for mds to go active? */
mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
/* send connect message */
msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
- if (IS_ERR(msg)) {
- err = PTR_ERR(msg);
- goto out;
- }
+ if (!msg)
+ return -ENOMEM;
ceph_con_send(&session->s_con, msg);
-
-out:
return 0;
}
@@ -804,12 +799,49 @@ out:
}
static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
- void *arg)
+ void *arg)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ int drop = 0;
+
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
- ceph_remove_cap(cap);
+ spin_lock(&inode->i_lock);
+ __ceph_remove_cap(cap);
+ if (!__ceph_is_any_real_caps(ci)) {
+ struct ceph_mds_client *mdsc =
+ &ceph_sb_to_client(inode->i_sb)->mdsc;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&ci->i_dirty_item)) {
+ pr_info(" dropping dirty %s state for %p %lld\n",
+ ceph_cap_string(ci->i_dirty_caps),
+ inode, ceph_ino(inode));
+ ci->i_dirty_caps = 0;
+ list_del_init(&ci->i_dirty_item);
+ drop = 1;
+ }
+ if (!list_empty(&ci->i_flushing_item)) {
+ pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+ ceph_cap_string(ci->i_flushing_caps),
+ inode, ceph_ino(inode));
+ ci->i_flushing_caps = 0;
+ list_del_init(&ci->i_flushing_item);
+ mdsc->num_cap_flushing--;
+ drop = 1;
+ }
+ if (drop && ci->i_wrbuffer_ref) {
+ pr_info(" dropping dirty data for %p %lld\n",
+ inode, ceph_ino(inode));
+ ci->i_wrbuffer_ref = 0;
+ ci->i_wrbuffer_ref_head = 0;
+ drop++;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
+ spin_unlock(&inode->i_lock);
+ while (drop--)
+ iput(inode);
return 0;
}
@@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
dout("remove_session_caps on %p\n", session);
iterate_session_caps(session, remove_session_caps_cb, NULL);
BUG_ON(session->s_nr_caps > 0);
+ BUG_ON(!list_empty(&session->s_cap_flushing));
cleanup_cap_releases(session);
}
@@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
ceph_mds_state_name(state));
msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
++session->s_renew_seq);
- if (IS_ERR(msg))
- return PTR_ERR(msg);
+ if (!msg)
+ return -ENOMEM;
ceph_con_send(&session->s_con, msg);
return 0;
}
@@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_msg *msg;
- int err = 0;
dout("request_close_session mds%d state %s seq %lld\n",
session->s_mds, session_state_name(session->s_state),
session->s_seq);
msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
- if (IS_ERR(msg))
- err = PTR_ERR(msg);
- else
- ceph_con_send(&session->s_con, msg);
- return err;
+ if (!msg)
+ return -ENOMEM;
+ ceph_con_send(&session->s_con, msg);
+ return 0;
}
/*
@@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
while (session->s_num_cap_releases < session->s_nr_caps + extra) {
spin_unlock(&session->s_cap_lock);
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
- 0, 0, NULL);
+ GFP_NOFS);
if (!msg)
goto out_unlocked;
dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_msg *msg;
dout("send_cap_releases mds%d\n", session->s_mds);
- while (1) {
- spin_lock(&session->s_cap_lock);
- if (list_empty(&session->s_cap_releases_done))
- break;
+ spin_lock(&session->s_cap_lock);
+ while (!list_empty(&session->s_cap_releases_done)) {
msg = list_first_entry(&session->s_cap_releases_done,
struct ceph_msg, list_head);
list_del_init(&msg->list_head);
@@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg);
+ spin_lock(&session->s_cap_lock);
}
spin_unlock(&session->s_cap_lock);
}
+static void discard_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_msg *msg;
+ struct ceph_mds_cap_release *head;
+ unsigned num;
+
+ dout("discard_cap_releases mds%d\n", session->s_mds);
+ spin_lock(&session->s_cap_lock);
+
+ /* zero out the in-progress message */
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+ head = msg->front.iov_base;
+ num = le32_to_cpu(head->num);
+ dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
+ head->num = cpu_to_le32(0);
+ session->s_num_cap_releases += num;
+
+ /* requeue completed messages */
+ while (!list_empty(&session->s_cap_releases_done)) {
+ msg = list_first_entry(&session->s_cap_releases_done,
+ struct ceph_msg, list_head);
+ list_del_init(&msg->list_head);
+
+ head = msg->front.iov_base;
+ num = le32_to_cpu(head->num);
+ dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
+ num);
+ session->s_num_cap_releases += num;
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+ list_add(&msg->list_head, &session->s_cap_releases);
+ }
+
+ spin_unlock(&session->s_cap_lock);
+}
+
/*
* requests
*/
@@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
if (!req)
return ERR_PTR(-ENOMEM);
+ mutex_init(&req->r_fill_mutex);
req->r_started = jiffies;
req->r_resend_mds = -1;
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1320,7 @@ retry:
len += 1 + temp->d_name.len;
temp = temp->d_parent;
if (temp == NULL) {
- pr_err("build_path_dentry corrupt dentry %p\n", dentry);
+ pr_err("build_path corrupt dentry %p\n", dentry);
return ERR_PTR(-EINVAL);
}
}
@@ -1267,7 +1336,7 @@ retry:
struct inode *inode = temp->d_inode;
if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
- dout("build_path_dentry path+%d: %p SNAPDIR\n",
+ dout("build_path path+%d: %p SNAPDIR\n",
pos, temp);
} else if (stop_on_nosnap && inode &&
ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1347,18 @@ retry:
break;
strncpy(path + pos, temp->d_name.name,
temp->d_name.len);
- dout("build_path_dentry path+%d: %p '%.*s'\n",
- pos, temp, temp->d_name.len, path + pos);
}
if (pos)
path[--pos] = '/';
temp = temp->d_parent;
if (temp == NULL) {
- pr_err("build_path_dentry corrupt dentry\n");
+ pr_err("build_path corrupt dentry\n");
kfree(path);
return ERR_PTR(-EINVAL);
}
}
if (pos != 0) {
- pr_err("build_path_dentry did not end path lookup where "
+ pr_err("build_path did not end path lookup where "
"expected, namelen is %d, pos is %d\n", len, pos);
/* presumably this is only possible if racing with a
rename of one of the parent directories (we can not
@@ -1303,7 +1370,7 @@ retry:
*base = ceph_ino(temp->d_inode);
*plen = len;
- dout("build_path_dentry on %p %d built %llx '%.*s'\n",
+ dout("build_path on %p %d built %llx '%.*s'\n",
dentry, atomic_read(&dentry->d_count), *base, len, path);
return path;
}
@@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_old_dentry_drop)
len += req->r_old_dentry->d_name.len;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
- if (IS_ERR(msg))
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
+ if (!msg) {
+ msg = ERR_PTR(-ENOMEM);
goto out_free2;
+ }
msg->hdr.tid = cpu_to_le64(req->r_tid);
@@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
}
msg = create_request_message(mdsc, req, mds);
if (IS_ERR(msg)) {
- req->r_reply = ERR_PTR(PTR_ERR(msg));
+ req->r_err = PTR_ERR(msg);
complete_request(mdsc, req);
- return -PTR_ERR(msg);
+ return PTR_ERR(msg);
}
req->r_request = msg;
@@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
int mds = -1;
int err = -EAGAIN;
- if (req->r_reply)
+ if (req->r_err || req->r_got_result)
goto out;
if (req->r_timeout &&
@@ -1609,7 +1678,7 @@ out:
return err;
finish:
- req->r_reply = ERR_PTR(err);
+ req->r_err = err;
complete_request(mdsc, req);
goto out;
}
@@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
/*
* Wake up threads with requests pending for @mds, so that they can
- * resubmit their requests to a possibly different mds. If @all is set,
- * wake up if their requests has been forwarded to @mds, too.
+ * resubmit their requests to a possibly different mds.
*/
-static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
+static void kick_requests(struct ceph_mds_client *mdsc, int mds)
{
struct ceph_mds_request *req;
struct rb_node *p;
@@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
__register_request(mdsc, req, dir);
__do_request(mdsc, req);
- /* wait */
- if (!req->r_reply) {
- mutex_unlock(&mdsc->mutex);
- if (req->r_timeout) {
- err = (long)wait_for_completion_interruptible_timeout(
- &req->r_completion, req->r_timeout);
- if (err == 0)
- req->r_reply = ERR_PTR(-EIO);
- else if (err < 0)
- req->r_reply = ERR_PTR(err);
- } else {
- err = wait_for_completion_interruptible(
- &req->r_completion);
- if (err)
- req->r_reply = ERR_PTR(err);
- }
- mutex_lock(&mdsc->mutex);
+ if (req->r_err) {
+ err = req->r_err;
+ __unregister_request(mdsc, req);
+ dout("do_request early error %d\n", err);
+ goto out;
}
- if (IS_ERR(req->r_reply)) {
- err = PTR_ERR(req->r_reply);
- req->r_reply = NULL;
+ /* wait */
+ mutex_unlock(&mdsc->mutex);
+ dout("do_request waiting\n");
+ if (req->r_timeout) {
+ err = (long)wait_for_completion_interruptible_timeout(
+ &req->r_completion, req->r_timeout);
+ if (err == 0)
+ err = -EIO;
+ } else {
+ err = wait_for_completion_interruptible(&req->r_completion);
+ }
+ dout("do_request waited, got %d\n", err);
+ mutex_lock(&mdsc->mutex);
- if (err == -ERESTARTSYS) {
- /* aborted */
- req->r_aborted = true;
+ /* only abort if we didn't race with a real reply */
+ if (req->r_got_result) {
+ err = le32_to_cpu(req->r_reply_info.head->result);
+ } else if (err < 0) {
+ dout("aborted request %lld with %d\n", req->r_tid, err);
- if (req->r_locked_dir &&
- (req->r_op & CEPH_MDS_OP_WRITE)) {
- struct ceph_inode_info *ci =
- ceph_inode(req->r_locked_dir);
+ /*
+ * ensure we aren't running concurrently with
+ * ceph_fill_trace or ceph_readdir_prepopulate, which
+ * rely on locks (dir mutex) held by our caller.
+ */
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = err;
+ req->r_aborted = true;
+ mutex_unlock(&req->r_fill_mutex);
- dout("aborted, clearing I_COMPLETE on %p\n",
- req->r_locked_dir);
- spin_lock(&req->r_locked_dir->i_lock);
- ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
- ci->i_release_count++;
- spin_unlock(&req->r_locked_dir->i_lock);
- }
- } else {
- /* clean up this request */
- __unregister_request(mdsc, req);
- if (!list_empty(&req->r_unsafe_item))
- list_del_init(&req->r_unsafe_item);
- complete(&req->r_safe_completion);
- }
- } else if (req->r_err) {
- err = req->r_err;
+ if (req->r_locked_dir &&
+ (req->r_op & CEPH_MDS_OP_WRITE))
+ ceph_invalidate_dir_request(req);
} else {
- err = le32_to_cpu(req->r_reply_info.head->result);
+ err = req->r_err;
}
- mutex_unlock(&mdsc->mutex);
+out:
+ mutex_unlock(&mdsc->mutex);
dout("do_request %p done, result %d\n", req, err);
return err;
}
/*
+ * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
+ * namespace request.
+ */
+void ceph_invalidate_dir_request(struct ceph_mds_request *req)
+{
+ struct inode *inode = req->r_locked_dir;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
+ spin_lock(&inode->i_lock);
+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ ci->i_release_count++;
+ spin_unlock(&inode->i_lock);
+
+ if (req->r_dentry)
+ ceph_invalidate_dentry_lease(req->r_dentry);
+ if (req->r_old_dentry)
+ ceph_invalidate_dentry_lease(req->r_old_dentry);
+}
+
+/*
* Handle mds reply.
*
* We take the session mutex and parse and process the reply immediately.
@@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_unlock(&mdsc->mutex);
goto out;
}
+ if (req->r_got_safe && !head->safe) {
+ pr_warning("got unsafe after safe on %llu from mds%d\n",
+ tid, mds);
+ mutex_unlock(&mdsc->mutex);
+ goto out;
+ }
result = le32_to_cpu(head->result);
@@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_unlock(&mdsc->mutex);
goto out;
}
- }
-
- BUG_ON(req->r_reply);
-
- if (!head->safe) {
+ } else {
req->r_got_unsafe = true;
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
}
@@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
}
/* insert trace into our cache */
+ mutex_lock(&req->r_fill_mutex);
err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
if (err == 0) {
if (result == 0 && rinfo->dir_nr)
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(&req->r_caps_reservation);
}
+ mutex_unlock(&req->r_fill_mutex);
up_read(&mdsc->snap_rwsem);
out_err:
- if (err) {
- req->r_err = err;
+ mutex_lock(&mdsc->mutex);
+ if (!req->r_aborted) {
+ if (err) {
+ req->r_err = err;
+ } else {
+ req->r_reply = msg;
+ ceph_msg_get(msg);
+ req->r_got_result = true;
+ }
} else {
- req->r_reply = msg;
- ceph_msg_get(msg);
+ dout("reply arrived after request %lld was aborted\n", tid);
}
+ mutex_unlock(&mdsc->mutex);
add_cap_releases(mdsc, req->r_session, -1);
mutex_unlock(&session->s_mutex);
@@ -1984,6 +2077,8 @@ static void handle_session(struct ceph_mds_session *session,
switch (op) {
case CEPH_SESSION_OPEN:
+ if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+ pr_info("mds%d reconnect success\n", session->s_mds);
session->s_state = CEPH_MDS_SESSION_OPEN;
renewed_caps(mdsc, session, 0);
wake = 1;
@@ -1997,10 +2092,12 @@ static void handle_session(struct ceph_mds_session *session,
break;
case CEPH_SESSION_CLOSE:
+ if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+ pr_info("mds%d reconnect denied\n", session->s_mds);
remove_session_caps(session);
wake = 1; /* for good measure */
complete(&mdsc->session_close_waiters);
- kick_requests(mdsc, mds, 0); /* cur only */
+ kick_requests(mdsc, mds);
break;
case CEPH_SESSION_STALE:
@@ -2132,54 +2229,44 @@ out:
*
* called with mdsc->mutex held.
*/
-static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
+static void send_mds_reconnect(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
{
- struct ceph_mds_session *session = NULL;
struct ceph_msg *reply;
struct rb_node *p;
+ int mds = session->s_mds;
int err = -ENOMEM;
struct ceph_pagelist *pagelist;
- pr_info("reconnect to recovering mds%d\n", mds);
+ pr_info("mds%d reconnect start\n", mds);
pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
if (!pagelist)
goto fail_nopagelist;
ceph_pagelist_init(pagelist);
- reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
- if (IS_ERR(reply)) {
- err = PTR_ERR(reply);
+ reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
+ if (!reply)
goto fail_nomsg;
- }
-
- /* find session */
- session = __ceph_lookup_mds_session(mdsc, mds);
- mutex_unlock(&mdsc->mutex); /* drop lock for duration */
- if (session) {
- mutex_lock(&session->s_mutex);
+ mutex_lock(&session->s_mutex);
+ session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+ session->s_seq = 0;
- session->s_state = CEPH_MDS_SESSION_RECONNECTING;
- session->s_seq = 0;
+ ceph_con_open(&session->s_con,
+ ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
- ceph_con_open(&session->s_con,
- ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-
- /* replay unsafe requests */
- replay_unsafe_requests(mdsc, session);
- } else {
- dout("no session for mds%d, will send short reconnect\n",
- mds);
- }
+ /* replay unsafe requests */
+ replay_unsafe_requests(mdsc, session);
down_read(&mdsc->snap_rwsem);
- if (!session)
- goto send;
dout("session %p state %s\n", session,
session_state_name(session->s_state));
+ /* drop old cap expires; we're about to reestablish that state */
+ discard_cap_releases(mdsc, session);
+
/* traverse this session's caps */
err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
if (err)
@@ -2208,36 +2295,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
goto fail;
}
-send:
reply->pagelist = pagelist;
reply->hdr.data_len = cpu_to_le32(pagelist->length);
reply->nr_pages = calc_pages_for(0, pagelist->length);
ceph_con_send(&session->s_con, reply);
- session->s_state = CEPH_MDS_SESSION_OPEN;
mutex_unlock(&session->s_mutex);
mutex_lock(&mdsc->mutex);
__wake_requests(mdsc, &session->s_waiting);
mutex_unlock(&mdsc->mutex);
- ceph_put_mds_session(session);
-
up_read(&mdsc->snap_rwsem);
- mutex_lock(&mdsc->mutex);
return;
fail:
ceph_msg_put(reply);
up_read(&mdsc->snap_rwsem);
mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
fail_nomsg:
ceph_pagelist_release(pagelist);
kfree(pagelist);
fail_nopagelist:
pr_err("error %d preparing reconnect for mds%d\n", err, mds);
- mutex_lock(&mdsc->mutex);
return;
}
@@ -2290,7 +2370,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
}
/* kick any requests waiting on the recovering mds */
- kick_requests(mdsc, i, 1);
+ kick_requests(mdsc, i);
} else if (oldstate == newstate) {
continue; /* nothing new with this mds */
}
@@ -2299,22 +2379,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
* send reconnect?
*/
if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
- newstate >= CEPH_MDS_STATE_RECONNECT)
- send_mds_reconnect(mdsc, i);
+ newstate >= CEPH_MDS_STATE_RECONNECT) {
+ mutex_unlock(&mdsc->mutex);
+ send_mds_reconnect(mdsc, s);
+ mutex_lock(&mdsc->mutex);
+ }
/*
- * kick requests on any mds that has gone active.
- *
- * kick requests on cur or forwarder: we may have sent
- * the request to mds1, mds1 told us it forwarded it
- * to mds2, but then we learn mds1 failed and can't be
- * sure it successfully forwarded our request before
- * it died.
+ * kick request on any mds that has gone active.
*/
if (oldstate < CEPH_MDS_STATE_ACTIVE &&
newstate >= CEPH_MDS_STATE_ACTIVE) {
- pr_info("mds%d reconnect completed\n", s->s_mds);
- kick_requests(mdsc, i, 1);
+ if (oldstate != CEPH_MDS_STATE_CREATING &&
+ oldstate != CEPH_MDS_STATE_STARTING)
+ pr_info("mds%d recovery completed\n", s->s_mds);
+ kick_requests(mdsc, i);
ceph_kick_flushing_caps(mdsc, s);
wake_up_session_caps(s, 1);
}
@@ -2457,8 +2536,8 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
dnamelen = dentry->d_name.len;
len += dnamelen;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
- if (IS_ERR(msg))
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
+ if (!msg)
return;
lease = msg->front.iov_base;
lease->action = action;
@@ -2603,7 +2682,9 @@ static void delayed_work(struct work_struct *work)
else
ceph_con_keepalive(&s->s_con);
add_cap_releases(mdsc, s, -1);
- send_cap_releases(mdsc, s);
+ if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+ s->s_state == CEPH_MDS_SESSION_HUNG)
+ send_cap_releases(mdsc, s);
mutex_unlock(&s->s_mutex);
ceph_put_mds_session(s);
@@ -2620,6 +2701,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
mdsc->client = client;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+ if (mdsc->mdsmap == NULL)
+ return -ENOMEM;
+
init_completion(&mdsc->safe_umount_waiters);
init_completion(&mdsc->session_close_waiters);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2645,6 +2729,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
init_waitqueue_head(&mdsc->cap_flushing_wq);
spin_lock_init(&mdsc->dentry_lru_lock);
INIT_LIST_HEAD(&mdsc->dentry_lru);
+
return 0;
}
@@ -2740,6 +2825,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
u64 want_tid, want_flush;
+ if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+ return;
+
dout("sync\n");
mutex_lock(&mdsc->mutex);
want_tid = mdsc->last_tid;
@@ -2922,9 +3010,10 @@ static void con_put(struct ceph_connection *con)
static void peer_reset(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
- pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
- s->s_mds);
+ pr_warning("mds%d closed our session\n", s->s_mds);
+ send_mds_reconnect(mdsc, s);
}
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3120,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&mdsc->client->monc);
}
-const static struct ceph_connection_operations mds_con_ops = {
+static const struct ceph_connection_operations mds_con_ops = {
.get = con_get,
.put = con_put,
.dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f6587..d9936c4f121 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
struct inode *r_target_inode; /* resulting inode */
+ struct mutex r_fill_mutex;
+
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
@@ -213,7 +215,7 @@ struct ceph_mds_request {
struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback;
struct list_head r_unsafe_item; /* per-session unsafe list item */
- bool r_got_unsafe, r_got_safe;
+ bool r_got_unsafe, r_got_safe, r_got_result;
bool r_did_prepopulate;
u32 r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
struct inode *inode,
struct dentry *dn, int mask);
+extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
+
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491..60b74839ebe 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
static void con_work(struct work_struct *);
static void ceph_fault(struct ceph_connection *con);
-const char *ceph_name_type_str(int t)
-{
- switch (t) {
- case CEPH_ENTITY_TYPE_MON: return "mon";
- case CEPH_ENTITY_TYPE_MDS: return "mds";
- case CEPH_ENTITY_TYPE_OSD: return "osd";
- case CEPH_ENTITY_TYPE_CLIENT: return "client";
- case CEPH_ENTITY_TYPE_ADMIN: return "admin";
- default: return "???";
- }
-}
-
/*
* nicely render a sockaddr as a string.
*/
@@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con)
ceph_msg_put(con->out_msg);
con->out_msg = NULL;
}
+ con->out_keepalive_pending = false;
con->in_seq = 0;
con->in_seq_acked = 0;
}
@@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con)
clear_bit(WRITE_PENDING, &con->state);
mutex_lock(&con->mutex);
reset_connection(con);
+ con->peer_global_seq = 0;
cancel_delayed_work(&con->work);
mutex_unlock(&con->mutex);
queue_con(con);
@@ -661,7 +651,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
con->connect_seq, global_seq, proto);
- con->out_connect.features = CEPH_FEATURE_SUPPORTED;
+ con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con)
static int process_connect(struct ceph_connection *con)
{
- u64 sup_feat = CEPH_FEATURE_SUPPORTED;
- u64 req_feat = CEPH_FEATURE_REQUIRED;
+ u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
+ u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
u64 server_feat = le64_to_cpu(con->in_reply.features);
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con)
clear_bit(CONNECTING, &con->state);
con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
con->connect_seq++;
+ con->peer_features = server_feat;
dout("process_connect got READY gseq %d cseq %d (%d)\n",
con->peer_global_seq,
le32_to_cpu(con->in_reply.connect_seq),
@@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con)
con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
if (skip) {
/* skip this message */
- dout("alloc_msg returned NULL, skipping message\n");
+ dout("alloc_msg said skip message\n");
con->in_base_pos = -front_len - middle_len - data_len -
sizeof(m->footer);
con->in_tag = CEPH_MSGR_TAG_READY;
con->in_seq++;
return 0;
}
- if (IS_ERR(con->in_msg)) {
- ret = PTR_ERR(con->in_msg);
- con->in_msg = NULL;
+ if (!con->in_msg) {
con->error_msg =
"error allocating memory for incoming message";
- return ret;
+ return -ENOMEM;
}
m = con->in_msg;
m->front.iov_len = 0; /* haven't read it yet */
@@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con)
/* if first message, set peer_name */
if (con->peer_name.type == 0)
- con->peer_name = msg->hdr.src.name;
+ con->peer_name = msg->hdr.src;
con->in_seq++;
mutex_unlock(&con->mutex);
dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
msg, le64_to_cpu(msg->hdr.seq),
- ENTITY_NAME(msg->hdr.src.name),
+ ENTITY_NAME(msg->hdr.src),
le16_to_cpu(msg->hdr.type),
ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con)
dout("try_write start %p state %lu nref %d\n", con, con->state,
atomic_read(&con->nref));
- mutex_lock(&con->mutex);
more:
dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
@@ -1639,7 +1627,6 @@ do_next:
done:
ret = 0;
out:
- mutex_unlock(&con->mutex);
dout("try_write done on %p\n", con);
return ret;
}
@@ -1651,7 +1638,6 @@ out:
*/
static int try_read(struct ceph_connection *con)
{
- struct ceph_messenger *msgr;
int ret = -1;
if (!con->sock)
@@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con)
return 0;
dout("try_read start on %p\n", con);
- msgr = con->msgr;
-
- mutex_lock(&con->mutex);
more:
dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1741,6 @@ more:
done:
ret = 0;
out:
- mutex_unlock(&con->mutex);
dout("try_read done on %p\n", con);
return ret;
@@ -1830,6 +1812,8 @@ more:
dout("con_work %p start, clearing QUEUED\n", con);
clear_bit(QUEUED, &con->state);
+ mutex_lock(&con->mutex);
+
if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
dout("con_work CLOSED\n");
con_close_socket(con);
@@ -1844,11 +1828,16 @@ more:
if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
try_read(con) < 0 ||
try_write(con) < 0) {
+ mutex_unlock(&con->mutex);
backoff = 1;
ceph_fault(con); /* error/fault path */
+ goto done_unlocked;
}
done:
+ mutex_unlock(&con->mutex);
+
+done_unlocked:
clear_bit(BUSY, &con->state);
dout("con->state=%lu\n", con->state);
if (test_bit(QUEUED, &con->state)) {
@@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
/* the zero page is needed if a request is "canceled" while the message
* is being written over the socket */
- msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
if (!msgr->zero_page) {
kfree(msgr);
return ERR_PTR(-ENOMEM);
@@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
}
/* set src+dst */
- msg->hdr.src.name = con->msgr->inst.name;
- msg->hdr.src.addr = con->msgr->my_enc_addr;
- msg->hdr.orig_src = msg->hdr.src;
+ msg->hdr.src = con->msgr->inst.name;
BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
@@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
* construct a new message with given type, size
* the new msg has a ref count of 1.
*/
-struct ceph_msg *ceph_msg_new(int type, int front_len,
- int page_len, int page_off, struct page **pages)
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
{
struct ceph_msg *m;
- m = kmalloc(sizeof(*m), GFP_NOFS);
+ m = kmalloc(sizeof(*m), flags);
if (m == NULL)
goto out;
kref_init(&m->kref);
@@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
m->hdr.version = 0;
m->hdr.front_len = cpu_to_le32(front_len);
m->hdr.middle_len = 0;
- m->hdr.data_len = cpu_to_le32(page_len);
- m->hdr.data_off = cpu_to_le16(page_off);
+ m->hdr.data_len = 0;
+ m->hdr.data_off = 0;
m->hdr.reserved = 0;
m->footer.front_crc = 0;
m->footer.middle_crc = 0;
@@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
/* front */
if (front_len) {
if (front_len > PAGE_CACHE_SIZE) {
- m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
+ m->front.iov_base = __vmalloc(front_len, flags,
PAGE_KERNEL);
m->front_is_vmalloc = true;
} else {
- m->front.iov_base = kmalloc(front_len, GFP_NOFS);
+ m->front.iov_base = kmalloc(front_len, flags);
}
if (m->front.iov_base == NULL) {
pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
m->middle = NULL;
/* data */
- m->nr_pages = calc_pages_for(page_off, page_len);
- m->pages = pages;
+ m->nr_pages = 0;
+ m->pages = NULL;
m->pagelist = NULL;
- dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
- m->nr_pages);
+ dout("ceph_msg_new %p front %d\n", m, front_len);
return m;
out2:
ceph_msg_put(m);
out:
- pr_err("msg_new can't create type %d len %d\n", type, front_len);
- return ERR_PTR(-ENOMEM);
+ pr_err("msg_new can't create type %d front %d\n", type, front_len);
+ return NULL;
}
/*
@@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
mutex_unlock(&con->mutex);
msg = con->ops->alloc_msg(con, hdr, skip);
mutex_lock(&con->mutex);
- if (IS_ERR(msg))
- return msg;
-
- if (*skip)
+ if (!msg || *skip)
return NULL;
}
if (!msg) {
*skip = 0;
- msg = ceph_msg_new(type, front_len, 0, 0, NULL);
+ msg = ceph_msg_new(type, front_len, GFP_NOFS);
if (!msg) {
pr_err("unable to allocate msg type %d len %d\n",
type, front_len);
- return ERR_PTR(-ENOMEM);
+ return NULL;
}
}
memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
- if (middle_len) {
+ if (middle_len && !msg->middle) {
ret = ceph_alloc_middle(con, msg);
-
if (ret < 0) {
ceph_msg_put(msg);
- return msg;
+ return NULL;
}
}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc97..00a9430b1ff 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
int *skip);
};
-extern const char *ceph_name_type_str(int t);
-
/* use format string %s%d */
-#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
+#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
struct ceph_messenger {
struct ceph_entity_inst inst; /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
struct ceph_entity_addr peer_addr; /* peer address */
struct ceph_entity_name peer_name; /* peer name */
struct ceph_entity_addr peer_addr_for_me;
+ unsigned peer_features;
u32 connect_seq; /* identify the most recent connection
attempt for this connection, client */
u32 peer_global_seq; /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
struct list_head out_queue;
struct list_head out_sent; /* sending or sent but unacked */
u64 out_seq; /* last message queued for send */
- u64 out_seq_sent; /* last message sent */
bool out_keepalive_pending;
u64 in_seq, in_seq_acked; /* last message received, acked */
@@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
extern void ceph_con_put(struct ceph_connection *con);
-extern struct ceph_msg *ceph_msg_new(int type, int front_len,
- int page_len, int page_off,
- struct page **pages);
+extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
extern void ceph_msg_kfree(struct ceph_msg *m);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca95..f6510a476e7 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
* resend any outstanding requests.
*/
-const static struct ceph_connection_operations mon_con_ops;
+static const struct ceph_connection_operations mon_con_ops;
static int __validate_auth(struct ceph_mon_client *monc);
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
monc->pending_auth = 1;
monc->m_auth->front.iov_len = len;
monc->m_auth->hdr.front_len = cpu_to_le32(len);
+ ceph_con_revoke(monc->con, monc->m_auth);
ceph_msg_get(monc->m_auth); /* keep our ref */
ceph_con_send(monc->con, monc->m_auth);
}
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
monc->want_next_osdmap);
if ((__sub_expired(monc) && !monc->sub_sent) ||
monc->want_next_osdmap == 1) {
- struct ceph_msg *msg;
+ struct ceph_msg *msg = monc->m_subscribe;
struct ceph_mon_subscribe_item *i;
void *p, *end;
- msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
- if (!msg)
- return;
-
p = msg->front.iov_base;
- end = p + msg->front.iov_len;
+ end = p + msg->front_max;
dout("__send_subscribe to 'mdsmap' %u+\n",
(unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- ceph_con_send(monc->con, msg);
+ ceph_con_revoke(monc->con, msg);
+ ceph_con_send(monc->con, ceph_msg_get(msg));
monc->sub_sent = jiffies | 1; /* never 0 */
}
@@ -353,14 +351,14 @@ out:
/*
* statfs
*/
-static struct ceph_mon_statfs_request *__lookup_statfs(
+static struct ceph_mon_generic_request *__lookup_generic_req(
struct ceph_mon_client *monc, u64 tid)
{
- struct ceph_mon_statfs_request *req;
- struct rb_node *n = monc->statfs_request_tree.rb_node;
+ struct ceph_mon_generic_request *req;
+ struct rb_node *n = monc->generic_request_tree.rb_node;
while (n) {
- req = rb_entry(n, struct ceph_mon_statfs_request, node);
+ req = rb_entry(n, struct ceph_mon_generic_request, node);
if (tid < req->tid)
n = n->rb_left;
else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
return NULL;
}
-static void __insert_statfs(struct ceph_mon_client *monc,
- struct ceph_mon_statfs_request *new)
+static void __insert_generic_request(struct ceph_mon_client *monc,
+ struct ceph_mon_generic_request *new)
{
- struct rb_node **p = &monc->statfs_request_tree.rb_node;
+ struct rb_node **p = &monc->generic_request_tree.rb_node;
struct rb_node *parent = NULL;
- struct ceph_mon_statfs_request *req = NULL;
+ struct ceph_mon_generic_request *req = NULL;
while (*p) {
parent = *p;
- req = rb_entry(parent, struct ceph_mon_statfs_request, node);
+ req = rb_entry(parent, struct ceph_mon_generic_request, node);
if (new->tid < req->tid)
p = &(*p)->rb_left;
else if (new->tid > req->tid)
@@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc,
}
rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, &monc->statfs_request_tree);
+ rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+static void release_generic_request(struct kref *kref)
+{
+ struct ceph_mon_generic_request *req =
+ container_of(kref, struct ceph_mon_generic_request, kref);
+
+ if (req->reply)
+ ceph_msg_put(req->reply);
+ if (req->request)
+ ceph_msg_put(req->request);
+}
+
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+ kref_put(&req->kref, release_generic_request);
+}
+
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+ kref_get(&req->kref);
+}
+
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_mon_client *monc = con->private;
+ struct ceph_mon_generic_request *req;
+ u64 tid = le64_to_cpu(hdr->tid);
+ struct ceph_msg *m;
+
+ mutex_lock(&monc->mutex);
+ req = __lookup_generic_req(monc, tid);
+ if (!req) {
+ dout("get_generic_reply %lld dne\n", tid);
+ *skip = 1;
+ m = NULL;
+ } else {
+ dout("get_generic_reply %lld got %p\n", tid, req->reply);
+ m = ceph_msg_get(req->reply);
+ /*
+ * we don't need to track the connection reading into
+ * this reply because we only have one open connection
+ * at a time, ever.
+ */
+ }
+ mutex_unlock(&monc->mutex);
+ return m;
}
static void handle_statfs_reply(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
- struct ceph_mon_statfs_request *req;
+ struct ceph_mon_generic_request *req;
struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
- u64 tid;
+ u64 tid = le64_to_cpu(msg->hdr.tid);
if (msg->front.iov_len != sizeof(*reply))
goto bad;
- tid = le64_to_cpu(msg->hdr.tid);
dout("handle_statfs_reply %p tid %llu\n", msg, tid);
mutex_lock(&monc->mutex);
- req = __lookup_statfs(monc, tid);
+ req = __lookup_generic_req(monc, tid);
if (req) {
- *req->buf = reply->st;
+ *(struct ceph_statfs *)req->buf = reply->st;
req->result = 0;
+ get_generic_request(req);
}
mutex_unlock(&monc->mutex);
- if (req)
+ if (req) {
complete(&req->completion);
+ put_generic_request(req);
+ }
return;
bad:
- pr_err("corrupt statfs reply, no tid\n");
+ pr_err("corrupt generic reply, no tid\n");
ceph_msg_dump(msg);
}
/*
- * (re)send a statfs request
+ * Do a synchronous statfs().
*/
-static int send_statfs(struct ceph_mon_client *monc,
- struct ceph_mon_statfs_request *req)
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
{
- struct ceph_msg *msg;
+ struct ceph_mon_generic_request *req;
struct ceph_mon_statfs *h;
+ int err;
- dout("send_statfs tid %llu\n", req->tid);
- msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
- if (IS_ERR(msg))
- return PTR_ERR(msg);
- req->request = msg;
- msg->hdr.tid = cpu_to_le64(req->tid);
- h = msg->front.iov_base;
+ req = kzalloc(sizeof(*req), GFP_NOFS);
+ if (!req)
+ return -ENOMEM;
+
+ kref_init(&req->kref);
+ req->buf = buf;
+ init_completion(&req->completion);
+
+ err = -ENOMEM;
+ req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
+ if (!req->request)
+ goto out;
+ req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
+ if (!req->reply)
+ goto out;
+
+ /* fill out request */
+ h = req->request->front.iov_base;
h->monhdr.have_version = 0;
h->monhdr.session_mon = cpu_to_le16(-1);
h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid;
- ceph_con_send(monc->con, msg);
- return 0;
-}
-
-/*
- * Do a synchronous statfs().
- */
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-{
- struct ceph_mon_statfs_request req;
- int err;
-
- req.buf = buf;
- init_completion(&req.completion);
-
- /* allocate memory for reply */
- err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
- if (err)
- return err;
/* register request */
mutex_lock(&monc->mutex);
- req.tid = ++monc->last_tid;
- req.last_attempt = jiffies;
- req.delay = BASE_DELAY_INTERVAL;
- __insert_statfs(monc, &req);
- monc->num_statfs_requests++;
+ req->tid = ++monc->last_tid;
+ req->request->hdr.tid = cpu_to_le64(req->tid);
+ __insert_generic_request(monc, req);
+ monc->num_generic_requests++;
mutex_unlock(&monc->mutex);
/* send request and wait */
- err = send_statfs(monc, &req);
- if (!err)
- err = wait_for_completion_interruptible(&req.completion);
+ ceph_con_send(monc->con, ceph_msg_get(req->request));
+ err = wait_for_completion_interruptible(&req->completion);
mutex_lock(&monc->mutex);
- rb_erase(&req.node, &monc->statfs_request_tree);
- monc->num_statfs_requests--;
- ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
+ rb_erase(&req->node, &monc->generic_request_tree);
+ monc->num_generic_requests--;
mutex_unlock(&monc->mutex);
if (!err)
- err = req.result;
+ err = req->result;
+
+out:
+ kref_put(&req->kref, release_generic_request);
return err;
}
/*
* Resend pending statfs requests.
*/
-static void __resend_statfs(struct ceph_mon_client *monc)
+static void __resend_generic_request(struct ceph_mon_client *monc)
{
- struct ceph_mon_statfs_request *req;
+ struct ceph_mon_generic_request *req;
struct rb_node *p;
- for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
- req = rb_entry(p, struct ceph_mon_statfs_request, node);
- send_statfs(monc, req);
+ for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_mon_generic_request, node);
+ ceph_con_revoke(monc->con, req->request);
+ ceph_con_send(monc->con, ceph_msg_get(req->request));
}
}
@@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
- /* msg pools */
- err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
- sizeof(struct ceph_mon_subscribe_ack), 1, false);
- if (err < 0)
+ /* msgs */
+ err = -ENOMEM;
+ monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+ sizeof(struct ceph_mon_subscribe_ack),
+ GFP_NOFS);
+ if (!monc->m_subscribe_ack)
goto out_monmap;
- err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
- sizeof(struct ceph_mon_statfs_reply), 0, false);
- if (err < 0)
- goto out_pool1;
- err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
- if (err < 0)
- goto out_pool2;
-
- monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
+
+ monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
+ if (!monc->m_subscribe)
+ goto out_subscribe_ack;
+
+ monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
+ if (!monc->m_auth_reply)
+ goto out_subscribe;
+
+ monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
monc->pending_auth = 0;
- if (IS_ERR(monc->m_auth)) {
- err = PTR_ERR(monc->m_auth);
- monc->m_auth = NULL;
- goto out_pool3;
- }
+ if (!monc->m_auth)
+ goto out_auth_reply;
monc->cur_mon = -1;
monc->hunting = true;
@@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
monc->sub_sent = 0;
INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
- monc->statfs_request_tree = RB_ROOT;
- monc->num_statfs_requests = 0;
+ monc->generic_request_tree = RB_ROOT;
+ monc->num_generic_requests = 0;
monc->last_tid = 0;
monc->have_mdsmap = 0;
@@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
monc->want_next_osdmap = 1;
return 0;
-out_pool3:
- ceph_msgpool_destroy(&monc->msgpool_auth_reply);
-out_pool2:
- ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
-out_pool1:
- ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+out_auth_reply:
+ ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+ ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+ ceph_msg_put(monc->m_subscribe_ack);
out_monmap:
kfree(monc->monmap);
out:
@@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
ceph_auth_destroy(monc->auth);
ceph_msg_put(monc->m_auth);
- ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
- ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
- ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+ ceph_msg_put(monc->m_auth_reply);
+ ceph_msg_put(monc->m_subscribe);
+ ceph_msg_put(monc->m_subscribe_ack);
kfree(monc->monmap);
}
@@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
monc->client->msgr->inst.name.num = monc->auth->global_id;
__send_subscribe(monc);
- __resend_statfs(monc);
+ __resend_generic_request(monc);
}
mutex_unlock(&monc->mutex);
}
@@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
switch (type) {
case CEPH_MSG_MON_SUBSCRIBE_ACK:
- m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
+ m = ceph_msg_get(monc->m_subscribe_ack);
break;
case CEPH_MSG_STATFS_REPLY:
- m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
- break;
+ return get_generic_reply(con, hdr, skip);
case CEPH_MSG_AUTH_REPLY:
- m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
+ m = ceph_msg_get(monc->m_auth_reply);
break;
case CEPH_MSG_MON_MAP:
case CEPH_MSG_MDS_MAP:
case CEPH_MSG_OSD_MAP:
- m = ceph_msg_new(type, front_len, 0, 0, NULL);
+ m = ceph_msg_new(type, front_len, GFP_NOFS);
break;
}
@@ -826,7 +867,7 @@ out:
mutex_unlock(&monc->mutex);
}
-const static struct ceph_connection_operations mon_con_ops = {
+static const struct ceph_connection_operations mon_con_ops = {
.get = ceph_con_get,
.put = ceph_con_put,
.dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa0..174d794321d 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
#define _FS_CEPH_MON_CLIENT_H
#include <linux/completion.h>
+#include <linux/kref.h>
#include <linux/rbtree.h>
#include "messenger.h"
-#include "msgpool.h"
struct ceph_client;
struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
};
struct ceph_mon_client;
-struct ceph_mon_statfs_request;
+struct ceph_mon_generic_request;
/*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
};
/*
- * statfs() is done a bit differently because we need to get data back
+ * ceph_mon_generic_request is being used for the statfs and poolop requests
+ * which are bening done a bit differently because we need to get data back
* to the caller
*/
-struct ceph_mon_statfs_request {
+struct ceph_mon_generic_request {
+ struct kref kref;
u64 tid;
struct rb_node node;
int result;
- struct ceph_statfs *buf;
+ void *buf;
struct completion completion;
- unsigned long last_attempt, delay; /* jiffies */
struct ceph_msg *request; /* original request */
+ struct ceph_msg *reply; /* and reply */
};
struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
struct delayed_work delayed_work;
struct ceph_auth_client *auth;
- struct ceph_msg *m_auth;
+ struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
int pending_auth;
bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
struct ceph_connection *con;
bool have_fsid;
- /* msg pools */
- struct ceph_msgpool msgpool_subscribe_ack;
- struct ceph_msgpool msgpool_statfs_reply;
- struct ceph_msgpool msgpool_auth_reply;
-
- /* pending statfs requests */
- struct rb_root statfs_request_tree;
- int num_statfs_requests;
+ /* pending generic requests */
+ struct rb_root generic_request_tree;
+ int num_generic_requests;
u64 last_tid;
/* mds/osd map */
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2..dd65a643813 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
#include "msgpool.h"
-/*
- * We use msg pools to preallocate memory for messages we expect to
- * receive over the wire, to avoid getting ourselves into OOM
- * conditions at unexpected times. We take use a few different
- * strategies:
- *
- * - for request/response type interactions, we preallocate the
- * memory needed for the response when we generate the request.
- *
- * - for messages we can receive at any time from the MDS, we preallocate
- * a pool of messages we can re-use.
- *
- * - for writeback, we preallocate some number of messages to use for
- * requests and their replies, so that we always make forward
- * progress.
- *
- * The msgpool behaves like a mempool_t, but keeps preallocated
- * ceph_msgs strung together on a list_head instead of using a pointer
- * vector. This avoids vector reallocation when we adjust the number
- * of preallocated items (which happens frequently).
- */
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
+{
+ struct ceph_msgpool *pool = arg;
+ void *p;
+ p = ceph_msg_new(0, pool->front_len, gfp_mask);
+ if (!p)
+ pr_err("msgpool %s alloc failed\n", pool->name);
+ return p;
+}
-/*
- * Allocate or release as necessary to meet our target pool size.
- */
-static int __fill_msgpool(struct ceph_msgpool *pool)
+static void free_fn(void *element, void *arg)
{
- struct ceph_msg *msg;
-
- while (pool->num < pool->min) {
- dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
- pool->min);
- spin_unlock(&pool->lock);
- msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
- spin_lock(&pool->lock);
- if (IS_ERR(msg))
- return PTR_ERR(msg);
- msg->pool = pool;
- list_add(&msg->list_head, &pool->msgs);
- pool->num++;
- }
- while (pool->num > pool->min) {
- msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
- dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
- pool->min, msg);
- list_del_init(&msg->list_head);
- pool->num--;
- ceph_msg_kfree(msg);
- }
- return 0;
+ ceph_msg_put(element);
}
int ceph_msgpool_init(struct ceph_msgpool *pool,
- int front_len, int min, bool blocking)
+ int front_len, int size, bool blocking, const char *name)
{
- int ret;
-
- dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
- spin_lock_init(&pool->lock);
pool->front_len = front_len;
- INIT_LIST_HEAD(&pool->msgs);
- pool->num = 0;
- pool->min = min;
- pool->blocking = blocking;
- init_waitqueue_head(&pool->wait);
-
- spin_lock(&pool->lock);
- ret = __fill_msgpool(pool);
- spin_unlock(&pool->lock);
- return ret;
+ pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
+ if (!pool->pool)
+ return -ENOMEM;
+ pool->name = name;
+ return 0;
}
void ceph_msgpool_destroy(struct ceph_msgpool *pool)
{
- dout("msgpool_destroy %p\n", pool);
- spin_lock(&pool->lock);
- pool->min = 0;
- __fill_msgpool(pool);
- spin_unlock(&pool->lock);
+ mempool_destroy(pool->pool);
}
-int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+ int front_len)
{
- int ret;
-
- spin_lock(&pool->lock);
- dout("msgpool_resv %p delta %d\n", pool, delta);
- pool->min += delta;
- ret = __fill_msgpool(pool);
- spin_unlock(&pool->lock);
- return ret;
-}
-
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
-{
- wait_queue_t wait;
- struct ceph_msg *msg;
-
- if (front_len && front_len > pool->front_len) {
- pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
- pool, front_len, pool->front_len);
+ if (front_len > pool->front_len) {
+ pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
+ pool->name, front_len, pool->front_len);
WARN_ON(1);
/* try to alloc a fresh message */
- msg = ceph_msg_new(0, front_len, 0, 0, NULL);
- if (!IS_ERR(msg))
- return msg;
- }
-
- if (!front_len)
- front_len = pool->front_len;
-
- if (pool->blocking) {
- /* mempool_t behavior; first try to alloc */
- msg = ceph_msg_new(0, front_len, 0, 0, NULL);
- if (!IS_ERR(msg))
- return msg;
+ return ceph_msg_new(0, front_len, GFP_NOFS);
}
- while (1) {
- spin_lock(&pool->lock);
- if (likely(pool->num)) {
- msg = list_entry(pool->msgs.next, struct ceph_msg,
- list_head);
- list_del_init(&msg->list_head);
- pool->num--;
- dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
- pool->num, pool->min);
- spin_unlock(&pool->lock);
- return msg;
- }
- pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
- pool->min, pool->blocking ? "waiting" : "may fail");
- spin_unlock(&pool->lock);
-
- if (!pool->blocking) {
- WARN_ON(1);
-
- /* maybe we can allocate it now? */
- msg = ceph_msg_new(0, front_len, 0, 0, NULL);
- if (!IS_ERR(msg))
- return msg;
-
- pr_err("msgpool_get %p empty + alloc failed\n", pool);
- return ERR_PTR(-ENOMEM);
- }
-
- init_wait(&wait);
- prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
- schedule();
- finish_wait(&pool->wait, &wait);
- }
+ return mempool_alloc(pool->pool, GFP_NOFS);
}
void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
{
- spin_lock(&pool->lock);
- if (pool->num < pool->min) {
- /* reset msg front_len; user may have changed it */
- msg->front.iov_len = pool->front_len;
- msg->hdr.front_len = cpu_to_le32(pool->front_len);
+ /* reset msg front_len; user may have changed it */
+ msg->front.iov_len = pool->front_len;
+ msg->hdr.front_len = cpu_to_le32(pool->front_len);
- kref_set(&msg->kref, 1); /* retake a single ref */
- list_add(&msg->list_head, &pool->msgs);
- pool->num++;
- dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
- pool->num, pool->min);
- spin_unlock(&pool->lock);
- wake_up(&pool->wait);
- } else {
- dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
- pool->num, pool->min);
- spin_unlock(&pool->lock);
- ceph_msg_kfree(msg);
- }
+ kref_init(&msg->kref); /* retake single ref */
}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd72..a362605f936 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
#ifndef _FS_CEPH_MSGPOOL
#define _FS_CEPH_MSGPOOL
+#include <linux/mempool.h>
#include "messenger.h"
/*
@@ -8,18 +9,15 @@
* avoid unexpected OOM conditions.
*/
struct ceph_msgpool {
- spinlock_t lock;
+ const char *name;
+ mempool_t *pool;
int front_len; /* preallocated payload size */
- struct list_head msgs; /* msgs in the pool; each has 1 ref */
- int num, min; /* cur, min # msgs in the pool */
- bool blocking;
- wait_queue_head_t wait;
};
extern int ceph_msgpool_init(struct ceph_msgpool *pool,
- int front_len, int size, bool blocking);
+ int front_len, int size, bool blocking,
+ const char *name);
extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
int front_len);
extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f..892a0298dfd 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
#define CEPH_ENTITY_TYPE_MDS 0x02
#define CEPH_ENTITY_TYPE_OSD 0x04
#define CEPH_ENTITY_TYPE_CLIENT 0x08
-#define CEPH_ENTITY_TYPE_ADMIN 0x10
#define CEPH_ENTITY_TYPE_AUTH 0x20
#define CEPH_ENTITY_TYPE_ANY 0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
/*
* message header
*/
-struct ceph_msg_header {
+struct ceph_msg_header_old {
__le64 seq; /* message seq# for this session */
__le64 tid; /* transaction id */
__le16 type; /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
__le32 crc; /* header crc32c */
} __attribute__ ((packed));
+struct ceph_msg_header {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 middle_len;/* bytes in middle payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ struct ceph_entity_name src;
+ __le32 reserved;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
#define CEPH_MSG_PRIO_LOW 64
#define CEPH_MSG_PRIO_DEFAULT 127
#define CEPH_MSG_PRIO_HIGH 196
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85..afa7bb3895c 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
#define OSD_OP_FRONT_LEN 4096
#define OSD_OPREPLY_FRONT_LEN 512
-const static struct ceph_connection_operations osd_con_ops;
+static const struct ceph_connection_operations osd_con_ops;
static int __kick_requests(struct ceph_osd_client *osdc,
struct ceph_osd *kickosd);
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
req = kzalloc(sizeof(*req), GFP_NOFS);
}
if (req == NULL)
- return ERR_PTR(-ENOMEM);
+ return NULL;
req->r_osdc = osdc;
req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
else
msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
- OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
- if (IS_ERR(msg)) {
+ OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
+ if (!msg) {
ceph_osdc_put_request(req);
- return ERR_PTR(PTR_ERR(msg));
+ return NULL;
}
req->r_reply = msg;
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
else
- msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
- if (IS_ERR(msg)) {
+ msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
+ if (!msg) {
ceph_osdc_put_request(req);
- return ERR_PTR(PTR_ERR(msg));
+ return NULL;
}
msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work)
* should mark the osd as failed and we should find out about
* it from an updated osd map.
*/
- while (!list_empty(&osdc->req_lru)) {
+ while (timeout && !list_empty(&osdc->req_lru)) {
req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
r_req_lru_item);
@@ -1078,6 +1078,7 @@ done:
if (newmap)
kick_requests(osdc, NULL);
up_read(&osdc->map_sem);
+ wake_up(&osdc->client->auth_wq);
return;
bad:
@@ -1087,45 +1088,6 @@ bad:
return;
}
-
-/*
- * A read request prepares specific pages that data is to be read into.
- * When a message is being read off the wire, we call prepare_pages to
- * find those pages.
- * 0 = success, -1 failure.
- */
-static int __prepare_pages(struct ceph_connection *con,
- struct ceph_msg_header *hdr,
- struct ceph_osd_request *req,
- u64 tid,
- struct ceph_msg *m)
-{
- struct ceph_osd *osd = con->private;
- struct ceph_osd_client *osdc;
- int ret = -1;
- int data_len = le32_to_cpu(hdr->data_len);
- unsigned data_off = le16_to_cpu(hdr->data_off);
-
- int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-
- if (!osd)
- return -1;
-
- osdc = osd->o_osdc;
-
- dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
- tid, req->r_num_pages, want);
- if (unlikely(req->r_num_pages < want))
- goto out;
- m->pages = req->r_pages;
- m->nr_pages = req->r_num_pages;
- ret = 0; /* success */
-out:
- BUG_ON(ret < 0 || m->nr_pages < want);
-
- return ret;
-}
-
/*
* Register request, send initial attempt.
*/
@@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
if (!osdc->req_mempool)
goto out;
- err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
+ err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+ "osd_op");
if (err < 0)
goto out_mempool;
err = ceph_msgpool_init(&osdc->msgpool_op_reply,
- OSD_OPREPLY_FRONT_LEN, 10, true);
+ OSD_OPREPLY_FRONT_LEN, 10, true,
+ "osd_op_reply");
if (err < 0)
goto out_msgpool;
return 0;
@@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
NULL, 0, truncate_seq, truncate_size, NULL,
false, 1);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ if (!req)
+ return -ENOMEM;
/* it may be a short read due to an object boundary */
req->r_pages = pages;
@@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
snapc, do_sync,
truncate_seq, truncate_size, mtime,
nofail, 1);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ if (!req)
+ return -ENOMEM;
/* it may be a short write due to an object boundary */
req->r_pages = pages;
@@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
}
/*
- * lookup and return message for incoming reply
+ * lookup and return message for incoming reply. set up reply message
+ * pages.
*/
static struct ceph_msg *get_reply(struct ceph_connection *con,
struct ceph_msg_header *hdr,
@@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
int front = le32_to_cpu(hdr->front_len);
int data_len = le32_to_cpu(hdr->data_len);
u64 tid;
- int err;
tid = le64_to_cpu(hdr->tid);
mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
req->r_reply, req->r_con_filling_msg);
ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
ceph_con_put(req->r_con_filling_msg);
+ req->r_con_filling_msg = NULL;
}
if (front > req->r_reply->front.iov_len) {
pr_warning("get_reply front %d > preallocated %d\n",
front, (int)req->r_reply->front.iov_len);
- m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
- if (IS_ERR(m))
+ m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
+ if (!m)
goto out;
ceph_msg_put(req->r_reply);
req->r_reply = m;
@@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
m = ceph_msg_get(req->r_reply);
if (data_len > 0) {
- err = __prepare_pages(con, hdr, req, tid, m);
- if (err < 0) {
+ unsigned data_off = le16_to_cpu(hdr->data_off);
+ int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+ if (unlikely(req->r_num_pages < want)) {
+ pr_warning("tid %lld reply %d > expected %d pages\n",
+ tid, want, m->nr_pages);
*skip = 1;
ceph_msg_put(m);
- m = ERR_PTR(err);
+ m = NULL;
+ goto out;
}
+ m->pages = req->r_pages;
+ m->nr_pages = req->r_num_pages;
}
*skip = 0;
req->r_con_filling_msg = ceph_con_get(con);
@@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
switch (type) {
case CEPH_MSG_OSD_MAP:
- return ceph_msg_new(type, front, 0, 0, NULL);
+ return ceph_msg_new(type, front, GFP_NOFS);
case CEPH_MSG_OSD_OPREPLY:
return get_reply(con, hdr, skip);
default:
@@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&osdc->client->monc);
}
-const static struct ceph_connection_operations osd_con_ops = {
+static const struct ceph_connection_operations osd_con_ops = {
.get = get_osd_con,
.put = put_osd_con,
.dispatch = dispatch,
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745..b6859f47d36 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
{
- struct page *page = alloc_page(GFP_NOFS);
+ struct page *page = __page_cache_alloc(GFP_NOFS);
if (!page)
return -ENOMEM;
pl->room += PAGE_SIZE;
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871..8fcc023056c 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
__le64 snap_seq; /* seq for per-pool snapshot */
__le32 snap_epoch; /* epoch of last snap */
__le32 num_snaps;
- __le32 num_removed_snap_intervals;
- __le64 uid;
+ __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
+ __le64 auid; /* who owns the pg */
} __attribute__ ((packed));
/*
@@ -208,6 +208,7 @@ enum {
/* read */
CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+ CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
/* write */
CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -305,6 +306,22 @@ enum {
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+/* xattr comparison */
+enum {
+ CEPH_OSD_CMPXATTR_OP_NOP = 0,
+ CEPH_OSD_CMPXATTR_OP_EQ = 1,
+ CEPH_OSD_CMPXATTR_OP_NE = 2,
+ CEPH_OSD_CMPXATTR_OP_GT = 3,
+ CEPH_OSD_CMPXATTR_OP_GTE = 4,
+ CEPH_OSD_CMPXATTR_OP_LT = 5,
+ CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+
+enum {
+ CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+ CEPH_OSD_CMPXATTR_MODE_U64 = 2
+};
+
/*
* an individual object operation. each may be accompanied by some data
* payload
@@ -321,6 +338,8 @@ struct ceph_osd_op {
struct {
__le32 name_len;
__le32 value_len;
+ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
+ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
} __attribute__ ((packed)) xattr;
struct {
__u8 class_len;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db7045..c0b26b6badb 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
struct inode *inode = &ci->vfs_inode;
- struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
BUG_ON(capsnap->writing);
capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 110857ba926..7c663d9b9f8 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,14 +8,11 @@
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/parser.h>
-#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/statfs.h>
#include <linux/string.h>
-#include <linux/version.h>
-#include <linux/vmalloc.h>
#include "decode.h"
#include "super.h"
@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
static int ceph_syncfs(struct super_block *sb, int wait)
{
dout("sync_fs %d\n", wait);
- ceph_osdc_sync(&ceph_client(sb)->osdc);
- ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+ ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
+ ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
dout("sync_fs %d done\n", wait);
return 0;
}
+static int default_congestion_kb(void)
+{
+ int congestion_kb;
+
+ /*
+ * Copied from NFS
+ *
+ * congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ * Limit the default to 256M
+ */
+ congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ if (congestion_kb > 256*1024)
+ congestion_kb = 256*1024;
+
+ return congestion_kb;
+}
/**
* ceph_show_options - Show mount options in /proc/mounts
@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
seq_puts(m, ",nocrc");
if (args->flags & CEPH_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir");
+
+ if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+ seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
+ if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+ seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
+ if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+ seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
+ if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+ seq_printf(m, ",osdkeepalivetimeout=%d",
+ args->osd_keepalive_timeout);
+ if (args->wsize)
+ seq_printf(m, ",wsize=%d", args->wsize);
+ if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+ seq_printf(m, ",rsize=%d", args->rsize);
+ if (args->congestion_kb != default_congestion_kb())
+ seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
+ if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+ seq_printf(m, ",caps_wanted_delay_min=%d",
+ args->caps_wanted_delay_min);
+ if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+ seq_printf(m, ",caps_wanted_delay_max=%d",
+ args->caps_wanted_delay_max);
+ if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+ seq_printf(m, ",cap_release_safety=%d",
+ args->cap_release_safety);
+ if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+ seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
+ if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+ seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
seq_printf(m, ",snapdirname=%s", args->snapdir_name);
if (args->name)
@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
inode_init_once(&ci->vfs_inode);
}
-static int default_congestion_kb(void)
-{
- int congestion_kb;
-
- /*
- * Copied from NFS
- *
- * congestion size, scale with available memory.
- *
- * 64MB: 8192k
- * 128MB: 11585k
- * 256MB: 16384k
- * 512MB: 23170k
- * 1GB: 32768k
- * 2GB: 46340k
- * 4GB: 65536k
- * 8GB: 92681k
- * 16GB: 131072k
- *
- * This allows larger machines to have larger/more transfers.
- * Limit the default to 256M
- */
- congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
- if (congestion_kb > 256*1024)
- congestion_kb = 256*1024;
-
- return congestion_kb;
-}
-
static int __init init_caches(void)
{
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -308,7 +333,9 @@ enum {
Opt_osd_idle_ttl,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
+ Opt_cap_release_safety,
Opt_readdir_max_entries,
+ Opt_readdir_max_bytes,
Opt_congestion_kb,
Opt_last_int,
/* int args above */
@@ -339,7 +366,9 @@ static match_table_t arg_tokens = {
{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+ {Opt_cap_release_safety, "cap_release_safety=%d"},
{Opt_readdir_max_entries, "readdir_max_entries=%d"},
+ {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
{Opt_congestion_kb, "write_congestion_kb=%d"},
/* int args above */
{Opt_snapdirname, "snapdirname=%s"},
@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
- args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
- args->max_readdir = 1024;
+ args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
+ args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+ args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
args->congestion_kb = default_congestion_kb();
/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
case Opt_readdir_max_entries:
args->max_readdir = intval;
break;
+ case Opt_readdir_max_bytes:
+ args->max_readdir_bytes = intval;
+ break;
case Opt_congestion_kb:
args->congestion_kb = intval;
break;
@@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
/*
* true if we have the mon map (and have thus joined the cluster)
*/
-static int have_mon_map(struct ceph_client *client)
+static int have_mon_and_osd_map(struct ceph_client *client)
{
- return client->monc.monmap && client->monc.monmap->epoch;
+ return client->monc.monmap && client->monc.monmap->epoch &&
+ client->osdc.osdmap && client->osdc.osdmap->epoch;
}
/*
@@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
if (err < 0)
goto out;
- while (!have_mon_map(client)) {
+ while (!have_mon_and_osd_map(client)) {
err = -EIO;
if (timeout && time_after_eq(jiffies, started + timeout))
goto out;
@@ -770,8 +804,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
/* wait */
dout("mount waiting for mon_map\n");
err = wait_event_interruptible_timeout(client->auth_wq,
- have_mon_map(client) || (client->auth_err < 0),
- timeout);
+ have_mon_and_osd_map(client) || (client->auth_err < 0),
+ timeout);
if (err == -EINTR || err == -ERESTARTSYS)
goto out;
if (client->auth_err < 0) {
@@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
/*
* construct our own bdi so we can control readahead, etc.
*/
+static atomic_long_t bdi_seq = ATOMIC_INIT(0);
+
static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
{
int err;
@@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
client->backing_dev_info.ra_pages =
(client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
- err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
+ err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+ atomic_long_inc_return(&bdi_seq));
if (!err)
sb->s_bdi = &client->backing_dev_info;
return err;
@@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
goto out;
}
- if (ceph_client(sb) != client) {
+ if (ceph_sb_to_client(sb) != client) {
ceph_destroy_client(client);
- client = ceph_client(sb);
+ client = ceph_sb_to_client(sb);
dout("get_sb got existing client %p\n", client);
} else {
dout("get_sb using new client %p\n", client);
@@ -952,8 +989,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
out_splat:
ceph_mdsc_close_sessions(&client->mdsc);
- up_write(&sb->s_umount);
- deactivate_super(sb);
+ deactivate_locked_super(sb);
goto out_final;
out:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87..3725c9ee9d0 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -52,24 +52,25 @@
struct ceph_mount_args {
int sb_flags;
+ int flags;
+ struct ceph_fsid fsid;
+ struct ceph_entity_addr my_addr;
int num_mon;
struct ceph_entity_addr *mon_addr;
- int flags;
int mount_timeout;
int osd_idle_ttl;
- int caps_wanted_delay_min, caps_wanted_delay_max;
- struct ceph_fsid fsid;
- struct ceph_entity_addr my_addr;
- int wsize;
- int rsize; /* max readahead */
- int max_readdir; /* max readdir size */
- int congestion_kb; /* max readdir size */
int osd_timeout;
int osd_keepalive_timeout;
+ int wsize;
+ int rsize; /* max readahead */
+ int congestion_kb; /* max writeback in flight */
+ int caps_wanted_delay_min, caps_wanted_delay_max;
+ int cap_release_safety;
+ int max_readdir; /* max readdir result (entires) */
+ int max_readdir_bytes; /* max readdir result (bytes) */
char *snapdir_name; /* default ".snap" */
char *name;
char *secret;
- int cap_release_safety;
};
/*
@@ -80,13 +81,14 @@ struct ceph_mount_args {
#define CEPH_OSD_KEEPALIVE_DEFAULT 5
#define CEPH_OSD_IDLE_TTL_DEFAULT 60
#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
+#define CEPH_MAX_READDIR_DEFAULT 1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
#define CEPH_AUTH_NAME_DEFAULT "guest"
-
/*
* Delay telling the MDS we no longer want caps, in case we reopen
* the file. Delay a minimum amount of time, even if we send a cap
@@ -96,6 +98,7 @@ struct ceph_mount_args {
#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
/* mount state */
enum {
@@ -160,12 +163,6 @@ struct ceph_client {
#endif
};
-static inline struct ceph_client *ceph_client(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-
/*
* File i/o capability. This tracks shared state with the metadata
* server that allows us to cache or writeback attributes or to read
@@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
extern void ceph_dentry_lru_add(struct dentry *dn);
extern void ceph_dentry_lru_touch(struct dentry *dn);
extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
/*
* our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907f..68aeebc6968 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
static bool ceph_is_valid_xattr(const char *name)
{
- return !strncmp(name, XATTR_SECURITY_PREFIX,
+ return !strncmp(name, "ceph.", 5) ||
+ !strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
}
static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
- { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
- { true, "user.ceph.dir.files", ceph_vxattrcb_files},
- { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
- { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
- { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
- { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
- { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
- { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
+ { true, "ceph.dir.entries", ceph_vxattrcb_entries},
+ { true, "ceph.dir.files", ceph_vxattrcb_files},
+ { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
+ { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
+ { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
+ { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
+ { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
+ { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
{ true, NULL, NULL }
};
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
}
static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
- { true, "user.ceph.layout", ceph_vxattrcb_layout},
+ { true, "ceph.layout", ceph_vxattrcb_layout},
{ NULL, NULL }
};
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
ci->i_xattrs.names_size -= xattr->name_len;
ci->i_xattrs.vals_size -= xattr->val_len;
}
- if (!xattr) {
- pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
- &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
- xattr->val);
- return -ENOMEM;
- }
ci->i_xattrs.names_size += name_len;
ci->i_xattrs.vals_size += val_len;
if (val)
@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
ci->i_xattrs.version, ci->i_xattrs.index_version);
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
- (ci->i_xattrs.index_version > ci->i_xattrs.version)) {
+ (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto list_xattr;
} else {
spin_unlock(&inode->i_lock);
@@ -622,7 +617,7 @@ out:
static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
const char *value, size_t size, int flags)
{
- struct ceph_client *client = ceph_client(dentry->d_sb);
+ struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
return -ENOMEM;
err = -ENOMEM;
for (i = 0; i < nr_pages; i++) {
- pages[i] = alloc_page(GFP_NOFS);
+ pages[i] = __page_cache_alloc(GFP_NOFS);
if (!pages[i]) {
nr_pages = i;
goto out;
@@ -779,7 +774,7 @@ out:
static int ceph_send_removexattr(struct dentry *dentry, const char *name)
{
- struct ceph_client *client = ceph_client(dentry->d_sb);
+ struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
struct ceph_mds_client *mdsc = &client->mdsc;
struct inode *inode = dentry->d_inode;
struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 4c813f2cdc5..7196077b168 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -217,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
+ err = vfs_fsync(host_file, datasync);
if ( !err && !datasync ) {
lock_kernel();
err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/dcache.c b/fs/dcache.c
index f1358e5c3a5..d96047b4a63 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -536,7 +536,7 @@ restart:
*/
static void prune_dcache(int count)
{
- struct super_block *sb;
+ struct super_block *sb, *n;
int w_count;
int unused = dentry_stat.nr_unused;
int prune_ratio;
@@ -545,13 +545,14 @@ static void prune_dcache(int count)
if (unused == 0 || count == 0)
return;
spin_lock(&dcache_lock);
-restart:
if (count >= unused)
prune_ratio = 1;
else
prune_ratio = unused / count;
spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
+ list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+ if (list_empty(&sb->s_instances))
+ continue;
if (sb->s_nr_dentry_unused == 0)
continue;
sb->s_count++;
@@ -590,14 +591,10 @@ restart:
}
spin_lock(&sb_lock);
count -= pruned;
- /*
- * restart only when sb is no longer on the list and
- * we have more work to do.
- */
- if (__put_super_and_need_restart(sb) && count > 0) {
- spin_unlock(&sb_lock);
- goto restart;
- }
+ __put_super(sb);
+ /* more work left to do? */
+ if (count <= 0)
+ break;
}
spin_unlock(&sb_lock);
spin_unlock(&dcache_lock);
@@ -1529,6 +1526,7 @@ void d_delete(struct dentry * dentry)
spin_lock(&dentry->d_lock);
isdir = S_ISDIR(dentry->d_inode->i_mode);
if (atomic_read(&dentry->d_count) == 1) {
+ dentry->d_flags &= ~DCACHE_CANT_MOUNT;
dentry_iput(dentry);
fsnotify_nameremove(dentry, isdir);
return;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 0120247b41c..8b3ffd5b523 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -384,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
s->s_flags |= MS_ACTIVE;
}
- simple_set_mnt(mnt, s);
-
memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
error = mknod_ptmx(s);
if (error)
- goto out_dput;
+ goto out_undo_sget;
- return 0;
+ simple_set_mnt(mnt, s);
-out_dput:
- dput(s->s_root); /* undo dget() in simple_set_mnt() */
+ return 0;
out_undo_sget:
deactivate_locked_super(s);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 31f4b0e6d72..83c4f600786 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -12,7 +12,7 @@
/* A global variable is a bit ugly, but it keeps the code simple */
int sysctl_drop_caches;
-static void drop_pagecache_sb(struct super_block *sb)
+static void drop_pagecache_sb(struct super_block *sb, void *unused)
{
struct inode *inode, *toput_inode = NULL;
@@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb)
iput(toput_inode);
}
-static void drop_pagecache(void)
-{
- struct super_block *sb;
-
- spin_lock(&sb_lock);
-restart:
- list_for_each_entry(sb, &super_blocks, s_list) {
- sb->s_count++;
- spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- if (sb->s_root)
- drop_pagecache_sb(sb);
- up_read(&sb->s_umount);
- spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
- }
- spin_unlock(&sb_lock);
-}
-
static void drop_slab(void)
{
int nr_objects;
@@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
proc_dointvec_minmax(table, write, buffer, length, ppos);
if (write) {
if (sysctl_drop_caches & 1)
- drop_pagecache();
+ iterate_supers(drop_pagecache_sb, NULL);
if (sysctl_drop_caches & 2)
drop_slab();
}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index bfc2e0f78f0..0032a9f5a3a 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -731,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
struct page *page_for_lower,
size_t offset_in_page, size_t size);
-int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
- size_t size);
+int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
struct inode *ecryptfs_inode);
int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
pgoff_t page_index,
size_t offset_in_page, size_t size,
struct inode *ecryptfs_inode);
-struct page *ecryptfs_get_locked_page(struct file *file, loff_t index);
+struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
struct user_namespace *user_ns);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e7440a6f5eb..3bdddbcc785 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -276,9 +276,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
static int
ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
- return vfs_fsync(ecryptfs_file_to_lower(file),
- ecryptfs_dentry_to_lower(dentry),
- datasync);
+ return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
}
static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e2d4418affa..65dee2f336a 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -142,19 +142,10 @@ out:
static int grow_file(struct dentry *ecryptfs_dentry)
{
struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
- struct file fake_file;
- struct ecryptfs_file_info tmp_file_info;
char zero_virt[] = { 0x00 };
int rc = 0;
- memset(&fake_file, 0, sizeof(fake_file));
- fake_file.f_path.dentry = ecryptfs_dentry;
- memset(&tmp_file_info, 0, sizeof(tmp_file_info));
- ecryptfs_set_file_private(&fake_file, &tmp_file_info);
- ecryptfs_set_file_lower(
- &fake_file,
- ecryptfs_inode_to_private(ecryptfs_inode)->lower_file);
- rc = ecryptfs_write(&fake_file, zero_virt, 0, 1);
+ rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
i_size_write(ecryptfs_inode, 0);
rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
@@ -784,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
{
int rc = 0;
struct inode *inode = dentry->d_inode;
- struct dentry *lower_dentry;
- struct file fake_ecryptfs_file;
struct ecryptfs_crypt_stat *crypt_stat;
loff_t i_size = i_size_read(inode);
loff_t lower_size_before_truncate;
@@ -796,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
goto out;
}
crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
- /* Set up a fake ecryptfs file, this is used to interface with
- * the file in the underlying filesystem so that the
- * truncation has an effect there as well. */
- memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
- fake_ecryptfs_file.f_path.dentry = dentry;
- /* Released at out_free: label */
- ecryptfs_set_file_private(&fake_ecryptfs_file,
- kmem_cache_alloc(ecryptfs_file_info_cache,
- GFP_KERNEL));
- if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
- rc = -ENOMEM;
- goto out;
- }
- lower_dentry = ecryptfs_dentry_to_lower(dentry);
- ecryptfs_set_file_lower(
- &fake_ecryptfs_file,
- ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
/* Switch on growing or shrinking file */
if (ia->ia_size > i_size) {
char zero[] = { 0x00 };
@@ -822,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
* this triggers code that will fill in 0's throughout
* the intermediate portion of the previous end of the
* file and the new and of the file */
- rc = ecryptfs_write(&fake_ecryptfs_file, zero,
+ rc = ecryptfs_write(inode, zero,
(ia->ia_size - 1), 1);
} else { /* ia->ia_size < i_size_read(inode) */
/* We're chopping off all the pages down to the page
@@ -835,10 +807,10 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = vmtruncate(inode, ia->ia_size);
if (rc)
- goto out_free;
+ goto out;
lower_ia->ia_size = ia->ia_size;
lower_ia->ia_valid |= ATTR_SIZE;
- goto out_free;
+ goto out;
}
if (num_zeros) {
char *zeros_virt;
@@ -846,16 +818,16 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
if (!zeros_virt) {
rc = -ENOMEM;
- goto out_free;
+ goto out;
}
- rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt,
+ rc = ecryptfs_write(inode, zeros_virt,
ia->ia_size, num_zeros);
kfree(zeros_virt);
if (rc) {
printk(KERN_ERR "Error attempting to zero out "
"the remainder of the end page on "
"reducing truncate; rc = [%d]\n", rc);
- goto out_free;
+ goto out;
}
}
vmtruncate(inode, ia->ia_size);
@@ -864,7 +836,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
printk(KERN_ERR "Problem with "
"ecryptfs_write_inode_size_to_metadata; "
"rc = [%d]\n", rc);
- goto out_free;
+ goto out;
}
/* We are reducing the size of the ecryptfs file, and need to
* know if we need to reduce the size of the lower file. */
@@ -878,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
} else
lower_ia->ia_valid &= ~ATTR_SIZE;
}
-out_free:
- if (ecryptfs_file_to_private(&fake_ecryptfs_file))
- kmem_cache_free(ecryptfs_file_info_cache,
- ecryptfs_file_to_private(&fake_ecryptfs_file));
out:
return rc;
}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 760983d0f25..cbd4e18adb2 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -281,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat(
*
* Returns zero on success; non-zero on error
*/
-static int ecryptfs_parse_options(struct super_block *sb, char *options)
+static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
{
char *p;
int rc = 0;
@@ -293,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
int fn_cipher_key_bytes;
int fn_cipher_key_bytes_set = 0;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
- &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
+ &sbi->mount_crypt_stat;
substring_t args[MAX_OPT_ARGS];
int token;
char *sig_src;
@@ -483,68 +483,7 @@ out:
}
struct kmem_cache *ecryptfs_sb_info_cache;
-
-/**
- * ecryptfs_fill_super
- * @sb: The ecryptfs super block
- * @raw_data: The options passed to mount
- * @silent: Not used but required by function prototype
- *
- * Sets up what we can of the sb, rest is done in ecryptfs_read_super
- *
- * Returns zero on success; non-zero otherwise
- */
-static int
-ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
-{
- struct ecryptfs_sb_info *esi;
- int rc = 0;
-
- /* Released in ecryptfs_put_super() */
- ecryptfs_set_superblock_private(sb,
- kmem_cache_zalloc(ecryptfs_sb_info_cache,
- GFP_KERNEL));
- esi = ecryptfs_superblock_to_private(sb);
- if (!esi) {
- ecryptfs_printk(KERN_WARNING, "Out of memory\n");
- rc = -ENOMEM;
- goto out;
- }
-
- rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
- if (rc)
- goto out;
-
- sb->s_bdi = &esi->bdi;
- sb->s_op = &ecryptfs_sops;
- /* Released through deactivate_super(sb) from get_sb_nodev */
- sb->s_root = d_alloc(NULL, &(const struct qstr) {
- .hash = 0,.name = "/",.len = 1});
- if (!sb->s_root) {
- ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
- rc = -ENOMEM;
- goto out;
- }
- sb->s_root->d_op = &ecryptfs_dops;
- sb->s_root->d_sb = sb;
- sb->s_root->d_parent = sb->s_root;
- /* Released in d_release when dput(sb->s_root) is called */
- /* through deactivate_super(sb) from get_sb_nodev() */
- ecryptfs_set_dentry_private(sb->s_root,
- kmem_cache_zalloc(ecryptfs_dentry_info_cache,
- GFP_KERNEL));
- if (!ecryptfs_dentry_to_private(sb->s_root)) {
- ecryptfs_printk(KERN_ERR,
- "dentry_info_cache alloc failed\n");
- rc = -ENOMEM;
- goto out;
- }
- rc = 0;
-out:
- /* Should be able to rely on deactivate_super called from
- * get_sb_nodev */
- return rc;
-}
+static struct file_system_type ecryptfs_fs_type;
/**
* ecryptfs_read_super
@@ -565,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
goto out;
}
+ if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
+ rc = -EINVAL;
+ printk(KERN_ERR "Mount on filesystem of type "
+ "eCryptfs explicitly disallowed due to "
+ "known incompatibilities\n");
+ goto out_free;
+ }
ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
sb->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -588,11 +534,8 @@ out:
* @dev_name: The path to mount over
* @raw_data: The options passed into the kernel
*
- * The whole ecryptfs_get_sb process is broken into 4 functions:
+ * The whole ecryptfs_get_sb process is broken into 3 functions:
* ecryptfs_parse_options(): handle options passed to ecryptfs, if any
- * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
- * with as much information as it can before needing
- * the lower filesystem.
* ecryptfs_read_super(): this accesses the lower filesystem and uses
* ecryptfs_interpose to perform most of the linking
* ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
@@ -601,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
const char *dev_name, void *raw_data,
struct vfsmount *mnt)
{
+ struct super_block *s;
+ struct ecryptfs_sb_info *sbi;
+ struct ecryptfs_dentry_info *root_info;
+ const char *err = "Getting sb failed";
int rc;
- struct super_block *sb;
- rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt);
- if (rc < 0) {
- printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc);
+ sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
+ if (!sbi) {
+ rc = -ENOMEM;
goto out;
}
- sb = mnt->mnt_sb;
- rc = ecryptfs_parse_options(sb, raw_data);
+
+ rc = ecryptfs_parse_options(sbi, raw_data);
if (rc) {
- printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc);
- goto out_abort;
+ err = "Error parsing options";
+ goto out;
+ }
+
+ s = sget(fs_type, NULL, set_anon_super, NULL);
+ if (IS_ERR(s)) {
+ rc = PTR_ERR(s);
+ goto out;
}
- rc = ecryptfs_read_super(sb, dev_name);
+
+ s->s_flags = flags;
+ rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
if (rc) {
- printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc);
- goto out_abort;
+ deactivate_locked_super(s);
+ goto out;
}
- goto out;
-out_abort:
- dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */
- deactivate_locked_super(sb);
+
+ ecryptfs_set_superblock_private(s, sbi);
+ s->s_bdi = &sbi->bdi;
+
+ /* ->kill_sb() will take care of sbi after that point */
+ sbi = NULL;
+ s->s_op = &ecryptfs_sops;
+
+ rc = -ENOMEM;
+ s->s_root = d_alloc(NULL, &(const struct qstr) {
+ .hash = 0,.name = "/",.len = 1});
+ if (!s->s_root) {
+ deactivate_locked_super(s);
+ goto out;
+ }
+ s->s_root->d_op = &ecryptfs_dops;
+ s->s_root->d_sb = s;
+ s->s_root->d_parent = s->s_root;
+
+ root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
+ if (!root_info) {
+ deactivate_locked_super(s);
+ goto out;
+ }
+ /* ->kill_sb() will take care of root_info */
+ ecryptfs_set_dentry_private(s->s_root, root_info);
+ s->s_flags |= MS_ACTIVE;
+ rc = ecryptfs_read_super(s, dev_name);
+ if (rc) {
+ deactivate_locked_super(s);
+ err = "Reading sb failed";
+ goto out;
+ }
+ simple_set_mnt(mnt, s);
+ return 0;
+
out:
+ if (sbi) {
+ ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
+ kmem_cache_free(ecryptfs_sb_info_cache, sbi);
+ }
+ printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
return rc;
}
@@ -633,11 +624,16 @@ out:
* @sb: The ecryptfs super block
*
* Used to bring the superblock down and free the private data.
- * Private data is free'd in ecryptfs_put_super()
*/
static void ecryptfs_kill_block_super(struct super_block *sb)
{
- generic_shutdown_super(sb);
+ struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
+ kill_anon_super(sb);
+ if (!sb_info)
+ return;
+ ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
+ bdi_destroy(&sb_info->bdi);
+ kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
}
static struct file_system_type ecryptfs_fs_type = {
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2ee9a3a7b68..b1d82756544 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -44,17 +44,9 @@
* Returns locked and up-to-date page (if ok), with increased
* refcnt.
*/
-struct page *ecryptfs_get_locked_page(struct file *file, loff_t index)
+struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
{
- struct dentry *dentry;
- struct inode *inode;
- struct address_space *mapping;
- struct page *page;
-
- dentry = file->f_path.dentry;
- inode = dentry->d_inode;
- mapping = inode->i_mapping;
- page = read_mapping_page(mapping, index, (void *)file);
+ struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
if (!IS_ERR(page))
lock_page(page);
return page;
@@ -198,7 +190,7 @@ out:
static int ecryptfs_readpage(struct file *file, struct page *page)
{
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
+ &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
int rc = 0;
if (!crypt_stat
@@ -300,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file,
if (!PageUptodate(page)) {
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(
- file->f_path.dentry->d_inode)->crypt_stat;
+ &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
|| (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
@@ -487,7 +478,7 @@ static int ecryptfs_write_end(struct file *file,
unsigned to = from + copied;
struct inode *ecryptfs_inode = mapping->host;
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat;
+ &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
int rc;
if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0cc4fafd655..db184ef15d3 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
/**
* ecryptfs_write
- * @ecryptfs_file: The eCryptfs file into which to write
+ * @ecryptfs_inode: The eCryptfs file into which to write
* @data: Virtual address where data to write is located
* @offset: Offset in the eCryptfs file at which to begin writing the
* data from @data
@@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
*
* Returns zero on success; non-zero otherwise
*/
-int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
+int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
size_t size)
{
struct page *ecryptfs_page;
struct ecryptfs_crypt_stat *crypt_stat;
- struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
char *ecryptfs_page_virt;
loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
loff_t data_offset = 0;
@@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
if (num_bytes > total_remaining_zeros)
num_bytes = total_remaining_zeros;
}
- ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file,
+ ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
ecryptfs_page_idx);
if (IS_ERR(ecryptfs_page)) {
rc = PTR_ERR(ecryptfs_page);
@@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
int ecryptfs_read(char *data, loff_t offset, size_t size,
struct file *ecryptfs_file)
{
+ struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
struct page *ecryptfs_page;
char *ecryptfs_page_virt;
- loff_t ecryptfs_file_size =
- i_size_read(ecryptfs_file->f_dentry->d_inode);
+ loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
loff_t data_offset = 0;
loff_t pos;
int rc = 0;
@@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
if (num_bytes > total_remaining_bytes)
num_bytes = total_remaining_bytes;
- ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file,
+ ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
ecryptfs_page_idx);
if (IS_ERR(ecryptfs_page)) {
rc = PTR_ERR(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0c0ae491d23..0435886e4a9 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -109,27 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
}
/**
- * ecryptfs_put_super
- * @sb: Pointer to the ecryptfs super block
- *
- * Final actions when unmounting a file system.
- * This will handle deallocation and release of our private data.
- */
-static void ecryptfs_put_super(struct super_block *sb)
-{
- struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
-
- lock_kernel();
-
- ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
- bdi_destroy(&sb_info->bdi);
- kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
- ecryptfs_set_superblock_private(sb, NULL);
-
- unlock_kernel();
-}
-
-/**
* ecryptfs_statfs
* @sb: The ecryptfs super block
* @buf: The struct kstatfs to fill in with stats
@@ -203,7 +182,6 @@ const struct super_operations ecryptfs_sops = {
.alloc_inode = ecryptfs_alloc_inode,
.destroy_inode = ecryptfs_destroy_inode,
.drop_inode = generic_delete_inode,
- .put_super = ecryptfs_put_super,
.statfs = ecryptfs_statfs,
.remount_fs = NULL,
.clear_inode = ecryptfs_clear_inode,
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 76d2a79ef93..d7c6afa7975 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1123,16 +1123,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
sbi = sb->s_fs_info;
sb->s_dirt = 1;
- inode->i_uid = current->cred->fsuid;
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else {
- inode->i_gid = current->cred->fsgid;
- }
- inode->i_mode = mode;
-
+ inode_init_owner(inode, dir, mode);
inode->i_ino = sbi->s_nextid++;
inode->i_blkbits = EXOFS_BLKSHIFT;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a99e54318c3..ca7e2a0ed98 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -420,7 +420,7 @@ release_and_out:
return error;
}
-struct xattr_handler ext2_xattr_acl_access_handler = {
+const struct xattr_handler ext2_xattr_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = ext2_xattr_list_acl_access,
@@ -428,7 +428,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = {
.set = ext2_xattr_set_acl,
};
-struct xattr_handler ext2_xattr_acl_default_handler = {
+const struct xattr_handler ext2_xattr_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = ext2_xattr_list_acl_default,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index f0c5286f934..938dbc739d0 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -549,16 +549,12 @@ got:
sb->s_dirt = 1;
mark_buffer_dirty(bh2);
- inode->i_uid = current_fsuid();
- if (test_opt (sb, GRPID))
+ if (test_opt(sb, GRPID)) {
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
inode->i_gid = dir->i_gid;
- else if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
} else
- inode->i_gid = current_fsgid();
- inode->i_mode = mode;
+ inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
inode->i_blocks = 0;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 3b96045a00c..7c3915780b1 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
static struct mb_cache *ext2_xattr_cache;
-static struct xattr_handler *ext2_xattr_handler_map[] = {
+static const struct xattr_handler *ext2_xattr_handler_map[] = {
[EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
[EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler,
@@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = {
#endif
};
-struct xattr_handler *ext2_xattr_handlers[] = {
+const struct xattr_handler *ext2_xattr_handlers[] = {
&ext2_xattr_user_handler,
&ext2_xattr_trusted_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = {
NULL
};
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
ext2_xattr_handler(int name_index)
{
- struct xattr_handler *handler = NULL;
+ const struct xattr_handler *handler = NULL;
if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
handler = ext2_xattr_handler_map[name_index];
@@ -298,7 +298,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
/* list the attribute names */
for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
entry = EXT2_XATTR_NEXT(entry)) {
- struct xattr_handler *handler =
+ const struct xattr_handler *handler =
ext2_xattr_handler(entry->e_name_index);
if (handler) {
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index bf8175b2ced..a1a1c218461 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -55,11 +55,11 @@ struct ext2_xattr_entry {
# ifdef CONFIG_EXT2_FS_XATTR
-extern struct xattr_handler ext2_xattr_user_handler;
-extern struct xattr_handler ext2_xattr_trusted_handler;
-extern struct xattr_handler ext2_xattr_acl_access_handler;
-extern struct xattr_handler ext2_xattr_acl_default_handler;
-extern struct xattr_handler ext2_xattr_security_handler;
+extern const struct xattr_handler ext2_xattr_user_handler;
+extern const struct xattr_handler ext2_xattr_trusted_handler;
+extern const struct xattr_handler ext2_xattr_acl_access_handler;
+extern const struct xattr_handler ext2_xattr_acl_default_handler;
+extern const struct xattr_handler ext2_xattr_security_handler;
extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
@@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *);
extern int init_ext2_xattr(void);
extern void exit_ext2_xattr(void);
-extern struct xattr_handler *ext2_xattr_handlers[];
+extern const struct xattr_handler *ext2_xattr_handlers[];
# else /* CONFIG_EXT2_FS_XATTR */
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index b118c6383c6..3004e15d5da 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -67,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir)
return err;
}
-struct xattr_handler ext2_xattr_security_handler = {
+const struct xattr_handler ext2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext2_xattr_security_list,
.get = ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2a26d71f477..667e46a8d62 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
value, size, flags);
}
-struct xattr_handler ext2_xattr_trusted_handler = {
+const struct xattr_handler ext2_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ext2_xattr_trusted_list,
.get = ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 3f6caf3684b..099d20f4716 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-struct xattr_handler ext2_xattr_user_handler = {
+const struct xattr_handler ext2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ext2_xattr_user_list,
.get = ext2_xattr_user_get,
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 82ba3415866..01552abbca3 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -456,7 +456,7 @@ release_and_out:
return error;
}
-struct xattr_handler ext3_xattr_acl_access_handler = {
+const struct xattr_handler ext3_xattr_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = ext3_xattr_list_acl_access,
@@ -464,7 +464,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = {
.set = ext3_xattr_set_acl,
};
-struct xattr_handler ext3_xattr_acl_default_handler = {
+const struct xattr_handler ext3_xattr_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = ext3_xattr_list_acl_default,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 26289e8f416..fcf7487734b 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -90,6 +90,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
* storage
*/
if (needs_barrier)
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
return ret;
}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 0d0e97ed3ff..498021eb88f 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -538,16 +538,13 @@ got:
if (S_ISDIR(mode))
percpu_counter_inc(&sbi->s_dirs_counter);
- inode->i_uid = current_fsuid();
- if (test_opt (sb, GRPID))
- inode->i_gid = dir->i_gid;
- else if (dir->i_mode & S_ISGID) {
+
+ if (test_opt(sb, GRPID)) {
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
} else
- inode->i_gid = current_fsgid();
- inode->i_mode = mode;
+ inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
/* This is the optimal IO size (for stat), not the fs block size */
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 534a94c3a93..71fb8d65e54 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer,
static struct mb_cache *ext3_xattr_cache;
-static struct xattr_handler *ext3_xattr_handler_map[] = {
+static const struct xattr_handler *ext3_xattr_handler_map[] = {
[EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
#ifdef CONFIG_EXT3_FS_POSIX_ACL
[EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
@@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
#endif
};
-struct xattr_handler *ext3_xattr_handlers[] = {
+const struct xattr_handler *ext3_xattr_handlers[] = {
&ext3_xattr_user_handler,
&ext3_xattr_trusted_handler,
#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
NULL
};
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
ext3_xattr_handler(int name_index)
{
- struct xattr_handler *handler = NULL;
+ const struct xattr_handler *handler = NULL;
if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
handler = ext3_xattr_handler_map[name_index];
@@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
size_t rest = buffer_size;
for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
- struct xattr_handler *handler =
+ const struct xattr_handler *handler =
ext3_xattr_handler(entry->e_name_index);
if (handler) {
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 148a4dfc82a..377fe720116 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -58,11 +58,11 @@ struct ext3_xattr_entry {
# ifdef CONFIG_EXT3_FS_XATTR
-extern struct xattr_handler ext3_xattr_user_handler;
-extern struct xattr_handler ext3_xattr_trusted_handler;
-extern struct xattr_handler ext3_xattr_acl_access_handler;
-extern struct xattr_handler ext3_xattr_acl_default_handler;
-extern struct xattr_handler ext3_xattr_security_handler;
+extern const struct xattr_handler ext3_xattr_user_handler;
+extern const struct xattr_handler ext3_xattr_trusted_handler;
+extern const struct xattr_handler ext3_xattr_acl_access_handler;
+extern const struct xattr_handler ext3_xattr_acl_default_handler;
+extern const struct xattr_handler ext3_xattr_security_handler;
extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
@@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *);
extern int init_ext3_xattr(void);
extern void exit_ext3_xattr(void);
-extern struct xattr_handler *ext3_xattr_handlers[];
+extern const struct xattr_handler *ext3_xattr_handlers[];
# else /* CONFIG_EXT3_FS_XATTR */
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3af91f476df..03a99bfc59f 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -69,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
return err;
}
-struct xattr_handler ext3_xattr_security_handler = {
+const struct xattr_handler ext3_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext3_xattr_security_list,
.get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index e5562845ed9..dc8edda9ffe 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
value, size, flags);
}
-struct xattr_handler ext3_xattr_trusted_handler = {
+const struct xattr_handler ext3_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ext3_xattr_trusted_list,
.get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 3bcfe9ee0a6..7a321974d58 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-struct xattr_handler ext3_xattr_user_handler = {
+const struct xattr_handler ext3_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ext3_xattr_user_list,
.get = ext3_xattr_user_get,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8a2a29d35a6..feaf498feaa 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -454,7 +454,7 @@ release_and_out:
return error;
}
-struct xattr_handler ext4_xattr_acl_access_handler = {
+const struct xattr_handler ext4_xattr_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = ext4_xattr_list_acl_access,
@@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = {
.set = ext4_xattr_set_acl,
};
-struct xattr_handler ext4_xattr_acl_default_handler = {
+const struct xattr_handler ext4_xattr_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = ext4_xattr_list_acl_default,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1c..ef3d980e67c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (ext4_should_writeback_data(inode) &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+ NULL, BLKDEV_IFL_WAIT);
jbd2_log_wait_commit(journal, commit_tid);
} else if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 57f6eef6ccd..1a0e183a2f0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -979,16 +979,12 @@ got:
atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
}
- inode->i_uid = current_fsuid();
- if (test_opt(sb, GRPID))
+ if (test_opt(sb, GRPID)) {
+ inode->i_mode = mode;
+ inode->i_uid = current_fsuid();
inode->i_gid = dir->i_gid;
- else if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
} else
- inode->i_gid = current_fsgid();
- inode->i_mode = mode;
+ inode_init_owner(inode, dir, mode);
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b4c5aa8489d..2de0e951508 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer,
static struct mb_cache *ext4_xattr_cache;
-static struct xattr_handler *ext4_xattr_handler_map[] = {
+static const struct xattr_handler *ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
@@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
#endif
};
-struct xattr_handler *ext4_xattr_handlers[] = {
+const struct xattr_handler *ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = {
NULL
};
-static inline struct xattr_handler *
+static inline const struct xattr_handler *
ext4_xattr_handler(int name_index)
{
- struct xattr_handler *handler = NULL;
+ const struct xattr_handler *handler = NULL;
if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
handler = ext4_xattr_handler_map[name_index];
@@ -332,7 +332,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
size_t rest = buffer_size;
for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
- struct xattr_handler *handler =
+ const struct xattr_handler *handler =
ext4_xattr_handler(entry->e_name_index);
if (handler) {
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 8ede88b18c2..518e96e4390 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -65,11 +65,11 @@ struct ext4_xattr_entry {
# ifdef CONFIG_EXT4_FS_XATTR
-extern struct xattr_handler ext4_xattr_user_handler;
-extern struct xattr_handler ext4_xattr_trusted_handler;
-extern struct xattr_handler ext4_xattr_acl_access_handler;
-extern struct xattr_handler ext4_xattr_acl_default_handler;
-extern struct xattr_handler ext4_xattr_security_handler;
+extern const struct xattr_handler ext4_xattr_user_handler;
+extern const struct xattr_handler ext4_xattr_trusted_handler;
+extern const struct xattr_handler ext4_xattr_acl_access_handler;
+extern const struct xattr_handler ext4_xattr_acl_default_handler;
+extern const struct xattr_handler ext4_xattr_security_handler;
extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
@@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
extern int init_ext4_xattr(void);
extern void exit_ext4_xattr(void);
-extern struct xattr_handler *ext4_xattr_handlers[];
+extern const struct xattr_handler *ext4_xattr_handlers[];
# else /* CONFIG_EXT4_FS_XATTR */
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 8b145e98df0..9b21268e121 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -69,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
return err;
}
-struct xattr_handler ext4_xattr_security_handler = {
+const struct xattr_handler ext4_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext4_xattr_security_list,
.get = ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 15b50edc658..37e6ebca2cc 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-struct xattr_handler ext4_xattr_trusted_handler = {
+const struct xattr_handler ext4_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ext4_xattr_trusted_list,
.get = ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c4ce05746ce..98c375352d0 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-struct xattr_handler ext4_xattr_user_handler = {
+const struct xattr_handler ext4_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ext4_xattr_user_list,
.get = ext4_xattr_user_get,
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 0a140741b39..f74d270ba15 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
#include <linux/dnotify.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <linux/pipe_fs_i.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
@@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_NOTIFY:
err = fcntl_dirnotify(fd, filp, arg);
break;
+ case F_SETPIPE_SZ:
+ case F_GETPIPE_SZ:
+ err = pipe_fcntl(filp, cmd, arg);
+ break;
default:
break;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4b37f7cea4d..5c4161f1fd9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -42,9 +42,10 @@ struct wb_writeback_args {
long nr_pages;
struct super_block *sb;
enum writeback_sync_modes sync_mode;
- int for_kupdate:1;
- int range_cyclic:1;
- int for_background:1;
+ unsigned int for_kupdate:1;
+ unsigned int range_cyclic:1;
+ unsigned int for_background:1;
+ unsigned int sb_pinned:1;
};
/*
@@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
}
static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
- struct wb_writeback_args *args)
+ struct wb_writeback_args *args,
+ int wait)
{
struct bdi_work *work;
@@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
if (work) {
bdi_work_init(work, args);
bdi_queue_work(bdi, work);
+ if (wait)
+ bdi_wait_on_work_clear(work);
} else {
struct bdi_writeback *wb = &bdi->wb;
@@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
.sync_mode = WB_SYNC_ALL,
.nr_pages = LONG_MAX,
.range_cyclic = 0,
+ /*
+ * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
+ * lets make it explicitly clear.
+ */
+ .sb_pinned = 1,
};
struct bdi_work work;
@@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
* @bdi: the backing device to write from
* @sb: write inodes from this super_block
* @nr_pages: the number of pages to write
+ * @sb_locked: caller already holds sb umount sem.
*
* Description:
* This does WB_SYNC_NONE opportunistic writeback. The IO is only
* started when this function returns, we make no guarentees on
- * completion. Caller need not hold sb s_umount semaphore.
+ * completion. Caller specifies whether sb umount sem is held already or not.
*
*/
void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
- long nr_pages)
+ long nr_pages, int sb_locked)
{
struct wb_writeback_args args = {
.sb = sb,
.sync_mode = WB_SYNC_NONE,
.nr_pages = nr_pages,
.range_cyclic = 1,
+ .sb_pinned = sb_locked,
};
/*
@@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
args.for_background = 1;
}
- bdi_alloc_queue_work(bdi, &args);
+ bdi_alloc_queue_work(bdi, &args, sb_locked);
}
/*
@@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
BUG_ON(inode->i_state & I_SYNC);
- /* Set I_SYNC, reset I_DIRTY */
- dirty = inode->i_state & I_DIRTY;
+ /* Set I_SYNC, reset I_DIRTY_PAGES */
inode->i_state |= I_SYNC;
- inode->i_state &= ~I_DIRTY;
-
+ inode->i_state &= ~I_DIRTY_PAGES;
spin_unlock(&inode_lock);
ret = do_writepages(mapping, wbc);
@@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
ret = err;
}
+ /*
+ * Some filesystems may redirty the inode during the writeback
+ * due to delalloc, clear dirty metadata flags right before
+ * write_inode()
+ */
+ spin_lock(&inode_lock);
+ dirty = inode->i_state & I_DIRTY;
+ inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ spin_unlock(&inode_lock);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
int err = write_inode(inode, wbc);
@@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
/*
* Caller must already hold the ref for this
*/
- if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
WARN_ON(!rwsem_is_locked(&sb->s_umount));
return SB_NOT_PINNED;
}
@@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb,
.for_kupdate = args->for_kupdate,
.for_background = args->for_background,
.range_cyclic = args->range_cyclic,
+ .sb_pinned = args->sb_pinned,
};
unsigned long oldest_jif;
long wrote = 0;
@@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
unsigned long expired;
long nr_pages;
+ /*
+ * When set to zero, disable periodic writeback
+ */
+ if (!dirty_writeback_interval)
+ return 0;
+
expired = wb->last_old_flush +
msecs_to_jiffies(dirty_writeback_interval * 10);
if (time_before(jiffies, expired))
@@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
while ((work = get_next_work_item(bdi, wb)) != NULL) {
struct wb_writeback_args args = work->args;
+ int post_clear;
/*
* Override sync mode, in case we must wait for completion
@@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
if (force_wait)
work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
+ post_clear = WB_SYNC_ALL || args.sb_pinned;
+
/*
* If this isn't a data integrity operation, just notify
* that we have seen this work and we are now starting it.
*/
- if (args.sync_mode == WB_SYNC_NONE)
+ if (!post_clear)
wb_clear_pending(wb, work);
wrote += wb_writeback(wb, &args);
@@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
* This is a data integrity writeback, so only do the
* notification when we have completed the work.
*/
- if (args.sync_mode == WB_SYNC_ALL)
+ if (post_clear)
wb_clear_pending(wb, work);
}
@@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
break;
}
- wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
- schedule_timeout_interruptible(wait_jiffies);
+ if (dirty_writeback_interval) {
+ wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+ schedule_timeout_interruptible(wait_jiffies);
+ } else {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty_careful(&wb->bdi->work_list) &&
+ !kthread_should_stop())
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+
try_to_freeze();
}
@@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
if (!bdi_has_dirty_io(bdi))
continue;
- bdi_alloc_queue_work(bdi, &args);
+ bdi_alloc_queue_work(bdi, &args, 0);
}
rcu_read_unlock();
@@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb)
iput(old_inode);
}
+static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
+{
+ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ long nr_to_write;
+
+ nr_to_write = nr_dirty + nr_unstable +
+ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+ bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
+}
+
/**
* writeback_inodes_sb - writeback dirty inodes from given super_block
* @sb: the superblock
@@ -1194,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb)
*/
void writeback_inodes_sb(struct super_block *sb)
{
- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
- long nr_to_write;
-
- nr_to_write = nr_dirty + nr_unstable +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
- bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+ __writeback_inodes_sb(sb, 0);
}
EXPORT_SYMBOL(writeback_inodes_sb);
/**
+ * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Like writeback_inodes_sb(), except the caller already holds the
+ * sb umount sem.
+ */
+void writeback_inodes_sb_locked(struct super_block *sb)
+{
+ __writeback_inodes_sb(sb, 1);
+}
+
+/**
* writeback_inodes_sb_if_idle - start writeback if none underway
* @sb: the superblock
*
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index fe5df545765..99800e56415 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -201,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-struct xattr_handler generic_acl_access_handler = {
+const struct xattr_handler generic_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = generic_acl_list,
@@ -209,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = {
.set = generic_acl_set,
};
-struct xattr_handler generic_acl_default_handler = {
+const struct xattr_handler generic_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = generic_acl_list,
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 87ee309d4c2..9fb76b0a048 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -335,7 +335,7 @@ out:
return error;
}
-struct xattr_handler gfs2_xattr_system_handler = {
+const struct xattr_handler gfs2_xattr_system_handler = {
.prefix = XATTR_SYSTEM_PREFIX,
.flags = GFS2_EATYPE_SYS,
.get = gfs2_xattr_system_get,
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 9306a2e6620..b522b0cb39e 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -19,6 +19,6 @@
extern int gfs2_check_acl(struct inode *inode, int mask);
extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
-extern struct xattr_handler gfs2_xattr_system_handler;
+extern const struct xattr_handler gfs2_xattr_system_handler;
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8bce73ed4d8..117fa4171f6 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
if ((start + nr_sects) != blk) {
rv = blkdev_issue_discard(bdev, start,
nr_sects, GFP_NOFS,
- DISCARD_FL_BARRIER);
+ BLKDEV_IFL_WAIT |
+ BLKDEV_IFL_BARRIER);
if (rv)
goto fail;
nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
}
if (nr_sects) {
rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
- DISCARD_FL_BARRIER);
+ BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
if (rv)
goto fail;
}
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 3df60f2d84e..a0464680af0 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
extern const struct export_operations gfs2_export_ops;
extern const struct super_operations gfs2_super_ops;
extern const struct dentry_operations gfs2_dops;
-extern struct xattr_handler *gfs2_xattr_handlers[];
+extern const struct xattr_handler *gfs2_xattr_handlers[];
#endif /* __SUPER_DOT_H__ */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index c2ebdf2c01d..82f93da00d1 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1535,21 +1535,21 @@ out_alloc:
return error;
}
-static struct xattr_handler gfs2_xattr_user_handler = {
+static const struct xattr_handler gfs2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = GFS2_EATYPE_USR,
.get = gfs2_xattr_get,
.set = gfs2_xattr_set,
};
-static struct xattr_handler gfs2_xattr_security_handler = {
+static const struct xattr_handler gfs2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.flags = GFS2_EATYPE_SECURITY,
.get = gfs2_xattr_get,
.set = gfs2_xattr_set,
};
-struct xattr_handler *gfs2_xattr_handlers[] = {
+const struct xattr_handler *gfs2_xattr_handlers[] = {
&gfs2_xattr_user_handler,
&gfs2_xattr_security_handler,
&gfs2_xattr_system_handler,
diff --git a/fs/inode.c b/fs/inode.c
index 258ec22bb29..2bee20ae3d6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -286,11 +286,9 @@ static void init_once(void *foo)
*/
void __iget(struct inode *inode)
{
- if (atomic_read(&inode->i_count)) {
- atomic_inc(&inode->i_count);
+ if (atomic_inc_return(&inode->i_count) != 1)
return;
- }
- atomic_inc(&inode->i_count);
+
if (!(inode->i_state & (I_DIRTY|I_SYNC)))
list_move(&inode->i_list, &inode_in_use);
inodes_stat.nr_unused--;
@@ -1608,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);
+
+/**
+ * Init uid,gid,mode for new inode according to posix standards
+ * @inode: New inode
+ * @dir: Directory inode
+ * @mode: mode of the new inode
+ */
+void inode_init_owner(struct inode *inode, const struct inode *dir,
+ mode_t mode)
+{
+ inode->i_uid = current_fsuid();
+ if (dir && dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else
+ inode->i_gid = current_fsgid();
+ inode->i_mode = mode;
+}
+EXPORT_SYMBOL(inode_init_owner);
diff --git a/fs/internal.h b/fs/internal.h
index 8a03a5447bd..6b706bc60a6 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -87,6 +87,8 @@ extern struct file *get_empty_filp(void);
* super.c
*/
extern int do_remount_sb(struct super_block *, int, void *, int);
+extern void __put_super(struct super_block *sb);
+extern void put_super(struct super_block *sb);
/*
* open.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7faefb4da93..2d140a71386 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -525,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp)
if (sb->s_op->freeze_fs == NULL)
return -EOPNOTSUPP;
- /* If a blockdevice-backed filesystem isn't specified, return. */
- if (sb->s_bdev == NULL)
- return -EINVAL;
-
/* Freeze */
- sb = freeze_bdev(sb->s_bdev);
- if (IS_ERR(sb))
- return PTR_ERR(sb);
- return 0;
+ return freeze_super(sb);
}
static int ioctl_fsthaw(struct file *filp)
@@ -543,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
- if (sb->s_bdev == NULL)
- return -EINVAL;
-
/* Thaw */
- return thaw_bdev(sb->s_bdev, sb);
+ return thaw_super(sb);
}
/*
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef92..076d1cc44f9 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
*/
if ((journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(journal->j_fs_dev, NULL);
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
if (!(journal->j_flags & JBD2_ABORT))
jbd2_journal_update_superblock(journal, 1);
return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ff..75716d3d2be 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
if (commit_transaction->t_flushed_data_blocks &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(journal->j_fs_dev, NULL);
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
/* Done it all: now write the commit record asynchronously. */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
if (err)
__jbd2_journal_abort_hard(journal);
if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(journal->j_dev, NULL);
+ blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
}
err = journal_finish_inode_data_buffers(journal, commit_transaction);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc3196476..a33aab6b5e6 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -419,7 +419,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
return rc;
}
-struct xattr_handler jffs2_acl_access_xattr_handler = {
+const struct xattr_handler jffs2_acl_access_xattr_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.flags = ACL_TYPE_DEFAULT,
.list = jffs2_acl_access_listxattr,
@@ -427,7 +427,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = {
.set = jffs2_acl_setxattr,
};
-struct xattr_handler jffs2_acl_default_xattr_handler = {
+const struct xattr_handler jffs2_acl_default_xattr_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = jffs2_acl_default_listxattr,
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index f0ba63e3c36..5e42de8d954 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *);
extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
extern int jffs2_init_acl_post(struct inode *);
-extern struct xattr_handler jffs2_acl_access_xattr_handler;
-extern struct xattr_handler jffs2_acl_default_xattr_handler;
+extern const struct xattr_handler jffs2_acl_access_xattr_handler;
+extern const struct xattr_handler jffs2_acl_default_xattr_handler;
#else
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index eaccee05858..239f51216a6 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
return retlen;
}
-struct xattr_handler jffs2_security_xattr_handler = {
+const struct xattr_handler jffs2_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = jffs2_security_listxattr,
.set = jffs2_security_setxattr,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9e75c62c85d..a2d58c96f1b 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
* do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
* is an implementation of setxattr handler on jffs2.
* -------------------------------------------------- */
-struct xattr_handler *jffs2_xattr_handlers[] = {
+const struct xattr_handler *jffs2_xattr_handlers[] = {
&jffs2_user_xattr_handler,
#ifdef CONFIG_JFFS2_FS_SECURITY
&jffs2_security_xattr_handler,
@@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = {
NULL
};
-static struct xattr_handler *xprefix_to_handler(int xprefix) {
- struct xattr_handler *ret;
+static const struct xattr_handler *xprefix_to_handler(int xprefix) {
+ const struct xattr_handler *ret;
switch (xprefix) {
case JFFS2_XPREFIX_USER:
@@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct jffs2_inode_cache *ic = f->inocache;
struct jffs2_xattr_ref *ref, **pref;
struct jffs2_xattr_datum *xd;
- struct xattr_handler *xhandle;
+ const struct xattr_handler *xhandle;
ssize_t len, rc;
int retry = 0;
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 6e3b5ddfb7a..cf4f5759b42 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
const char *buffer, size_t size, int flags);
-extern struct xattr_handler *jffs2_xattr_handlers[];
-extern struct xattr_handler jffs2_user_xattr_handler;
-extern struct xattr_handler jffs2_trusted_xattr_handler;
+extern const struct xattr_handler *jffs2_xattr_handlers[];
+extern const struct xattr_handler jffs2_user_xattr_handler;
+extern const struct xattr_handler jffs2_trusted_xattr_handler;
extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
#define jffs2_getxattr generic_getxattr
@@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
#ifdef CONFIG_JFFS2_FS_SECURITY
extern int jffs2_init_security(struct inode *inode, struct inode *dir);
-extern struct xattr_handler jffs2_security_xattr_handler;
+extern const struct xattr_handler jffs2_security_xattr_handler;
#else
#define jffs2_init_security(inode,dir) (0)
#endif /* CONFIG_JFFS2_FS_SECURITY */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 3e5a5e356e0..1c868194c50 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
return retlen;
}
-struct xattr_handler jffs2_trusted_xattr_handler = {
+const struct xattr_handler jffs2_trusted_xattr_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = jffs2_trusted_listxattr,
.set = jffs2_trusted_setxattr,
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8544af67dff..916b5c96603 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
return retlen;
}
-struct xattr_handler jffs2_user_xattr_handler = {
+const struct xattr_handler jffs2_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
.list = jffs2_user_listxattr,
.set = jffs2_user_setxattr,
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 829921b6776..2686531e235 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
goto fail_unlock;
}
- inode->i_uid = current_fsuid();
- if (parent->i_mode & S_ISGID) {
- inode->i_gid = parent->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current_fsgid();
-
+ inode_init_owner(inode, parent, mode);
/*
* New inodes need to save sane values on disk when
* uid & gid mount options are used
@@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
if (rc)
goto fail_drop;
- inode->i_mode = mode;
/* inherit flags from parent */
jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
@@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
if (S_ISLNK(mode))
jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
}
- jfs_inode->mode2 |= mode;
+ jfs_inode->mode2 |= inode->i_mode;
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 755a92e8daa..f602e230e16 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -358,14 +358,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode)
inode->i_mode = mode;
logfs_set_ino_generation(sb, inode);
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- }
-
+ inode_init_owner(inode, dir, mode);
logfs_inode_setops(inode);
insert_inode_hash(inode);
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 6ac693faae4..482779fe4e7 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode)
clear_inode(inode); /* clear in-memory copy */
}
-struct inode * minix_new_inode(const struct inode * dir, int * error)
+struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
{
struct super_block *sb = dir->i_sb;
struct minix_sb_info *sbi = minix_sb(sb);
@@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
iput(inode);
return NULL;
}
- inode->i_uid = current_fsuid();
- inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
+ inode_init_owner(inode, dir, mode);
inode->i_ino = j;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
inode->i_blocks = 0;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 9dcf95b4211..111f34ee9e3 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,7 +46,7 @@ struct minix_sb_info {
extern struct inode *minix_iget(struct super_block *, unsigned long);
extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
-extern struct inode * minix_new_inode(const struct inode * dir, int * error);
+extern struct inode * minix_new_inode(const struct inode *, int, int *);
extern void minix_free_inode(struct inode * inode);
extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
extern int minix_new_block(struct inode * inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 32b131cd612..e20ee85955d 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
if (!old_valid_dev(rdev))
return -EINVAL;
- inode = minix_new_inode(dir, &error);
+ inode = minix_new_inode(dir, mode, &error);
if (inode) {
- inode->i_mode = mode;
minix_set_inode(inode, rdev);
mark_inode_dirty(inode);
error = add_nondir(dentry, inode);
@@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry,
if (i > dir->i_sb->s_blocksize)
goto out;
- inode = minix_new_inode(dir, &err);
+ inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
if (!inode)
goto out;
- inode->i_mode = S_IFLNK | 0777;
minix_set_inode(inode, 0);
err = page_symlink(inode, symname, i);
if (err)
@@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
inode_inc_link_count(dir);
- inode = minix_new_inode(dir, &err);
+ inode = minix_new_inode(dir, mode, &err);
if (!inode)
goto out_dir;
- inode->i_mode = S_IFDIR | mode;
- if (dir->i_mode & S_ISGID)
- inode->i_mode |= S_ISGID;
minix_set_inode(inode, 0);
inode_inc_link_count(inode);
diff --git a/fs/namei.c b/fs/namei.c
index b86b96fe1dc..48e1f60520e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -523,9 +523,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
{
dput(nd->path.dentry);
- if (nd->path.mnt != path->mnt)
+ if (nd->path.mnt != path->mnt) {
mntput(nd->path.mnt);
- nd->path.mnt = path->mnt;
+ nd->path.mnt = path->mnt;
+ }
nd->path.dentry = path->dentry;
}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7a9ae3254a4..7e26caab2a2 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -44,8 +44,7 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
/* Globals */
-static struct path rec_dir;
-static int rec_dir_init = 0;
+static struct file *rec_file;
static int
nfs4_save_creds(const struct cred **original_creds)
@@ -117,33 +116,28 @@ out_no_tfm:
return status;
}
-static void
-nfsd4_sync_rec_dir(void)
-{
- vfs_fsync(NULL, rec_dir.dentry, 0);
-}
-
int
nfsd4_create_clid_dir(struct nfs4_client *clp)
{
const struct cred *original_cred;
char *dname = clp->cl_recdir;
- struct dentry *dentry;
+ struct dentry *dir, *dentry;
int status;
dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
- if (!rec_dir_init || clp->cl_firststate)
+ if (!rec_file || clp->cl_firststate)
return 0;
status = nfs4_save_creds(&original_cred);
if (status < 0)
return status;
+ dir = rec_file->f_path.dentry;
/* lock the parent */
- mutex_lock(&rec_dir.dentry->d_inode->i_mutex);
+ mutex_lock(&dir->d_inode->i_mutex);
- dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
+ dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
goto out_unlock;
@@ -153,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
goto out_put;
}
- status = mnt_want_write(rec_dir.mnt);
+ status = mnt_want_write(rec_file->f_path.mnt);
if (status)
goto out_put;
- status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
- mnt_drop_write(rec_dir.mnt);
+ status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
+ mnt_drop_write(rec_file->f_path.mnt);
out_put:
dput(dentry);
out_unlock:
- mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+ mutex_unlock(&dir->d_inode->i_mutex);
if (status == 0) {
clp->cl_firststate = 1;
- nfsd4_sync_rec_dir();
+ vfs_fsync(rec_file, 0);
}
nfs4_reset_creds(original_cred);
dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -206,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
struct dentry *dentry;
int status;
- if (!rec_dir_init)
+ if (!rec_file)
return 0;
status = nfs4_save_creds(&original_cred);
if (status < 0)
return status;
- filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
+ filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
current_cred());
status = PTR_ERR(filp);
if (IS_ERR(filp))
@@ -250,13 +244,14 @@ out:
static int
nfsd4_unlink_clid_dir(char *name, int namlen)
{
- struct dentry *dentry;
+ struct dentry *dir, *dentry;
int status;
dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
- mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
- dentry = lookup_one_len(name, rec_dir.dentry, namlen);
+ dir = rec_file->f_path.dentry;
+ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
+ dentry = lookup_one_len(name, dir, namlen);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
goto out_unlock;
@@ -264,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
status = -ENOENT;
if (!dentry->d_inode)
goto out;
- status = vfs_rmdir(rec_dir.dentry->d_inode, dentry);
+ status = vfs_rmdir(dir->d_inode, dentry);
out:
dput(dentry);
out_unlock:
- mutex_unlock(&rec_dir.dentry->d_inode->i_mutex);
+ mutex_unlock(&dir->d_inode->i_mutex);
return status;
}
@@ -278,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
const struct cred *original_cred;
int status;
- if (!rec_dir_init || !clp->cl_firststate)
+ if (!rec_file || !clp->cl_firststate)
return;
- status = mnt_want_write(rec_dir.mnt);
+ status = mnt_want_write(rec_file->f_path.mnt);
if (status)
goto out;
clp->cl_firststate = 0;
@@ -293,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
nfs4_reset_creds(original_cred);
if (status == 0)
- nfsd4_sync_rec_dir();
- mnt_drop_write(rec_dir.mnt);
+ vfs_fsync(rec_file, 0);
+ mnt_drop_write(rec_file->f_path.mnt);
out:
if (status)
printk("NFSD: Failed to remove expired client state directory"
@@ -323,19 +318,19 @@ void
nfsd4_recdir_purge_old(void) {
int status;
- if (!rec_dir_init)
+ if (!rec_file)
return;
- status = mnt_want_write(rec_dir.mnt);
+ status = mnt_want_write(rec_file->f_path.mnt);
if (status)
goto out;
- status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old);
+ status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
if (status == 0)
- nfsd4_sync_rec_dir();
- mnt_drop_write(rec_dir.mnt);
+ vfs_fsync(rec_file, 0);
+ mnt_drop_write(rec_file->f_path.mnt);
out:
if (status)
printk("nfsd4: failed to purge old clients from recovery"
- " directory %s\n", rec_dir.dentry->d_name.name);
+ " directory %s\n", rec_file->f_path.dentry->d_name.name);
}
static int
@@ -355,10 +350,13 @@ int
nfsd4_recdir_load(void) {
int status;
- status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir);
+ if (!rec_file)
+ return 0;
+
+ status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
if (status)
printk("nfsd4: failed loading clients from recovery"
- " directory %s\n", rec_dir.dentry->d_name.name);
+ " directory %s\n", rec_file->f_path.dentry->d_name.name);
return status;
}
@@ -375,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname)
printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
rec_dirname);
- BUG_ON(rec_dir_init);
+ BUG_ON(rec_file);
status = nfs4_save_creds(&original_cred);
if (status < 0) {
@@ -385,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname)
return;
}
- status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
- &rec_dir);
- if (status)
+ rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
+ if (IS_ERR(rec_file)) {
printk("NFSD: unable to find recovery directory %s\n",
rec_dirname);
+ rec_file = NULL;
+ }
- if (!status)
- rec_dir_init = 1;
nfs4_reset_creds(original_cred);
}
void
nfsd4_shutdown_recdir(void)
{
- if (!rec_dir_init)
+ if (!rec_file)
return;
- rec_dir_init = 0;
- path_put(&rec_dir);
+ fput(rec_file);
+ rec_file = NULL;
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23c06f77f4c..ebbf3b6b245 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -999,7 +999,7 @@ static int wait_for_concurrent_writes(struct file *file)
if (inode->i_state & I_DIRTY) {
dprintk("nfsd: write sync %d\n", task_pid_nr(current));
- err = vfs_fsync(file, file->f_path.dentry, 0);
+ err = vfs_fsync(file, 0);
}
last_ino = inode->i_ino;
last_dev = inode->i_sb->s_dev;
@@ -1175,8 +1175,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (err)
goto out;
if (EX_ISSYNC(fhp->fh_export)) {
- int err2 = vfs_fsync_range(file, file->f_path.dentry,
- offset, end, 0);
+ int err2 = vfs_fsync_range(file, offset, end, 0);
if (err2 != -EINVAL)
err = nfserrno(err2);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 5e226d4b41d..39e038ac8fc 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -280,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
/* reference count of i_bh inherits from nilfs_mdt_read_block() */
atomic_inc(&sbi->s_inodes_count);
-
- inode->i_uid = current_fsuid();
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current_fsgid();
-
- inode->i_mode = mode;
+ inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index a756168a21c..8c1097327ab 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -674,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
start * sects_per_block,
nblocks * sects_per_block,
GFP_NOFS,
- DISCARD_FL_BARRIER);
+ BLKDEV_IFL_BARRIER);
if (ret < 0)
return ret;
nblocks = 0;
@@ -684,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, DISCARD_FL_BARRIER);
+ GFP_NOFS, BLKDEV_IFL_BARRIER);
return ret;
}
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 40b1cf914cc..27b75ebc746 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch);
int pin_inotify_watch(struct inotify_watch *watch)
{
struct super_block *sb = watch->inode->i_sb;
- spin_lock(&sb_lock);
- if (sb->s_count >= S_BIAS) {
- atomic_inc(&sb->s_active);
- spin_unlock(&sb_lock);
+ if (atomic_inc_not_zero(&sb->s_active)) {
atomic_inc(&watch->count);
return 1;
}
- spin_unlock(&sb_lock);
return 0;
}
@@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
* done. Cleanup is just deactivate_super(). However, that leaves a messy
* case - what if we *are* racing with umount() and active references to
* superblock can't be acquired anymore? We can bump ->s_count, grab
- * ->s_umount, which will almost certainly wait until the superblock is shut
- * down and the watch in question is pining for fjords. That's fine, but
- * there is a problem - we might have hit the window between ->s_active
- * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
- * is past the point of no return and is heading for shutdown) and the
- * moment when deactivate_super() acquires ->s_umount. We could just do
- * drop_super() yield() and retry, but that's rather antisocial and this
- * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having
- * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
- * that we won't race with inotify_umount_inodes(). So we could grab a
- * reference to watch and do the rest as above, just with drop_super() instead
- * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we
- * could grab ->s_umount. So the watch could've been gone already.
- *
- * That still can be dealt with - we need to save watch->wd, do idr_find()
- * and compare its result with our pointer. If they match, we either have
- * the damn thing still alive or we'd lost not one but two races at once,
- * the watch had been killed and a new one got created with the same ->wd
- * at the same address. That couldn't have happened in inotify_destroy(),
- * but inotify_rm_wd() could run into that. Still, "new one got created"
- * is not a problem - we have every right to kill it or leave it alone,
- * whatever's more convenient.
- *
- * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
- * "grab it and kill it" check. If it's been our original watch, we are
- * fine, if it's a newcomer - nevermind, just pretend that we'd won the
- * race and kill the fscker anyway; we are safe since we know that its
- * superblock won't be going away.
+ * ->s_umount, which will wait until the superblock is shut down and the
+ * watch in question is pining for fjords.
*
* And yes, this is far beyond mere "not very pretty"; so's the entire
* concept of inotify to start with.
@@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
* Called with ih->mutex held, drops it. Possible return values:
* 0 - nothing to do, it has died
* 1 - remove it, drop the reference and deactivate_super()
- * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
- * that variant, since it involved a lot of PITA, but that's the best that
- * could've been done.
*/
static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
{
struct super_block *sb = watch->inode->i_sb;
- s32 wd = watch->wd;
- spin_lock(&sb_lock);
- if (sb->s_count >= S_BIAS) {
- atomic_inc(&sb->s_active);
- spin_unlock(&sb_lock);
+ if (atomic_inc_not_zero(&sb->s_active)) {
get_inotify_watch(watch);
mutex_unlock(&ih->mutex);
return 1; /* the best outcome */
}
+ spin_lock(&sb_lock);
sb->s_count++;
spin_unlock(&sb_lock);
mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
down_read(&sb->s_umount);
- if (likely(!sb->s_root)) {
- /* fs is already shut down; the watch is dead */
- drop_super(sb);
- return 0;
- }
- /* raced with the final deactivate_super() */
- mutex_lock(&ih->mutex);
- if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
- /* the watch is dead */
- mutex_unlock(&ih->mutex);
- drop_super(sb);
- return 0;
- }
- /* still alive or freed and reused with the same sb and wd; kill */
- get_inotify_watch(watch);
- mutex_unlock(&ih->mutex);
- return 2;
+ /* fs is already shut down; the watch is dead */
+ drop_super(sb);
+ return 0;
}
-static void unpin_and_kill(struct inotify_watch *watch, int how)
+static void unpin_and_kill(struct inotify_watch *watch)
{
struct super_block *sb = watch->inode->i_sb;
put_inotify_watch(watch);
- switch (how) {
- case 1:
- deactivate_super(sb);
- break;
- case 2:
- drop_super(sb);
- }
+ deactivate_super(sb);
}
/**
@@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih)
struct list_head *watches;
struct super_block *sb;
struct inode *inode;
- int how;
mutex_lock(&ih->mutex);
watches = &ih->watches;
@@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih)
}
watch = list_first_entry(watches, struct inotify_watch, h_list);
sb = watch->inode->i_sb;
- how = pin_to_kill(ih, watch);
- if (!how)
+ if (!pin_to_kill(ih, watch))
continue;
inode = watch->inode;
@@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih)
mutex_unlock(&ih->mutex);
mutex_unlock(&inode->inotify_mutex);
- unpin_and_kill(watch, how);
+ unpin_and_kill(watch);
}
/* free this handle: the put matching the get in inotify_init() */
@@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
struct inotify_watch *watch;
struct super_block *sb;
struct inode *inode;
- int how;
mutex_lock(&ih->mutex);
watch = idr_find(&ih->idr, wd);
@@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
return -EINVAL;
}
sb = watch->inode->i_sb;
- how = pin_to_kill(ih, watch);
- if (!how)
+ if (!pin_to_kill(ih, watch))
return 0;
inode = watch->inode;
@@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
mutex_unlock(&ih->mutex);
mutex_unlock(&inode->inotify_mutex);
- unpin_and_kill(watch, how);
+ unpin_and_kill(watch);
return 0;
}
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e13fc9e8fcd..da702294d7e 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -489,7 +489,7 @@ cleanup:
return ret;
}
-struct xattr_handler ocfs2_xattr_acl_access_handler = {
+const struct xattr_handler ocfs2_xattr_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.flags = ACL_TYPE_ACCESS,
.list = ocfs2_xattr_list_acl_access,
@@ -497,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = {
.set = ocfs2_xattr_set_acl,
};
-struct xattr_handler ocfs2_xattr_acl_default_handler = {
+const struct xattr_handler ocfs2_xattr_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.list = ocfs2_xattr_list_acl_default,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index db5dd3ed4df..f171b51a74f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
inode->i_nlink = 2;
else
inode->i_nlink = 1;
- inode->i_uid = current_fsuid();
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current_fsgid();
- inode->i_mode = mode;
+ inode_init_owner(inode, dir, mode);
dquot_initialize(inode);
return inode;
}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 98ee6c44102..e97b34842cf 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -97,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = {
.xv.xr_list.l_count = cpu_to_le16(1),
};
-struct xattr_handler *ocfs2_xattr_handlers[] = {
+const struct xattr_handler *ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
&ocfs2_xattr_acl_access_handler,
&ocfs2_xattr_acl_default_handler,
@@ -106,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
NULL
};
-static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
+static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
[OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
= &ocfs2_xattr_acl_access_handler,
@@ -540,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
static inline const char *ocfs2_xattr_prefix(int name_index)
{
- struct xattr_handler *handler = NULL;
+ const struct xattr_handler *handler = NULL;
if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
handler = ocfs2_xattr_handler_map[name_index];
@@ -7213,7 +7213,7 @@ int ocfs2_init_security_set(handle_t *handle,
xattr_ac, data_ac);
}
-struct xattr_handler ocfs2_xattr_security_handler = {
+const struct xattr_handler ocfs2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ocfs2_xattr_security_list,
.get = ocfs2_xattr_security_get,
@@ -7257,7 +7257,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-struct xattr_handler ocfs2_xattr_trusted_handler = {
+const struct xattr_handler ocfs2_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ocfs2_xattr_trusted_list,
.get = ocfs2_xattr_trusted_get,
@@ -7313,7 +7313,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-struct xattr_handler ocfs2_xattr_user_handler = {
+const struct xattr_handler ocfs2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ocfs2_xattr_user_list,
.get = ocfs2_xattr_user_get,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f52..aa64bb37a65 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info {
size_t value_len;
};
-extern struct xattr_handler ocfs2_xattr_user_handler;
-extern struct xattr_handler ocfs2_xattr_trusted_handler;
-extern struct xattr_handler ocfs2_xattr_security_handler;
-extern struct xattr_handler ocfs2_xattr_acl_access_handler;
-extern struct xattr_handler ocfs2_xattr_acl_default_handler;
-extern struct xattr_handler *ocfs2_xattr_handlers[];
+extern const struct xattr_handler ocfs2_xattr_user_handler;
+extern const struct xattr_handler ocfs2_xattr_trusted_handler;
+extern const struct xattr_handler ocfs2_xattr_security_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
+extern const struct xattr_handler *ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index b44bb835e8e..089839a6cc6 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -37,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
goto fail;
inode->i_ino = new_block;
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode_init_owner(inode, NULL, mode);
inode->i_mapping->a_ops = &omfs_aops;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index 74e5cd9f718..5463266db9e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -17,7 +17,6 @@
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
-#include <linux/vfs.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
@@ -33,171 +32,6 @@
#include "internal.h"
-int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- int retval = -ENODEV;
-
- if (dentry) {
- retval = -ENOSYS;
- if (dentry->d_sb->s_op->statfs) {
- memset(buf, 0, sizeof(*buf));
- retval = security_sb_statfs(dentry);
- if (retval)
- return retval;
- retval = dentry->d_sb->s_op->statfs(dentry, buf);
- if (retval == 0 && buf->f_frsize == 0)
- buf->f_frsize = buf->f_bsize;
- }
- }
- return retval;
-}
-
-EXPORT_SYMBOL(vfs_statfs);
-
-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
-{
- struct kstatfs st;
- int retval;
-
- retval = vfs_statfs(dentry, &st);
- if (retval)
- return retval;
-
- if (sizeof(*buf) == sizeof(st))
- memcpy(buf, &st, sizeof(st));
- else {
- if (sizeof buf->f_blocks == 4) {
- if ((st.f_blocks | st.f_bfree | st.f_bavail |
- st.f_bsize | st.f_frsize) &
- 0xffffffff00000000ULL)
- return -EOVERFLOW;
- /*
- * f_files and f_ffree may be -1; it's okay to stuff
- * that into 32 bits
- */
- if (st.f_files != -1 &&
- (st.f_files & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- if (st.f_ffree != -1 &&
- (st.f_ffree & 0xffffffff00000000ULL))
- return -EOVERFLOW;
- }
-
- buf->f_type = st.f_type;
- buf->f_bsize = st.f_bsize;
- buf->f_blocks = st.f_blocks;
- buf->f_bfree = st.f_bfree;
- buf->f_bavail = st.f_bavail;
- buf->f_files = st.f_files;
- buf->f_ffree = st.f_ffree;
- buf->f_fsid = st.f_fsid;
- buf->f_namelen = st.f_namelen;
- buf->f_frsize = st.f_frsize;
- memset(buf->f_spare, 0, sizeof(buf->f_spare));
- }
- return 0;
-}
-
-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
-{
- struct kstatfs st;
- int retval;
-
- retval = vfs_statfs(dentry, &st);
- if (retval)
- return retval;
-
- if (sizeof(*buf) == sizeof(st))
- memcpy(buf, &st, sizeof(st));
- else {
- buf->f_type = st.f_type;
- buf->f_bsize = st.f_bsize;
- buf->f_blocks = st.f_blocks;
- buf->f_bfree = st.f_bfree;
- buf->f_bavail = st.f_bavail;
- buf->f_files = st.f_files;
- buf->f_ffree = st.f_ffree;
- buf->f_fsid = st.f_fsid;
- buf->f_namelen = st.f_namelen;
- buf->f_frsize = st.f_frsize;
- memset(buf->f_spare, 0, sizeof(buf->f_spare));
- }
- return 0;
-}
-
-SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
-{
- struct path path;
- int error;
-
- error = user_path(pathname, &path);
- if (!error) {
- struct statfs tmp;
- error = vfs_statfs_native(path.dentry, &tmp);
- if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
- error = -EFAULT;
- path_put(&path);
- }
- return error;
-}
-
-SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
-{
- struct path path;
- long error;
-
- if (sz != sizeof(*buf))
- return -EINVAL;
- error = user_path(pathname, &path);
- if (!error) {
- struct statfs64 tmp;
- error = vfs_statfs64(path.dentry, &tmp);
- if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
- error = -EFAULT;
- path_put(&path);
- }
- return error;
-}
-
-SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
-{
- struct file * file;
- struct statfs tmp;
- int error;
-
- error = -EBADF;
- file = fget(fd);
- if (!file)
- goto out;
- error = vfs_statfs_native(file->f_path.dentry, &tmp);
- if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
- error = -EFAULT;
- fput(file);
-out:
- return error;
-}
-
-SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
-{
- struct file * file;
- struct statfs64 tmp;
- int error;
-
- if (sz != sizeof(*buf))
- return -EINVAL;
-
- error = -EBADF;
- file = fget(fd);
- if (!file)
- goto out;
- error = vfs_statfs64(file->f_path.dentry, &tmp);
- if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
- error = -EFAULT;
- fput(file);
-out:
- return error;
-}
-
int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
struct file *filp)
{
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0f..6921e7890be 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -70,14 +70,14 @@ struct riscix_record {
#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
- unsigned long first_sect, int slot, unsigned long nr_sects)
+static int riscix_partition(struct parsed_partitions *state,
+ unsigned long first_sect, int slot,
+ unsigned long nr_sects)
{
Sector sect;
struct riscix_record *rr;
- rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect);
+ rr = read_part_sector(state, first_sect, &sect);
if (!rr)
return -1;
@@ -123,9 +123,9 @@ struct linux_part {
#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-linux_partition(struct parsed_partitions *state, struct block_device *bdev,
- unsigned long first_sect, int slot, unsigned long nr_sects)
+static int linux_partition(struct parsed_partitions *state,
+ unsigned long first_sect, int slot,
+ unsigned long nr_sects)
{
Sector sect;
struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
put_partition(state, slot++, first_sect, size);
- linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect);
+ linuxp = read_part_sector(state, first_sect, &sect);
if (!linuxp)
return -1;
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
#endif
#ifdef CONFIG_ACORN_PARTITION_CUMANA
-int
-adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_CUMANA(struct parsed_partitions *state)
{
unsigned long first_sector = 0;
unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
struct adfs_discrecord *dr;
unsigned int nr_sects;
- data = read_dev_sector(bdev, start_blk * 2 + 6, &sect);
+ data = read_part_sector(state, start_blk * 2 + 6, &sect);
if (!data)
return -1;
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
#ifdef CONFIG_ACORN_PARTITION_RISCIX
case PARTITION_RISCIX_SCSI:
/* RISCiX - we don't know how to find the next one. */
- slot = riscix_partition(state, bdev, first_sector,
- slot, nr_sects);
+ slot = riscix_partition(state, first_sector, slot,
+ nr_sects);
break;
#endif
case PARTITION_LINUX:
- slot = linux_partition(state, bdev, first_sector,
- slot, nr_sects);
+ slot = linux_partition(state, first_sector, slot,
+ nr_sects);
break;
}
put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
* hda1 = ADFS partition on first drive.
* hda2 = non-ADFS partition.
*/
-int
-adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ADFS(struct parsed_partitions *state)
{
unsigned long start_sect, nr_sects, sectscyl, heads;
Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
unsigned char id;
int slot = 1;
- data = read_dev_sector(bdev, 6, &sect);
+ data = read_part_sector(state, 6, &sect);
if (!data)
return -1;
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
/*
* Work out start of non-adfs partition.
*/
- nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect;
+ nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
if (start_sect) {
switch (id) {
#ifdef CONFIG_ACORN_PARTITION_RISCIX
case PARTITION_RISCIX_SCSI:
case PARTITION_RISCIX_MFM:
- slot = riscix_partition(state, bdev, start_sect,
- slot, nr_sects);
+ slot = riscix_partition(state, start_sect, slot,
+ nr_sects);
break;
#endif
case PARTITION_LINUX:
- slot = linux_partition(state, bdev, start_sect,
- slot, nr_sects);
+ slot = linux_partition(state, start_sect, slot,
+ nr_sects);
break;
}
}
@@ -308,10 +306,11 @@ struct ics_part {
__le32 size;
};
-static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block)
+static int adfspart_check_ICSLinux(struct parsed_partitions *state,
+ unsigned long block)
{
Sector sect;
- unsigned char *data = read_dev_sector(bdev, block, &sect);
+ unsigned char *data = read_part_sector(state, block, &sect);
int result = 0;
if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
* hda2 = ADFS partition 1 on first drive.
* ..etc..
*/
-int
-adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ICS(struct parsed_partitions *state)
{
const unsigned char *data;
const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
/*
* Try ICS style partitions - sector 0 contains partition info.
*/
- data = read_dev_sector(bdev, 0, &sect);
+ data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
* partition is. We must not make this visible
* to the filesystem.
*/
- if (size > 1 && adfspart_check_ICSLinux(bdev, start)) {
+ if (size > 1 && adfspart_check_ICSLinux(state, start)) {
start += 1;
size -= 1;
}
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
* hda2 = ADFS partition 1 on first drive.
* ..etc..
*/
-int
-adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_POWERTEC(struct parsed_partitions *state)
{
Sector sect;
const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
int slot = 1;
int i;
- data = read_dev_sector(bdev, 0, &sect);
+ data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
* 1. The individual ADFS boot block entries that are placed on the disk.
* 2. The start address of the next entry.
*/
-int
-adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_EESOX(struct parsed_partitions *state)
{
Sector sect;
const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
sector_t start = 0;
int i, slot = 1;
- data = read_dev_sector(bdev, 7, &sect);
+ data = read_part_sector(state, 7, &sect);
if (!data)
return -1;
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
if (i != 0) {
sector_t size;
- size = get_capacity(bdev->bd_disk);
+ size = get_capacity(state->bdev->bd_disk);
put_partition(state, slot++, start, size - start);
printk("\n");
}
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc08..ede82852969 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
* format, and everyone stick to it?
*/
-int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev);
+int adfspart_check_CUMANA(struct parsed_partitions *state);
+int adfspart_check_ADFS(struct parsed_partitions *state);
+int adfspart_check_ICS(struct parsed_partitions *state);
+int adfspart_check_POWERTEC(struct parsed_partitions *state);
+int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f..ba443d4229f 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
return sum;
}
-int
-amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
+int amiga_partition(struct parsed_partitions *state)
{
Sector sect;
unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
for (blk = 0; ; blk++, put_dev_sector(sect)) {
if (blk == RDB_ALLOCATION_LIMIT)
goto rdb_done;
- data = read_dev_sector(bdev, blk, &sect);
+ data = read_part_sector(state, blk, &sect);
if (!data) {
if (warn_no_part)
printk("Dev %s: unable to read RDB block %d\n",
- bdevname(bdev, b), blk);
+ bdevname(state->bdev, b), blk);
res = -1;
goto rdb_done;
}
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
}
printk("Dev %s: RDB in block %d has bad checksum\n",
- bdevname(bdev, b), blk);
+ bdevname(state->bdev, b), blk);
}
/* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
put_dev_sector(sect);
for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
blk *= blksize; /* Read in terms partition table understands */
- data = read_dev_sector(bdev, blk, &sect);
+ data = read_part_sector(state, blk, &sect);
if (!data) {
if (warn_no_part)
printk("Dev %s: unable to read partition block %d\n",
- bdevname(bdev, b), blk);
+ bdevname(state->bdev, b), blk);
res = -1;
goto rdb_done;
}
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d5..d094585cada 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
* fs/partitions/amiga.h
*/
-int amiga_partition(struct parsed_partitions *state, struct block_device *bdev);
+int amiga_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b75..4439ff1b6ce 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
memcmp (s, "RAW", 3) == 0 ;
}
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
+int atari_partition(struct parsed_partitions *state)
{
Sector sect;
struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
#endif
- rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect);
+ rs = read_part_sector(state, 0, &sect);
if (!rs)
return -1;
/* Verify this is an Atari rootsector: */
- hd_size = bdev->bd_inode->i_size >> 9;
+ hd_size = state->bdev->bd_inode->i_size >> 9;
if (!VALID_PARTITION(&rs->part[0], hd_size) &&
!VALID_PARTITION(&rs->part[1], hd_size) &&
!VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
printk(" XGM<");
partsect = extensect = be32_to_cpu(pi->st);
while (1) {
- xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2);
+ xrs = read_part_sector(state, partsect, &sect2);
if (!xrs) {
printk (" block %ld read failed\n", partsect);
put_dev_sector(sect);
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e13..fe2d32a89f3 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
u16 checksum; /* checksum for bootable disks */
} __attribute__((__packed__));
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev);
+int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e238ab23a9e..5dcd4b0c553 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
-static int (*check_part[])(struct parsed_partitions *, struct block_device *) = {
+static int (*check_part[])(struct parsed_partitions *) = {
/*
* Probe partition formats with tables at disk address 0
* that also have an ADFS boot block at 0xdc0.
@@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
struct parsed_partitions *state;
int i, res, err;
- state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
+ state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
if (!state)
return NULL;
+ state->bdev = bdev;
disk_name(hd, 0, state->name);
printk(KERN_INFO " %s:", state->name);
if (isdigit(state->name[strlen(state->name)-1]))
@@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
i = res = err = 0;
while (!res && check_part[i]) {
memset(&state->parts, 0, sizeof(state->parts));
- res = check_part[i++](state, bdev);
+ res = check_part[i++](state);
if (res < 0) {
/* We have hit an I/O error which we don't report now.
* But record it, and let the others do their job.
@@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
}
if (res > 0)
return state;
+ if (state->access_beyond_eod)
+ err = -ENOSPC;
if (err)
/* The partition is unrecognized. So report I/O errors if there were any */
res = err;
@@ -538,12 +541,33 @@ exit:
disk_part_iter_exit(&piter);
}
+static bool disk_unlock_native_capacity(struct gendisk *disk)
+{
+ const struct block_device_operations *bdops = disk->fops;
+
+ if (bdops->unlock_native_capacity &&
+ !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
+ printk(KERN_CONT "enabling native capacity\n");
+ bdops->unlock_native_capacity(disk);
+ disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+ return true;
+ } else {
+ printk(KERN_CONT "truncated\n");
+ return false;
+ }
+}
+
int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
{
+ struct parsed_partitions *state = NULL;
struct disk_part_iter piter;
struct hd_struct *part;
- struct parsed_partitions *state;
int p, highest, res;
+rescan:
+ if (state && !IS_ERR(state)) {
+ kfree(state);
+ state = NULL;
+ }
if (bdev->bd_part_count)
return -EBUSY;
@@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
bdev->bd_invalidated = 0;
if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
return 0;
- if (IS_ERR(state)) /* I/O error reading the partition table */
+ if (IS_ERR(state)) {
+ /*
+ * I/O error reading the partition table. If any
+ * partition code tried to read beyond EOD, retry
+ * after unlocking native capacity.
+ */
+ if (PTR_ERR(state) == -ENOSPC) {
+ printk(KERN_WARNING "%s: partition table beyond EOD, ",
+ disk->disk_name);
+ if (disk_unlock_native_capacity(disk))
+ goto rescan;
+ }
return -EIO;
+ }
+ /*
+ * If any partition code tried to read beyond EOD, try
+ * unlocking native capacity even if partition table is
+ * sucessfully read as we could be missing some partitions.
+ */
+ if (state->access_beyond_eod) {
+ printk(KERN_WARNING
+ "%s: partition table partially beyond EOD, ",
+ disk->disk_name);
+ if (disk_unlock_native_capacity(disk))
+ goto rescan;
+ }
/* tell userspace that the media / partition table may have changed */
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
/* add partitions */
for (p = 1; p < state->limit; p++) {
sector_t size, from;
-try_scan:
+
size = state->parts[p].size;
if (!size)
continue;
@@ -589,30 +637,21 @@ try_scan:
from = state->parts[p].from;
if (from >= get_capacity(disk)) {
printk(KERN_WARNING
- "%s: p%d ignored, start %llu is behind the end of the disk\n",
+ "%s: p%d start %llu is beyond EOD, ",
disk->disk_name, p, (unsigned long long) from);
+ if (disk_unlock_native_capacity(disk))
+ goto rescan;
continue;
}
if (from + size > get_capacity(disk)) {
- const struct block_device_operations *bdops = disk->fops;
- unsigned long long capacity;
-
printk(KERN_WARNING
- "%s: p%d size %llu exceeds device capacity, ",
+ "%s: p%d size %llu extends beyond EOD, ",
disk->disk_name, p, (unsigned long long) size);
- if (bdops->set_capacity &&
- (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
- printk(KERN_CONT "enabling native capacity\n");
- capacity = bdops->set_capacity(disk, ~0ULL);
- disk->flags |= GENHD_FL_NATIVE_CAPACITY;
- if (capacity > get_capacity(disk)) {
- set_capacity(disk, capacity);
- check_disk_size_change(disk, bdev);
- bdev->bd_invalidated = 0;
- }
- goto try_scan;
+ if (disk_unlock_native_capacity(disk)) {
+ /* free state and restart */
+ goto rescan;
} else {
/*
* we can not ignore partitions of broken tables
@@ -620,7 +659,6 @@ try_scan:
* we limit them to the end of the disk to avoid
* creating invalid block devices
*/
- printk(KERN_CONT "limited to end of disk\n");
size = get_capacity(disk) - from;
}
}
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a8452..52f8bd39939 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
* description.
*/
struct parsed_partitions {
+ struct block_device *bdev;
char name[BDEVNAME_SIZE];
struct {
sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
} parts[DISK_MAX_PARTS];
int next;
int limit;
+ bool access_beyond_eod;
};
+static inline void *read_part_sector(struct parsed_partitions *state,
+ sector_t n, Sector *p)
+{
+ if (n >= get_capacity(state->bdev->bd_disk)) {
+ state->access_beyond_eod = true;
+ return NULL;
+ }
+ return read_dev_sector(state->bdev, n, p);
+}
+
static inline void
put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
{
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 91babdae758..9efb2cfe241 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
* the part[0] entry for this disk, and is the number of
* physical sectors available on the disk.
*/
-static u64
-last_lba(struct block_device *bdev)
+static u64 last_lba(struct block_device *bdev)
{
if (!bdev || !bdev->bd_inode)
return 0;
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
/**
* read_lba(): Read bytes from disk, starting at given LBA
- * @bdev
+ * @state
* @lba
* @buffer
* @size_t
*
- * Description: Reads @count bytes from @bdev into @buffer.
+ * Description: Reads @count bytes from @state->bdev into @buffer.
* Returns number of bytes read on success, 0 on error.
*/
-static size_t
-read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
+static size_t read_lba(struct parsed_partitions *state,
+ u64 lba, u8 *buffer, size_t count)
{
size_t totalreadcount = 0;
+ struct block_device *bdev = state->bdev;
sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
- if (!bdev || !buffer || lba > last_lba(bdev))
+ if (!buffer || lba > last_lba(bdev))
return 0;
while (count) {
int copied = 512;
Sector sect;
- unsigned char *data = read_dev_sector(bdev, n++, &sect);
+ unsigned char *data = read_part_sector(state, n++, &sect);
if (!data)
break;
if (copied > count)
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
/**
* alloc_read_gpt_entries(): reads partition entries from disk
- * @bdev
+ * @state
* @gpt - GPT header
*
* Description: Returns ptes on success, NULL on error.
* Allocates space for PTEs based on information found in @gpt.
* Notes: remember to free pte when you're done!
*/
-static gpt_entry *
-alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
+static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
+ gpt_header *gpt)
{
size_t count;
gpt_entry *pte;
- if (!bdev || !gpt)
+
+ if (!gpt)
return NULL;
count = le32_to_cpu(gpt->num_partition_entries) *
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
if (!pte)
return NULL;
- if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
+ if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
(u8 *) pte,
count) < count) {
kfree(pte);
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
/**
* alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @bdev
+ * @state
* @lba is the Logical Block Address of the partition table
*
* Description: returns GPT header on success, NULL on error. Allocates
- * and fills a GPT header starting at @ from @bdev.
+ * and fills a GPT header starting at @ from @state->bdev.
* Note: remember to free gpt when finished with it.
*/
-static gpt_header *
-alloc_read_gpt_header(struct block_device *bdev, u64 lba)
+static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
+ u64 lba)
{
gpt_header *gpt;
- unsigned ssz = bdev_logical_block_size(bdev);
-
- if (!bdev)
- return NULL;
+ unsigned ssz = bdev_logical_block_size(state->bdev);
gpt = kzalloc(ssz, GFP_KERNEL);
if (!gpt)
return NULL;
- if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
+ if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
kfree(gpt);
gpt=NULL;
return NULL;
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
/**
* is_gpt_valid() - tests one GPT header and PTEs for validity
- * @bdev
+ * @state
* @lba is the logical block address of the GPT header to test
* @gpt is a GPT header ptr, filled on return.
* @ptes is a PTEs ptr, filled on return.
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
* Description: returns 1 if valid, 0 on error.
* If valid, returns pointers to newly allocated GPT header and PTEs.
*/
-static int
-is_gpt_valid(struct block_device *bdev, u64 lba,
- gpt_header **gpt, gpt_entry **ptes)
+static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
+ gpt_header **gpt, gpt_entry **ptes)
{
u32 crc, origcrc;
u64 lastlba;
- if (!bdev || !gpt || !ptes)
+ if (!ptes)
return 0;
- if (!(*gpt = alloc_read_gpt_header(bdev, lba)))
+ if (!(*gpt = alloc_read_gpt_header(state, lba)))
return 0;
/* Check the GUID Partition Table signature */
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
/* Check the first_usable_lba and last_usable_lba are
* within the disk.
*/
- lastlba = last_lba(bdev);
+ lastlba = last_lba(state->bdev);
if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
goto fail;
}
- if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt)))
+ if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
goto fail;
/* Check the GUID Partition Entry Array CRC */
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
/**
* find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @bdev
+ * @state
* @gpt is a GPT header ptr, filled on return.
* @ptes is a PTEs ptr, filled on return.
* Description: Returns 1 if valid, 0 on error.
@@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
* This protects against devices which misreport their size, and forces
* the user to decide to use the Alternate GPT.
*/
-static int
-find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
+static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
+ gpt_entry **ptes)
{
int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
gpt_header *pgpt = NULL, *agpt = NULL;
gpt_entry *pptes = NULL, *aptes = NULL;
legacy_mbr *legacymbr;
u64 lastlba;
- if (!bdev || !gpt || !ptes)
+
+ if (!ptes)
return 0;
- lastlba = last_lba(bdev);
+ lastlba = last_lba(state->bdev);
if (!force_gpt) {
/* This will be added to the EFI Spec. per Intel after v1.02. */
legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
if (legacymbr) {
- read_lba(bdev, 0, (u8 *) legacymbr,
- sizeof (*legacymbr));
+ read_lba(state, 0, (u8 *) legacymbr,
+ sizeof (*legacymbr));
good_pmbr = is_pmbr_valid(legacymbr);
kfree(legacymbr);
}
@@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
goto fail;
}
- good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA,
+ good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
&pgpt, &pptes);
if (good_pgpt)
- good_agpt = is_gpt_valid(bdev,
+ good_agpt = is_gpt_valid(state,
le64_to_cpu(pgpt->alternate_lba),
&agpt, &aptes);
if (!good_agpt && force_gpt)
- good_agpt = is_gpt_valid(bdev, lastlba,
- &agpt, &aptes);
+ good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
/* The obviously unsuccessful case */
if (!good_pgpt && !good_agpt)
@@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
}
/**
- * efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+ * efi_partition(struct parsed_partitions *state)
* @state
- * @bdev
*
* Description: called from check.c, if the disk contains GPT
* partitions, sets up partition entries in the kernel.
@@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
* 1 if successful
*
*/
-int
-efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int efi_partition(struct parsed_partitions *state)
{
gpt_header *gpt = NULL;
gpt_entry *ptes = NULL;
u32 i;
- unsigned ssz = bdev_logical_block_size(bdev) / 512;
+ unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
- if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
+ if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
kfree(ptes);
return 0;
@@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
u64 size = le64_to_cpu(ptes[i].ending_lba) -
le64_to_cpu(ptes[i].starting_lba) + 1ULL;
- if (!is_pte_valid(&ptes[i], last_lba(bdev)))
+ if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
continue;
put_partition(state, i+1, start * ssz, size * ssz);
@@ -631,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
/* If this is a RAID volume, tell md */
if (!efi_guidcmp(ptes[i].partition_type_guid,
PARTITION_LINUX_RAID_GUID))
- state->parts[i+1].flags = 1;
+ state->parts[i + 1].flags = ADDPART_FLAG_RAID;
}
kfree(ptes);
kfree(gpt);
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf..b69ab729558 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
} __attribute__ ((packed)) legacy_mbr;
/* Functions */
-extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int efi_partition(struct parsed_partitions *state);
#endif
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab0846..3e73de5967f 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
/*
*/
-int
-ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ibm_partition(struct parsed_partitions *state)
{
+ struct block_device *bdev = state->bdev;
int blocksize, res;
loff_t i_size, offset, size, fmt_size;
dasd_information2_t *info;
@@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
/*
* Get volume label, extract name and type.
*/
- data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect);
+ data = read_part_sector(state, info->label_block*(blocksize/512),
+ &sect);
if (data == NULL)
goto out_readerr;
@@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
*/
blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
counter = 0;
- data = read_dev_sector(bdev, blk * (blocksize/512),
- &sect);
+ data = read_part_sector(state, blk * (blocksize/512),
+ &sect);
while (data != NULL) {
struct vtoc_format1_label f1;
@@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
|| f1.DS1FMTID == _ascebc['7']
|| f1.DS1FMTID == _ascebc['9']) {
blk++;
- data = read_dev_sector(bdev, blk *
- (blocksize/512),
- &sect);
+ data = read_part_sector(state,
+ blk * (blocksize/512), &sect);
continue;
}
@@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
size * (blocksize >> 9));
counter++;
blk++;
- data = read_dev_sector(bdev,
- blk * (blocksize/512),
- &sect);
+ data = read_part_sector(state,
+ blk * (blocksize/512), &sect);
}
if (!data)
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac45..08fb0804a81 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
-int ibm_partition(struct parsed_partitions *, struct block_device *);
+int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf12..1cc928bb762 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
#include "check.h"
#include "karma.h"
-int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
+int karma_partition(struct parsed_partitions *state)
{
int i;
int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
} __attribute__((packed)) *label;
struct d_partition *p;
- data = read_dev_sector(bdev, 0, &sect);
+ data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d..c764b2e9df2 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
#define KARMA_LABEL_MAGIC 0xAB56
-int karma_partition(struct parsed_partitions *state, struct block_device *bdev);
+int karma_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e96..3ceca05b668 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
/**
* ldm_validate_privheads - Compare the primary privhead with its backups
- * @bdev: Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
* @ph1: Memory struct to fill with ph contents
*
* Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
* Return: 'true' Success
* 'false' Error
*/
-static bool ldm_validate_privheads (struct block_device *bdev,
- struct privhead *ph1)
+static bool ldm_validate_privheads(struct parsed_partitions *state,
+ struct privhead *ph1)
{
static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
long num_sects;
int i;
- BUG_ON (!bdev || !ph1);
+ BUG_ON (!state || !ph1);
ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
/* Read and parse privheads */
for (i = 0; i < 3; i++) {
- data = read_dev_sector (bdev,
- ph[0]->config_start + off[i], &sect);
+ data = read_part_sector(state, ph[0]->config_start + off[i],
+ &sect);
if (!data) {
ldm_crit ("Disk read failed.");
goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
}
}
- num_sects = bdev->bd_inode->i_size >> 9;
+ num_sects = state->bdev->bd_inode->i_size >> 9;
if ((ph[0]->config_start > num_sects) ||
((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
/**
* ldm_validate_tocblocks - Validate the table of contents and its backups
- * @bdev: Device holding the LDM Database
- * @base: Offset, into @bdev, of the database
+ * @state: Partition check state including device holding the LDM Database
+ * @base: Offset, into @state->bdev, of the database
* @ldb: Cache of the database structures
*
* Find and compare the four tables of contents of the LDM Database stored on
- * @bdev and return the parsed information into @toc1.
+ * @state->bdev and return the parsed information into @toc1.
*
* The offsets and sizes of the configs are range-checked against a privhead.
*
* Return: 'true' @toc1 contains validated TOCBLOCK info
* 'false' @toc1 contents are undefined
*/
-static bool ldm_validate_tocblocks(struct block_device *bdev,
- unsigned long base, struct ldmdb *ldb)
+static bool ldm_validate_tocblocks(struct parsed_partitions *state,
+ unsigned long base, struct ldmdb *ldb)
{
static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
int i, nr_tbs;
bool result = false;
- BUG_ON(!bdev || !ldb);
+ BUG_ON(!state || !ldb);
ph = &ldb->ph;
tb[0] = &ldb->toc;
tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
* skip any that fail as long as we get at least one valid TOCBLOCK.
*/
for (nr_tbs = i = 0; i < 4; i++) {
- data = read_dev_sector(bdev, base + off[i], &sect);
+ data = read_part_sector(state, base + off[i], &sect);
if (!data) {
ldm_error("Disk read failed for TOCBLOCK %d.", i);
continue;
@@ -473,7 +473,7 @@ err:
/**
* ldm_validate_vmdb - Read the VMDB and validate it
- * @bdev: Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
* @base: Offset, into @bdev, of the database
* @ldb: Cache of the database structures
*
@@ -483,8 +483,8 @@ err:
* Return: 'true' @ldb contains validated VBDB info
* 'false' @ldb contents are undefined
*/
-static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
- struct ldmdb *ldb)
+static bool ldm_validate_vmdb(struct parsed_partitions *state,
+ unsigned long base, struct ldmdb *ldb)
{
Sector sect;
u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
struct vmdb *vm;
struct tocblock *toc;
- BUG_ON (!bdev || !ldb);
+ BUG_ON (!state || !ldb);
vm = &ldb->vm;
toc = &ldb->toc;
- data = read_dev_sector (bdev, base + OFF_VMDB, &sect);
+ data = read_part_sector(state, base + OFF_VMDB, &sect);
if (!data) {
ldm_crit ("Disk read failed.");
return false;
@@ -534,21 +534,21 @@ out:
/**
* ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
- * @bdev: Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
*
* This function provides a weak test to decide whether the device is a dynamic
* disk or not. It looks for an MS-DOS-style partition table containing at
* least one partition of type 0x42 (formerly SFS, now used by Windows for
* dynamic disks).
*
- * N.B. The only possible error can come from the read_dev_sector and that is
+ * N.B. The only possible error can come from the read_part_sector and that is
* only likely to happen if the underlying device is strange. If that IS
* the case we should return zero to let someone else try.
*
- * Return: 'true' @bdev is a dynamic disk
- * 'false' @bdev is not a dynamic disk, or an error occurred
+ * Return: 'true' @state->bdev is a dynamic disk
+ * 'false' @state->bdev is not a dynamic disk, or an error occurred
*/
-static bool ldm_validate_partition_table (struct block_device *bdev)
+static bool ldm_validate_partition_table(struct parsed_partitions *state)
{
Sector sect;
u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
int i;
bool result = false;
- BUG_ON (!bdev);
+ BUG_ON(!state);
- data = read_dev_sector (bdev, 0, &sect);
+ data = read_part_sector(state, 0, &sect);
if (!data) {
ldm_crit ("Disk read failed.");
return false;
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
/**
* ldm_get_vblks - Read the on-disk database of VBLKs into memory
- * @bdev: Device holding the LDM Database
- * @base: Offset, into @bdev, of the database
+ * @state: Partition check state including device holding the LDM Database
+ * @base: Offset, into @state->bdev, of the database
* @ldb: Cache of the database structures
*
* To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
* Return: 'true' All the VBLKs were read successfully
* 'false' An error occurred
*/
-static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
- struct ldmdb *ldb)
+static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
+ struct ldmdb *ldb)
{
int size, perbuf, skip, finish, s, v, recs;
u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
bool result = false;
LIST_HEAD (frags);
- BUG_ON (!bdev || !ldb);
+ BUG_ON(!state || !ldb);
size = ldb->vm.vblk_size;
perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
finish = (size * ldb->vm.last_vblk_seq) >> 9;
for (s = skip; s < finish; s++) { /* For each sector */
- data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect);
+ data = read_part_sector(state, base + OFF_VMDB + s, &sect);
if (!data) {
ldm_crit ("Disk read failed.");
goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
/**
* ldm_partition - Find out whether a device is a dynamic disk and handle it
- * @pp: List of the partitions parsed so far
- * @bdev: Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
*
* This determines whether the device @bdev is a dynamic disk and if so creates
* the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
* example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
* and so on: the actual data containing partitions.
*
- * Return: 1 Success, @bdev is a dynamic disk and we handled it
- * 0 Success, @bdev is not a dynamic disk
+ * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
+ * 0 Success, @state->bdev is not a dynamic disk
* -1 An error occurred before enough information had been read
- * Or @bdev is a dynamic disk, but it may be corrupted
+ * Or @state->bdev is a dynamic disk, but it may be corrupted
*/
-int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
+int ldm_partition(struct parsed_partitions *state)
{
struct ldmdb *ldb;
unsigned long base;
int result = -1;
- BUG_ON (!pp || !bdev);
+ BUG_ON(!state);
/* Look for signs of a Dynamic Disk */
- if (!ldm_validate_partition_table (bdev))
+ if (!ldm_validate_partition_table(state))
return 0;
ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
}
/* Parse and check privheads. */
- if (!ldm_validate_privheads (bdev, &ldb->ph))
+ if (!ldm_validate_privheads(state, &ldb->ph))
goto out; /* Already logged */
/* All further references are relative to base (database start). */
base = ldb->ph.config_start;
/* Parse and check tocs and vmdb. */
- if (!ldm_validate_tocblocks (bdev, base, ldb) ||
- !ldm_validate_vmdb (bdev, base, ldb))
+ if (!ldm_validate_tocblocks(state, base, ldb) ||
+ !ldm_validate_vmdb(state, base, ldb))
goto out; /* Already logged */
/* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
INIT_LIST_HEAD (&ldb->v_comp);
INIT_LIST_HEAD (&ldb->v_part);
- if (!ldm_get_vblks (bdev, base, ldb)) {
+ if (!ldm_get_vblks(state, base, ldb)) {
ldm_crit ("Failed to read the VBLKs from the database.");
goto cleanup;
}
/* Finally, create the data partition devices. */
- if (ldm_create_data_partitions (pp, ldb)) {
+ if (ldm_create_data_partitions(state, ldb)) {
ldm_debug ("Parsed LDM database successfully.");
result = 1;
}
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1..d1fb50b28d8 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */
struct list_head v_part;
};
-int ldm_partition (struct parsed_partitions *state, struct block_device *bdev);
+int ldm_partition(struct parsed_partitions *state);
#endif /* _FS_PT_LDM_H_ */
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563..74465ff7c26 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
stg[i] = 0;
}
-int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
+int mac_partition(struct parsed_partitions *state)
{
int slot = 1;
Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
struct mac_driver_desc *md;
/* Get 0th block and look at the first partition map entry. */
- md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect);
+ md = read_part_sector(state, 0, &sect);
if (!md)
return -1;
if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
}
secsize = be16_to_cpu(md->block_size);
put_dev_sector(sect);
- data = read_dev_sector(bdev, secsize/512, &sect);
+ data = read_part_sector(state, secsize/512, &sect);
if (!data)
return -1;
part = (struct mac_partition *) (data + secsize%512);
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
for (blk = 1; blk <= blocks_in_map; ++blk) {
int pos = blk * secsize;
put_dev_sector(sect);
- data = read_dev_sector(bdev, pos/512, &sect);
+ data = read_part_sector(state, pos/512, &sect);
if (!data)
return -1;
part = (struct mac_partition *) (data + pos%512);
@@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
be32_to_cpu(part->block_count) * (secsize/512));
if (!strnicmp(part->type, "Linux_RAID", 10))
- state->parts[slot].flags = 1;
+ state->parts[slot].flags = ADDPART_FLAG_RAID;
#ifdef CONFIG_PPC_PMAC
/*
* If this is the first bootable partition, tell the
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
}
#ifdef CONFIG_PPC_PMAC
if (found_root_goodness)
- note_bootable_part(bdev->bd_dev, found_root, found_root_goodness);
+ note_bootable_part(state->bdev->bd_dev, found_root,
+ found_root_goodness);
#endif
put_dev_sector(sect);
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386f..3c7d9843638 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
/* ... more stuff */
};
-int mac_partition(struct parsed_partitions *state, struct block_device *bdev);
+int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 90be97f1f5a..15bfb7b1e04 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p)
#define AIX_LABEL_MAGIC2 0xC2
#define AIX_LABEL_MAGIC3 0xD4
#define AIX_LABEL_MAGIC4 0xC1
-static int aix_magic_present(unsigned char *p, struct block_device *bdev)
+static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
{
struct partition *pt = (struct partition *) (p + 0x1be);
Sector sect;
@@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
is_extended_partition(pt))
return 0;
}
- d = read_dev_sector(bdev, 7, &sect);
+ d = read_part_sector(state, 7, &sect);
if (d) {
if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
ret = 1;
@@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
* only for the actual data partitions.
*/
-static void
-parse_extended(struct parsed_partitions *state, struct block_device *bdev,
- sector_t first_sector, sector_t first_size)
+static void parse_extended(struct parsed_partitions *state,
+ sector_t first_sector, sector_t first_size)
{
struct partition *p;
Sector sect;
unsigned char *data;
sector_t this_sector, this_size;
- sector_t sector_size = bdev_logical_block_size(bdev) / 512;
+ sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
int loopct = 0; /* number of links followed
without finding a data partition */
int i;
@@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
return;
if (state->next == state->limit)
return;
- data = read_dev_sector(bdev, this_sector, &sect);
+ data = read_part_sector(state, this_sector, &sect);
if (!data)
return;
@@ -198,9 +197,8 @@ done:
/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
indicates linux swap. Be careful before believing this is Solaris. */
-static void
-parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_solaris_x86(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_SOLARIS_X86_PARTITION
Sector sect;
@@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
int i;
short max_nparts;
- v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect);
+ v = read_part_sector(state, offset + 1, &sect);
if (!v)
return;
if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
@@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
* Create devices for BSD partitions listed in a disklabel, under a
* dos-like partition. See parse_extended() for more information.
*/
-static void
-parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin, char *flavour,
- int max_partitions)
+static void parse_bsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin, char *flavour,
+ int max_partitions)
{
Sector sect;
struct bsd_disklabel *l;
struct bsd_partition *p;
- l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect);
+ l = read_part_sector(state, offset + 1, &sect);
if (!l)
return;
if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
@@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
}
#endif
-static void
-parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_freebsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
- parse_bsd(state, bdev, offset, size, origin,
- "bsd", BSD_MAXPARTITIONS);
+ parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
#endif
}
-static void
-parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_netbsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
- parse_bsd(state, bdev, offset, size, origin,
- "netbsd", BSD_MAXPARTITIONS);
+ parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
#endif
}
-static void
-parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_openbsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
- parse_bsd(state, bdev, offset, size, origin,
- "openbsd", OPENBSD_MAXPARTITIONS);
+ parse_bsd(state, offset, size, origin, "openbsd",
+ OPENBSD_MAXPARTITIONS);
#endif
}
@@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
* Create devices for Unixware partitions listed in a disklabel, under a
* dos-like partition. See parse_extended() for more information.
*/
-static void
-parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_unixware(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_UNIXWARE_DISKLABEL
Sector sect;
struct unixware_disklabel *l;
struct unixware_slice *p;
- l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect);
+ l = read_part_sector(state, offset + 29, &sect);
if (!l)
return;
if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
* Anand Krishnamurthy <anandk@wiproge.med.ge.com>
* Rajeev V. Pillai <rajeevvp@yahoo.com>
*/
-static void
-parse_minix(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_minix(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_MINIX_SUBPARTITION
Sector sect;
@@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
struct partition *p;
int i;
- data = read_dev_sector(bdev, offset, &sect);
+ data = read_part_sector(state, offset, &sect);
if (!data)
return;
@@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
static struct {
unsigned char id;
- void (*parse)(struct parsed_partitions *, struct block_device *,
- sector_t, sector_t, int);
+ void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
} subtypes[] = {
{FREEBSD_PARTITION, parse_freebsd},
{NETBSD_PARTITION, parse_netbsd},
@@ -417,16 +406,16 @@ static struct {
{0, NULL},
};
-int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
+int msdos_partition(struct parsed_partitions *state)
{
- sector_t sector_size = bdev_logical_block_size(bdev) / 512;
+ sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
Sector sect;
unsigned char *data;
struct partition *p;
struct fat_boot_sector *fb;
int slot;
- data = read_dev_sector(bdev, 0, &sect);
+ data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
if (!msdos_magic_present(data + 510)) {
@@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
return 0;
}
- if (aix_magic_present(data, bdev)) {
+ if (aix_magic_present(state, data)) {
put_dev_sector(sect);
printk( " [AIX]");
return 0;
@@ -503,13 +492,13 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
put_partition(state, slot, start, n);
printk(" <");
- parse_extended(state, bdev, start, size);
+ parse_extended(state, start, size);
printk(" >");
continue;
}
put_partition(state, slot, start, size);
if (SYS_IND(p) == LINUX_RAID_PARTITION)
- state->parts[slot].flags = 1;
+ state->parts[slot].flags = ADDPART_FLAG_RAID;
if (SYS_IND(p) == DM6_PARTITION)
printk("[DM]");
if (SYS_IND(p) == EZD_PARTITION)
@@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
if (!subtypes[n].parse)
continue;
- subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
- nr_sects(p)*sector_size, slot);
+ subtypes[n].parse(state, start_sect(p) * sector_size,
+ nr_sects(p) * sector_size, slot);
}
put_dev_sector(sect);
return 1;
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902..38c781c490b 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
#define MSDOS_LABEL_MAGIC 0xAA55
-int msdos_partition(struct parsed_partitions *state, struct block_device *bdev);
+int msdos_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df..fc22b85d436 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
#include "check.h"
#include "osf.h"
-int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
+int osf_partition(struct parsed_partitions *state)
{
int i;
int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
} * label;
struct d_partition * partition;
- data = read_dev_sector(bdev, 0, &sect);
+ data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314..20ed2315ec1 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
#define DISKLABELMAGIC (0x82564557UL)
-int osf_partition(struct parsed_partitions *state, struct block_device *bdev);
+int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83..43b1df9aa16 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
__be32 _unused1; /* Padding */
};
-int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sgi_partition(struct parsed_partitions *state)
{
int i, csum;
__be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
struct sgi_partition *p;
char b[BDEVNAME_SIZE];
- label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect);
+ label = read_part_sector(state, 0, &sect);
if (!label)
return -1;
p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
}
if(csum) {
printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
- bdevname(bdev, b));
+ bdevname(state->bdev, b));
put_dev_sector(sect);
return 0;
}
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c0992..b9553ebdd5a 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
* fs/partitions/sgi.h
*/
-extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int sgi_partition(struct parsed_partitions *state);
#define SGI_LABEL_MAGIC 0x0be5a941
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01..a32660e25f7 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
#include "check.h"
#include "sun.h"
-int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sun_partition(struct parsed_partitions *state)
{
int i;
__be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
int use_vtoc;
int nparts;
- label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect);
+ label = read_part_sector(state, 0, &sect);
if (!label)
return -1;
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
csum ^= *ush--;
if (csum) {
printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
- bdevname(bdev, b));
+ bdevname(state->bdev, b));
put_dev_sector(sect);
return 0;
}
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d..2424baa8319 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
#define SUN_LABEL_MAGIC 0xDABE
#define SUN_VTOC_SANITY 0x600DDEEE
-int sun_partition(struct parsed_partitions *state, struct block_device *bdev);
+int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b7864..9030c864428 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
};
-int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sysv68_partition(struct parsed_partitions *state)
{
int i, slices;
int slot = 1;
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
struct dkblk0 *b;
struct slice *slice;
- data = read_dev_sector(bdev, 0, &sect);
+ data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
i = be32_to_cpu(b->dk_ios.ios_slcblk);
put_dev_sector(sect);
- data = read_dev_sector(bdev, i, &sect);
+ data = read_part_sector(state, i, &sect);
if (!data)
return -1;
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431..bf2f5ffa97a 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
-extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce..db9eef26036 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
#include "check.h"
#include "ultrix.h"
-int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ultrix_partition(struct parsed_partitions *state)
{
int i;
Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
#define PT_MAGIC 0x032957 /* Partition magic number */
#define PT_VALID 1 /* Indicates if struct is valid */
- data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect);
+ data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
if (!data)
return -1;
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d37..a3cc00b2bde 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
* fs/partitions/ultrix.h
*/
-int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev);
+int ultrix_partition(struct parsed_partitions *state);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff315..d79872eba09 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
@@ -18,11 +19,18 @@
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
+#include <linux/fcntl.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
/*
+ * The max size that a non-root user is allowed to grow the pipe. Can
+ * be set by root in /proc/sys/fs/pipe-max-pages
+ */
+unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
+
+/*
* We use a start+len construction, which provides full use of the
* allocated memory.
* -- Florian Coosmann (FGC)
@@ -390,7 +398,7 @@ redo:
if (!buf->len) {
buf->ops = NULL;
ops->release(pipe, buf);
- curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
+ curbuf = (curbuf + 1) & (pipe->buffers - 1);
pipe->curbuf = curbuf;
pipe->nrbufs = --bufs;
do_wakeup = 1;
@@ -472,7 +480,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
if (pipe->nrbufs && chars != 0) {
int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
- (PIPE_BUFFERS-1);
+ (pipe->buffers - 1);
struct pipe_buffer *buf = pipe->bufs + lastbuf;
const struct pipe_buf_operations *ops = buf->ops;
int offset = buf->offset + buf->len;
@@ -518,8 +526,8 @@ redo1:
break;
}
bufs = pipe->nrbufs;
- if (bufs < PIPE_BUFFERS) {
- int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
+ if (bufs < pipe->buffers) {
+ int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
struct pipe_buffer *buf = pipe->bufs + newbuf;
struct page *page = pipe->tmp_page;
char *src;
@@ -580,7 +588,7 @@ redo2:
if (!total_len)
break;
}
- if (bufs < PIPE_BUFFERS)
+ if (bufs < pipe->buffers)
continue;
if (filp->f_flags & O_NONBLOCK) {
if (!ret)
@@ -640,7 +648,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
nrbufs = pipe->nrbufs;
while (--nrbufs >= 0) {
count += pipe->bufs[buf].len;
- buf = (buf+1) & (PIPE_BUFFERS-1);
+ buf = (buf+1) & (pipe->buffers - 1);
}
mutex_unlock(&inode->i_mutex);
@@ -671,7 +679,7 @@ pipe_poll(struct file *filp, poll_table *wait)
}
if (filp->f_mode & FMODE_WRITE) {
- mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+ mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
/*
* Most Unices do not set POLLERR for FIFOs but on Linux they
* behave exactly like pipes for poll().
@@ -877,25 +885,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
if (pipe) {
- init_waitqueue_head(&pipe->wait);
- pipe->r_counter = pipe->w_counter = 1;
- pipe->inode = inode;
+ pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+ if (pipe->bufs) {
+ init_waitqueue_head(&pipe->wait);
+ pipe->r_counter = pipe->w_counter = 1;
+ pipe->inode = inode;
+ pipe->buffers = PIPE_DEF_BUFFERS;
+ return pipe;
+ }
+ kfree(pipe);
}
- return pipe;
+ return NULL;
}
void __free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
- for (i = 0; i < PIPE_BUFFERS; i++) {
+ for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
buf->ops->release(pipe, buf);
}
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
+ kfree(pipe->bufs);
kfree(pipe);
}
@@ -1094,6 +1109,89 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
}
/*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or return -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+{
+ struct pipe_buffer *bufs;
+
+ /*
+ * Must be a power-of-2 currently
+ */
+ if (!is_power_of_2(arg))
+ return -EINVAL;
+
+ /*
+ * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
+ * expect a lot of shrink+grow operations, just free and allocate
+ * again like we would do for growing. If the pipe currently
+ * contains more buffers than arg, then return busy.
+ */
+ if (arg < pipe->nrbufs)
+ return -EBUSY;
+
+ bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+ if (unlikely(!bufs))
+ return -ENOMEM;
+
+ /*
+ * The pipe array wraps around, so just start the new one at zero
+ * and adjust the indexes.
+ */
+ if (pipe->nrbufs) {
+ const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
+ const unsigned int head = pipe->nrbufs - tail;
+
+ if (head)
+ memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
+ if (tail)
+ memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
+ }
+
+ pipe->curbuf = 0;
+ kfree(pipe->bufs);
+ pipe->bufs = bufs;
+ pipe->buffers = arg;
+ return arg;
+}
+
+long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct pipe_inode_info *pipe;
+ long ret;
+
+ pipe = file->f_path.dentry->d_inode->i_pipe;
+ if (!pipe)
+ return -EBADF;
+
+ mutex_lock(&pipe->inode->i_mutex);
+
+ switch (cmd) {
+ case F_SETPIPE_SZ:
+ if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
+ return -EINVAL;
+ /*
+ * The pipe needs to be at least 2 pages large to
+ * guarantee POSIX behaviour.
+ */
+ if (arg < 2)
+ return -EINVAL;
+ ret = pipe_set_size(pipe, arg);
+ break;
+ case F_GETPIPE_SZ:
+ ret = pipe->buffers;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&pipe->inode->i_mutex);
+ return ret;
+}
+
+/*
* pipefs should _never_ be mounted by userland - too much of security hassle,
* no real gain from having the whole whorehouse mounted. So we don't need
* any operations on the root directory. However, we need a non-trivial
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index cfc78826da9..ce3dfd066f5 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
return security_quotactl(cmd, type, id, sb);
}
+static void quota_sync_one(struct super_block *sb, void *arg)
+{
+ if (sb->s_qcop && sb->s_qcop->quota_sync)
+ sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
+}
+
static int quota_sync_all(int type)
{
- struct super_block *sb;
int ret;
if (type >= MAXQUOTAS)
return -EINVAL;
ret = security_quotactl(Q_SYNC, type, 0, NULL);
- if (ret)
- return ret;
-
- spin_lock(&sb_lock);
-restart:
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (!sb->s_qcop || !sb->s_qcop->quota_sync)
- continue;
-
- sb->s_count++;
- spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- if (sb->s_root)
- sb->s_qcop->quota_sync(sb, type, 1);
- up_read(&sb->s_umount);
- spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
- }
- spin_unlock(&sb_lock);
-
- return 0;
+ if (!ret)
+ iterate_supers(quota_sync_one, &type);
+ return ret;
}
static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index f47cd212dee..a5ebae70dc6 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -52,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = {
BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
};
-struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+struct inode *ramfs_get_inode(struct super_block *sb,
+ const struct inode *dir, int mode, dev_t dev)
{
struct inode * inode = new_inode(sb);
if (inode) {
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
+ inode_init_owner(inode, dir, mode);
inode->i_mapping->a_ops = &ramfs_aops;
inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
@@ -95,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
static int
ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
{
- struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev);
+ struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
int error = -ENOSPC;
if (inode) {
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- }
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
@@ -130,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
struct inode *inode;
int error = -ENOSPC;
- inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+ inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
if (inode) {
int l = strlen(symname)+1;
error = page_symlink(inode, symname, l);
if (!error) {
- if (dir->i_mode & S_ISGID)
- inode->i_gid = dir->i_gid;
d_instantiate(dentry, inode);
dget(dentry);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -241,7 +233,7 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &ramfs_ops;
sb->s_time_gran = 1;
- inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
+ inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
if (!inode) {
err = -ENOMEM;
goto fail;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5..9977df9f3a5 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp,
barrier_done = reiserfs_commit_for_inode(inode);
reiserfs_write_unlock(inode->i_sb);
if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
if (barrier_done < 0)
return barrier_done;
return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index d0c43cb99ff..ee78d4a0086 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -561,23 +561,13 @@ static int drop_new_inode(struct inode *inode)
*/
static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
{
-
- /* the quota init calls have to know who to charge the quota to, so
- ** we have to set uid and gid here
- */
- inode->i_uid = current_fsuid();
- inode->i_mode = mode;
/* Make inode invalid - just in case we are going to drop it before
* the initialization happens */
INODE_PKEY(inode)->k_objectid = 0;
-
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- } else {
- inode->i_gid = current_fsgid();
- }
+ /* the quota init calls have to know who to charge the quota to, so
+ ** we have to set uid and gid here
+ */
+ inode_init_owner(inode, dir, mode);
dquot_initialize(inode);
return 0;
}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e7cc00e636d..8c4cf273c67 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -723,11 +723,11 @@ out:
(handler) = *(handlers)++)
/* This is the implementation for the xattr plugin infrastructure */
-static inline struct xattr_handler *
-find_xattr_handler_prefix(struct xattr_handler **handlers,
+static inline const struct xattr_handler *
+find_xattr_handler_prefix(const struct xattr_handler **handlers,
const char *name)
{
- struct xattr_handler *xah;
+ const struct xattr_handler *xah;
if (!handlers)
return NULL;
@@ -748,7 +748,7 @@ ssize_t
reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
size_t size)
{
- struct xattr_handler *handler;
+ const struct xattr_handler *handler;
handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
@@ -767,7 +767,7 @@ int
reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
size_t size, int flags)
{
- struct xattr_handler *handler;
+ const struct xattr_handler *handler;
handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
@@ -784,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
*/
int reiserfs_removexattr(struct dentry *dentry, const char *name)
{
- struct xattr_handler *handler;
+ const struct xattr_handler *handler;
handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -807,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
size_t size;
if (name[0] != '.' ||
(namelen != 1 && (name[1] != '.' || namelen != 2))) {
- struct xattr_handler *handler;
+ const struct xattr_handler *handler;
handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
name);
if (!handler) /* Unsupported xattr name */
@@ -920,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
#endif
/* Actual operations that are exported to VFS-land */
-struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler *reiserfs_xattr_handlers[] = {
#ifdef CONFIG_REISERFS_FS_XATTR
&reiserfs_xattr_user_handler,
&reiserfs_xattr_trusted_handler,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 9cdb759645a..536d697a8a2 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -500,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list,
return size;
}
-struct xattr_handler reiserfs_posix_acl_access_handler = {
+const struct xattr_handler reiserfs_posix_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.flags = ACL_TYPE_ACCESS,
.get = posix_acl_get,
@@ -520,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list,
return size;
}
-struct xattr_handler reiserfs_posix_acl_default_handler = {
+const struct xattr_handler reiserfs_posix_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.get = posix_acl_get,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 7271a477c04..237c6928d3c 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -111,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec)
sec->value = NULL;
}
-struct xattr_handler reiserfs_xattr_security_handler = {
+const struct xattr_handler reiserfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = security_get,
.set = security_set,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5b08aaca3da..9883736ce3e 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
return len;
}
-struct xattr_handler reiserfs_xattr_trusted_handler = {
+const struct xattr_handler reiserfs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = trusted_get,
.set = trusted_set,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 75d59c49b91..45ae1a00013 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
return len;
}
-struct xattr_handler reiserfs_xattr_user_handler = {
+const struct xattr_handler reiserfs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.get = user_get,
.set = user_set,
diff --git a/fs/splice.c b/fs/splice.c
index 9313b6124a2..ac22b00d86c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
break;
}
- if (pipe->nrbufs < PIPE_BUFFERS) {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
+ if (pipe->nrbufs < pipe->buffers) {
+ int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
struct pipe_buffer *buf = pipe->bufs + newbuf;
buf->page = spd->pages[page_nr];
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
if (!--spd->nr_pages)
break;
- if (pipe->nrbufs < PIPE_BUFFERS)
+ if (pipe->nrbufs < pipe->buffers)
continue;
break;
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
page_cache_release(spd->pages[i]);
}
+/*
+ * Check if we need to grow the arrays holding pages and partial page
+ * descriptions.
+ */
+int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
+{
+ if (pipe->buffers <= PIPE_DEF_BUFFERS)
+ return 0;
+
+ spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
+ spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
+
+ if (spd->pages && spd->partial)
+ return 0;
+
+ kfree(spd->pages);
+ kfree(spd->partial);
+ return -ENOMEM;
+}
+
+void splice_shrink_spd(struct pipe_inode_info *pipe,
+ struct splice_pipe_desc *spd)
+{
+ if (pipe->buffers <= PIPE_DEF_BUFFERS)
+ return;
+
+ kfree(spd->pages);
+ kfree(spd->partial);
+}
+
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
{
struct address_space *mapping = in->f_mapping;
unsigned int loff, nr_pages, req_pages;
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
struct page *page;
pgoff_t index, end_index;
loff_t isize;
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
.spd_release = spd_release_page,
};
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
index = *ppos >> PAGE_CACHE_SHIFT;
loff = *ppos & ~PAGE_CACHE_MASK;
req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
+ nr_pages = min(req_pages, pipe->buffers);
/*
* Lookup the (hopefully) full range of pages we need.
*/
- spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+ spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
index += spd.nr_pages;
/*
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
unlock_page(page);
}
- pages[spd.nr_pages++] = page;
+ spd.pages[spd.nr_pages++] = page;
index++;
}
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
* this_len is the max we'll use from this page
*/
this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
- page = pages[page_nr];
+ page = spd.pages[page_nr];
if (PageReadahead(page))
page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
error = -ENOMEM;
break;
}
- page_cache_release(pages[page_nr]);
- pages[page_nr] = page;
+ page_cache_release(spd.pages[page_nr]);
+ spd.pages[page_nr] = page;
}
/*
* page was already under io and is now done, great
@@ -451,8 +484,8 @@ fill_it:
len = this_len;
}
- partial[page_nr].offset = loff;
- partial[page_nr].len = this_len;
+ spd.partial[page_nr].offset = loff;
+ spd.partial[page_nr].len = this_len;
len -= this_len;
loff = 0;
spd.nr_pages++;
@@ -464,12 +497,13 @@ fill_it:
* we got, 'nr_pages' is how many pages are in the map.
*/
while (page_nr < nr_pages)
- page_cache_release(pages[page_nr++]);
+ page_cache_release(spd.pages[page_nr++]);
in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
if (spd.nr_pages)
- return splice_to_pipe(pipe, &spd);
+ error = splice_to_pipe(pipe, &spd);
+ splice_shrink_spd(pipe, &spd);
return error;
}
@@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
unsigned int nr_pages;
unsigned int nr_freed;
size_t offset;
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
- struct iovec vec[PIPE_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
+ struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
pgoff_t index;
ssize_t res;
size_t this_len;
@@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
.spd_release = spd_release_page,
};
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ res = -ENOMEM;
+ vec = __vec;
+ if (pipe->buffers > PIPE_DEF_BUFFERS) {
+ vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
+ if (!vec)
+ goto shrink_ret;
+ }
+
index = *ppos >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+ for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
struct page *page;
page = alloc_page(GFP_USER);
@@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
vec[i].iov_base = (void __user *) page_address(page);
vec[i].iov_len = this_len;
- pages[i] = page;
+ spd.pages[i] = page;
spd.nr_pages++;
len -= this_len;
offset = 0;
@@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
nr_freed = 0;
for (i = 0; i < spd.nr_pages; i++) {
this_len = min_t(size_t, vec[i].iov_len, res);
- partial[i].offset = 0;
- partial[i].len = this_len;
+ spd.partial[i].offset = 0;
+ spd.partial[i].len = this_len;
if (!this_len) {
- __free_page(pages[i]);
- pages[i] = NULL;
+ __free_page(spd.pages[i]);
+ spd.pages[i] = NULL;
nr_freed++;
}
res -= this_len;
@@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
if (res > 0)
*ppos += res;
+shrink_ret:
+ if (vec != __vec)
+ kfree(vec);
+ splice_shrink_spd(pipe, &spd);
return res;
err:
for (i = 0; i < spd.nr_pages; i++)
- __free_page(pages[i]);
+ __free_page(spd.pages[i]);
- return error;
+ res = error;
+ goto shrink_ret;
}
EXPORT_SYMBOL(default_file_splice_read);
@@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
if (!buf->len) {
buf->ops = NULL;
ops->release(pipe, buf);
- pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
+ pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
pipe->nrbufs--;
if (pipe->inode)
sd->need_wakeup = true;
@@ -1211,7 +1261,7 @@ out_release:
* If we did an incomplete transfer we must release
* the pipe buffers in question:
*/
- for (i = 0; i < PIPE_BUFFERS; i++) {
+ for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops) {
@@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
*/
static int get_iovec_page_array(const struct iovec __user *iov,
unsigned int nr_vecs, struct page **pages,
- struct partial_page *partial, int aligned)
+ struct partial_page *partial, int aligned,
+ unsigned int pipe_buffers)
{
int buffers = 0, error = 0;
@@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
break;
npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (npages > PIPE_BUFFERS - buffers)
- npages = PIPE_BUFFERS - buffers;
+ if (npages > pipe_buffers - buffers)
+ npages = pipe_buffers - buffers;
error = get_user_pages_fast((unsigned long)base, npages,
0, &pages[buffers]);
@@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
* or if we mapped the max number of pages that we have
* room for.
*/
- if (error < npages || buffers == PIPE_BUFFERS)
+ if (error < npages || buffers == pipe_buffers)
break;
nr_vecs--;
@@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
unsigned long nr_segs, unsigned int flags)
{
struct pipe_inode_info *pipe;
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
@@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
.ops = &user_page_pipe_buf_ops,
.spd_release = spd_release_page,
};
+ long ret;
pipe = pipe_info(file->f_path.dentry->d_inode);
if (!pipe)
return -EBADF;
- spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
- flags & SPLICE_F_GIFT);
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+ spd.partial, flags & SPLICE_F_GIFT,
+ pipe->buffers);
if (spd.nr_pages <= 0)
- return spd.nr_pages;
+ ret = spd.nr_pages;
+ else
+ ret = splice_to_pipe(pipe, &spd);
- return splice_to_pipe(pipe, &spd);
+ splice_shrink_spd(pipe, &spd);
+ return ret;
}
/*
@@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
* Check ->nrbufs without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (pipe->nrbufs < PIPE_BUFFERS)
+ if (pipe->nrbufs < pipe->buffers)
return 0;
ret = 0;
pipe_lock(pipe);
- while (pipe->nrbufs >= PIPE_BUFFERS) {
+ while (pipe->nrbufs >= pipe->buffers) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
@@ -1810,7 +1869,7 @@ retry:
* Cannot make any progress, because either the input
* pipe is empty or the output pipe is full.
*/
- if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+ if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
/* Already processed some buffers, break */
if (ret)
break;
@@ -1831,7 +1890,7 @@ retry:
}
ibuf = ipipe->bufs + ipipe->curbuf;
- nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+ nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
obuf = opipe->bufs + nbuf;
if (len >= ibuf->len) {
@@ -1841,7 +1900,7 @@ retry:
*obuf = *ibuf;
ibuf->ops = NULL;
opipe->nrbufs++;
- ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+ ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
ipipe->nrbufs--;
input_wakeup = true;
} else {
@@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
* If we have iterated all input buffers or ran out of
* output room, break.
*/
- if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+ if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
break;
- ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
- nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+ ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+ nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
/*
* Get a reference to this pipe buffer,
diff --git a/fs/statfs.c b/fs/statfs.c
new file mode 100644
index 00000000000..4ef021f3b61
--- /dev/null
+++ b/fs/statfs.c
@@ -0,0 +1,196 @@
+#include <linux/syscalls.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
+
+int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ int retval = -ENODEV;
+
+ if (dentry) {
+ retval = -ENOSYS;
+ if (dentry->d_sb->s_op->statfs) {
+ memset(buf, 0, sizeof(*buf));
+ retval = security_sb_statfs(dentry);
+ if (retval)
+ return retval;
+ retval = dentry->d_sb->s_op->statfs(dentry, buf);
+ if (retval == 0 && buf->f_frsize == 0)
+ buf->f_frsize = buf->f_bsize;
+ }
+ }
+ return retval;
+}
+
+EXPORT_SYMBOL(vfs_statfs);
+
+static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
+{
+ struct kstatfs st;
+ int retval;
+
+ retval = vfs_statfs(dentry, &st);
+ if (retval)
+ return retval;
+
+ if (sizeof(*buf) == sizeof(st))
+ memcpy(buf, &st, sizeof(st));
+ else {
+ if (sizeof buf->f_blocks == 4) {
+ if ((st.f_blocks | st.f_bfree | st.f_bavail |
+ st.f_bsize | st.f_frsize) &
+ 0xffffffff00000000ULL)
+ return -EOVERFLOW;
+ /*
+ * f_files and f_ffree may be -1; it's okay to stuff
+ * that into 32 bits
+ */
+ if (st.f_files != -1 &&
+ (st.f_files & 0xffffffff00000000ULL))
+ return -EOVERFLOW;
+ if (st.f_ffree != -1 &&
+ (st.f_ffree & 0xffffffff00000000ULL))
+ return -EOVERFLOW;
+ }
+
+ buf->f_type = st.f_type;
+ buf->f_bsize = st.f_bsize;
+ buf->f_blocks = st.f_blocks;
+ buf->f_bfree = st.f_bfree;
+ buf->f_bavail = st.f_bavail;
+ buf->f_files = st.f_files;
+ buf->f_ffree = st.f_ffree;
+ buf->f_fsid = st.f_fsid;
+ buf->f_namelen = st.f_namelen;
+ buf->f_frsize = st.f_frsize;
+ memset(buf->f_spare, 0, sizeof(buf->f_spare));
+ }
+ return 0;
+}
+
+static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
+{
+ struct kstatfs st;
+ int retval;
+
+ retval = vfs_statfs(dentry, &st);
+ if (retval)
+ return retval;
+
+ if (sizeof(*buf) == sizeof(st))
+ memcpy(buf, &st, sizeof(st));
+ else {
+ buf->f_type = st.f_type;
+ buf->f_bsize = st.f_bsize;
+ buf->f_blocks = st.f_blocks;
+ buf->f_bfree = st.f_bfree;
+ buf->f_bavail = st.f_bavail;
+ buf->f_files = st.f_files;
+ buf->f_ffree = st.f_ffree;
+ buf->f_fsid = st.f_fsid;
+ buf->f_namelen = st.f_namelen;
+ buf->f_frsize = st.f_frsize;
+ memset(buf->f_spare, 0, sizeof(buf->f_spare));
+ }
+ return 0;
+}
+
+SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
+{
+ struct path path;
+ int error;
+
+ error = user_path(pathname, &path);
+ if (!error) {
+ struct statfs tmp;
+ error = vfs_statfs_native(path.dentry, &tmp);
+ if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ error = -EFAULT;
+ path_put(&path);
+ }
+ return error;
+}
+
+SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
+{
+ struct path path;
+ long error;
+
+ if (sz != sizeof(*buf))
+ return -EINVAL;
+ error = user_path(pathname, &path);
+ if (!error) {
+ struct statfs64 tmp;
+ error = vfs_statfs64(path.dentry, &tmp);
+ if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ error = -EFAULT;
+ path_put(&path);
+ }
+ return error;
+}
+
+SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
+{
+ struct file *file;
+ struct statfs tmp;
+ int error;
+
+ error = -EBADF;
+ file = fget(fd);
+ if (!file)
+ goto out;
+ error = vfs_statfs_native(file->f_path.dentry, &tmp);
+ if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ error = -EFAULT;
+ fput(file);
+out:
+ return error;
+}
+
+SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
+{
+ struct file *file;
+ struct statfs64 tmp;
+ int error;
+
+ if (sz != sizeof(*buf))
+ return -EINVAL;
+
+ error = -EBADF;
+ file = fget(fd);
+ if (!file)
+ goto out;
+ error = vfs_statfs64(file->f_path.dentry, &tmp);
+ if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ error = -EFAULT;
+ fput(file);
+out:
+ return error;
+}
+
+SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
+{
+ struct super_block *s;
+ struct ustat tmp;
+ struct kstatfs sbuf;
+ int err;
+
+ s = user_get_super(new_decode_dev(dev));
+ if (!s)
+ return -EINVAL;
+
+ err = vfs_statfs(s->s_root, &sbuf);
+ drop_super(s);
+ if (err)
+ return err;
+
+ memset(&tmp,0,sizeof(struct ustat));
+ tmp.f_tfree = sbuf.f_bfree;
+ tmp.f_tinode = sbuf.f_ffree;
+
+ return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
+}
diff --git a/fs/super.c b/fs/super.c
index 1527e6a0ee3..69688b15f1f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,23 +22,15 @@
#include <linux/module.h>
#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/smp_lock.h>
#include <linux/acct.h>
#include <linux/blkdev.h>
#include <linux/quotaops.h>
-#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/vfs.h>
#include <linux/writeback.h> /* for the emergency remount stuff */
#include <linux/idr.h>
-#include <linux/kobject.h>
#include <linux/mutex.h>
-#include <linux/file.h>
#include <linux/backing-dev.h>
-#include <asm/uaccess.h>
#include "internal.h"
@@ -93,9 +85,10 @@ static struct super_block *alloc_super(struct file_system_type *type)
* subclass.
*/
down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
- s->s_count = S_BIAS;
+ s->s_count = 1;
atomic_set(&s->s_active, 1);
mutex_init(&s->s_vfs_rename_mutex);
+ lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
mutex_init(&s->s_dquot.dqio_mutex);
mutex_init(&s->s_dquot.dqonoff_mutex);
init_rwsem(&s->s_dquot.dqptr_sem);
@@ -127,39 +120,14 @@ static inline void destroy_super(struct super_block *s)
/* Superblock refcounting */
/*
- * Drop a superblock's refcount. Returns non-zero if the superblock was
- * destroyed. The caller must hold sb_lock.
+ * Drop a superblock's refcount. The caller must hold sb_lock.
*/
-static int __put_super(struct super_block *sb)
+void __put_super(struct super_block *sb)
{
- int ret = 0;
-
if (!--sb->s_count) {
+ list_del_init(&sb->s_list);
destroy_super(sb);
- ret = 1;
}
- return ret;
-}
-
-/*
- * Drop a superblock's refcount.
- * Returns non-zero if the superblock is about to be destroyed and
- * at least is already removed from super_blocks list, so if we are
- * making a loop through super blocks then we need to restart.
- * The caller must hold sb_lock.
- */
-int __put_super_and_need_restart(struct super_block *sb)
-{
- /* check for race with generic_shutdown_super() */
- if (list_empty(&sb->s_list)) {
- /* super block is removed, need to restart... */
- __put_super(sb);
- return 1;
- }
- /* can't be the last, since s_list is still in use */
- sb->s_count--;
- BUG_ON(sb->s_count == 0);
- return 0;
}
/**
@@ -178,57 +146,48 @@ void put_super(struct super_block *sb)
/**
- * deactivate_super - drop an active reference to superblock
+ * deactivate_locked_super - drop an active reference to superblock
* @s: superblock to deactivate
*
- * Drops an active reference to superblock, acquiring a temprory one if
- * there is no active references left. In that case we lock superblock,
+ * Drops an active reference to superblock, converting it into a temprory
+ * one if there is no other active references left. In that case we
* tell fs driver to shut it down and drop the temporary reference we
* had just acquired.
+ *
+ * Caller holds exclusive lock on superblock; that lock is released.
*/
-void deactivate_super(struct super_block *s)
+void deactivate_locked_super(struct super_block *s)
{
struct file_system_type *fs = s->s_type;
- if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
- s->s_count -= S_BIAS-1;
- spin_unlock(&sb_lock);
+ if (atomic_dec_and_test(&s->s_active)) {
vfs_dq_off(s, 0);
- down_write(&s->s_umount);
fs->kill_sb(s);
put_filesystem(fs);
put_super(s);
+ } else {
+ up_write(&s->s_umount);
}
}
-EXPORT_SYMBOL(deactivate_super);
+EXPORT_SYMBOL(deactivate_locked_super);
/**
- * deactivate_locked_super - drop an active reference to superblock
+ * deactivate_super - drop an active reference to superblock
* @s: superblock to deactivate
*
- * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that
- * it does not unlock it until it's all over. As the result, it's safe to
- * use to dispose of new superblock on ->get_sb() failure exits - nobody
- * will see the sucker until it's all over. Equivalent using up_write +
- * deactivate_super is safe for that purpose only if superblock is either
- * safe to use or has NULL ->s_root when we unlock.
+ * Variant of deactivate_locked_super(), except that superblock is *not*
+ * locked by caller. If we are going to drop the final active reference,
+ * lock will be acquired prior to that.
*/
-void deactivate_locked_super(struct super_block *s)
+void deactivate_super(struct super_block *s)
{
- struct file_system_type *fs = s->s_type;
- if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
- s->s_count -= S_BIAS-1;
- spin_unlock(&sb_lock);
- vfs_dq_off(s, 0);
- fs->kill_sb(s);
- put_filesystem(fs);
- put_super(s);
- } else {
- up_write(&s->s_umount);
+ if (!atomic_add_unless(&s->s_active, -1, 1)) {
+ down_write(&s->s_umount);
+ deactivate_locked_super(s);
}
}
-EXPORT_SYMBOL(deactivate_locked_super);
+EXPORT_SYMBOL(deactivate_super);
/**
* grab_super - acquire an active reference
@@ -243,22 +202,17 @@ EXPORT_SYMBOL(deactivate_locked_super);
*/
static int grab_super(struct super_block *s) __releases(sb_lock)
{
+ if (atomic_inc_not_zero(&s->s_active)) {
+ spin_unlock(&sb_lock);
+ return 1;
+ }
+ /* it's going away */
s->s_count++;
spin_unlock(&sb_lock);
+ /* wait for it to die */
down_write(&s->s_umount);
- if (s->s_root) {
- spin_lock(&sb_lock);
- if (s->s_count > S_BIAS) {
- atomic_inc(&s->s_active);
- s->s_count--;
- spin_unlock(&sb_lock);
- return 1;
- }
- spin_unlock(&sb_lock);
- }
up_write(&s->s_umount);
put_super(s);
- yield();
return 0;
}
@@ -321,8 +275,7 @@ void generic_shutdown_super(struct super_block *sb)
}
spin_lock(&sb_lock);
/* should be initialized for __put_super_and_need_restart() */
- list_del_init(&sb->s_list);
- list_del(&sb->s_instances);
+ list_del_init(&sb->s_instances);
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
}
@@ -357,6 +310,7 @@ retry:
up_write(&s->s_umount);
destroy_super(s);
}
+ down_write(&old->s_umount);
return old;
}
}
@@ -408,11 +362,12 @@ EXPORT_SYMBOL(drop_super);
*/
void sync_supers(void)
{
- struct super_block *sb;
+ struct super_block *sb, *n;
spin_lock(&sb_lock);
-restart:
- list_for_each_entry(sb, &super_blocks, s_list) {
+ list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+ if (list_empty(&sb->s_instances))
+ continue;
if (sb->s_op->write_super && sb->s_dirt) {
sb->s_count++;
spin_unlock(&sb_lock);
@@ -423,14 +378,43 @@ restart:
up_read(&sb->s_umount);
spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
+ __put_super(sb);
}
}
spin_unlock(&sb_lock);
}
/**
+ * iterate_supers - call function for all active superblocks
+ * @f: function to call
+ * @arg: argument to pass to it
+ *
+ * Scans the superblock list and calls given function, passing it
+ * locked superblock and given argument.
+ */
+void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
+{
+ struct super_block *sb, *n;
+
+ spin_lock(&sb_lock);
+ list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+ if (list_empty(&sb->s_instances))
+ continue;
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+
+ down_read(&sb->s_umount);
+ if (sb->s_root)
+ f(sb, arg);
+ up_read(&sb->s_umount);
+
+ spin_lock(&sb_lock);
+ __put_super(sb);
+ }
+ spin_unlock(&sb_lock);
+}
+
+/**
* get_super - get the superblock of a device
* @bdev: device to get the superblock for
*
@@ -438,7 +422,7 @@ restart:
* mounted on the device given. %NULL is returned if no match is found.
*/
-struct super_block * get_super(struct block_device *bdev)
+struct super_block *get_super(struct block_device *bdev)
{
struct super_block *sb;
@@ -448,17 +432,20 @@ struct super_block * get_super(struct block_device *bdev)
spin_lock(&sb_lock);
rescan:
list_for_each_entry(sb, &super_blocks, s_list) {
+ if (list_empty(&sb->s_instances))
+ continue;
if (sb->s_bdev == bdev) {
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
+ /* still alive? */
if (sb->s_root)
return sb;
up_read(&sb->s_umount);
- /* restart only when sb is no longer on the list */
+ /* nope, got unmounted */
spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto rescan;
+ __put_super(sb);
+ goto rescan;
}
}
spin_unlock(&sb_lock);
@@ -473,7 +460,7 @@ EXPORT_SYMBOL(get_super);
*
* Scans the superblock list and finds the superblock of the file system
* mounted on the device given. Returns the superblock with an active
- * reference and s_umount held exclusively or %NULL if none was found.
+ * reference or %NULL if none was found.
*/
struct super_block *get_active_super(struct block_device *bdev)
{
@@ -482,81 +469,49 @@ struct super_block *get_active_super(struct block_device *bdev)
if (!bdev)
return NULL;
+restart:
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (sb->s_bdev != bdev)
+ if (list_empty(&sb->s_instances))
continue;
-
- sb->s_count++;
- spin_unlock(&sb_lock);
- down_write(&sb->s_umount);
- if (sb->s_root) {
- spin_lock(&sb_lock);
- if (sb->s_count > S_BIAS) {
- atomic_inc(&sb->s_active);
- sb->s_count--;
- spin_unlock(&sb_lock);
+ if (sb->s_bdev == bdev) {
+ if (grab_super(sb)) /* drops sb_lock */
return sb;
- }
- spin_unlock(&sb_lock);
+ else
+ goto restart;
}
- up_write(&sb->s_umount);
- put_super(sb);
- yield();
- spin_lock(&sb_lock);
}
spin_unlock(&sb_lock);
return NULL;
}
-struct super_block * user_get_super(dev_t dev)
+struct super_block *user_get_super(dev_t dev)
{
struct super_block *sb;
spin_lock(&sb_lock);
rescan:
list_for_each_entry(sb, &super_blocks, s_list) {
+ if (list_empty(&sb->s_instances))
+ continue;
if (sb->s_dev == dev) {
sb->s_count++;
spin_unlock(&sb_lock);
down_read(&sb->s_umount);
+ /* still alive? */
if (sb->s_root)
return sb;
up_read(&sb->s_umount);
- /* restart only when sb is no longer on the list */
+ /* nope, got unmounted */
spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto rescan;
+ __put_super(sb);
+ goto rescan;
}
}
spin_unlock(&sb_lock);
return NULL;
}
-SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
-{
- struct super_block *s;
- struct ustat tmp;
- struct kstatfs sbuf;
- int err = -EINVAL;
-
- s = user_get_super(new_decode_dev(dev));
- if (s == NULL)
- goto out;
- err = vfs_statfs(s->s_root, &sbuf);
- drop_super(s);
- if (err)
- goto out;
-
- memset(&tmp,0,sizeof(struct ustat));
- tmp.f_tfree = sbuf.f_bfree;
- tmp.f_tinode = sbuf.f_ffree;
-
- err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
-out:
- return err;
-}
-
/**
* do_remount_sb - asks filesystem to change mount options.
* @sb: superblock in question
@@ -622,24 +577,24 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
static void do_emergency_remount(struct work_struct *work)
{
- struct super_block *sb;
+ struct super_block *sb, *n;
spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
+ list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
+ if (list_empty(&sb->s_instances))
+ continue;
sb->s_count++;
spin_unlock(&sb_lock);
down_write(&sb->s_umount);
if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
/*
- * ->remount_fs needs lock_kernel().
- *
* What lock protects sb->s_flags??
*/
do_remount_sb(sb, MS_RDONLY, NULL, 1);
}
up_write(&sb->s_umount);
- put_super(sb);
spin_lock(&sb_lock);
+ __put_super(sb);
}
spin_unlock(&sb_lock);
kfree(work);
@@ -990,6 +945,96 @@ out:
EXPORT_SYMBOL_GPL(vfs_kern_mount);
+/**
+ * freeze_super -- lock the filesystem and force it into a consistent state
+ * @super: the super to lock
+ *
+ * Syncs the super to make sure the filesystem is consistent and calls the fs's
+ * freeze_fs. Subsequent calls to this without first thawing the fs will return
+ * -EBUSY.
+ */
+int freeze_super(struct super_block *sb)
+{
+ int ret;
+
+ atomic_inc(&sb->s_active);
+ down_write(&sb->s_umount);
+ if (sb->s_frozen) {
+ deactivate_locked_super(sb);
+ return -EBUSY;
+ }
+
+ if (sb->s_flags & MS_RDONLY) {
+ sb->s_frozen = SB_FREEZE_TRANS;
+ smp_wmb();
+ up_write(&sb->s_umount);
+ return 0;
+ }
+
+ sb->s_frozen = SB_FREEZE_WRITE;
+ smp_wmb();
+
+ sync_filesystem(sb);
+
+ sb->s_frozen = SB_FREEZE_TRANS;
+ smp_wmb();
+
+ sync_blockdev(sb->s_bdev);
+ if (sb->s_op->freeze_fs) {
+ ret = sb->s_op->freeze_fs(sb);
+ if (ret) {
+ printk(KERN_ERR
+ "VFS:Filesystem freeze failed\n");
+ sb->s_frozen = SB_UNFROZEN;
+ deactivate_locked_super(sb);
+ return ret;
+ }
+ }
+ up_write(&sb->s_umount);
+ return 0;
+}
+EXPORT_SYMBOL(freeze_super);
+
+/**
+ * thaw_super -- unlock filesystem
+ * @sb: the super to thaw
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_super().
+ */
+int thaw_super(struct super_block *sb)
+{
+ int error;
+
+ down_write(&sb->s_umount);
+ if (sb->s_frozen == SB_UNFROZEN) {
+ up_write(&sb->s_umount);
+ return -EINVAL;
+ }
+
+ if (sb->s_flags & MS_RDONLY)
+ goto out;
+
+ if (sb->s_op->unfreeze_fs) {
+ error = sb->s_op->unfreeze_fs(sb);
+ if (error) {
+ printk(KERN_ERR
+ "VFS:Filesystem thaw failed\n");
+ sb->s_frozen = SB_FREEZE_TRANS;
+ up_write(&sb->s_umount);
+ return error;
+ }
+ }
+
+out:
+ sb->s_frozen = SB_UNFROZEN;
+ smp_wmb();
+ wake_up(&sb->s_wait_unfrozen);
+ deactivate_locked_super(sb);
+
+ return 0;
+}
+EXPORT_SYMBOL(thaw_super);
+
static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
{
int err;
diff --git a/fs/sync.c b/fs/sync.c
index 92b228176f7..e8cbd415e50 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
if (wait)
sync_inodes_sb(sb);
else
- writeback_inodes_sb(sb);
+ writeback_inodes_sb_locked(sb);
if (sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, wait);
@@ -77,50 +77,18 @@ int sync_filesystem(struct super_block *sb)
}
EXPORT_SYMBOL_GPL(sync_filesystem);
+static void sync_one_sb(struct super_block *sb, void *arg)
+{
+ if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
+ __sync_filesystem(sb, *(int *)arg);
+}
/*
* Sync all the data for all the filesystems (called by sys_sync() and
* emergency sync)
- *
- * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied. s_need_sync
- * is used only here. We set it against all filesystems and then clear it as
- * we sync them. So redirtied filesystems are skipped.
- *
- * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync
- * flags again, which will cause process A to resync everything. Fix that with
- * a local mutex.
*/
static void sync_filesystems(int wait)
{
- struct super_block *sb;
- static DEFINE_MUTEX(mutex);
-
- mutex_lock(&mutex); /* Could be down_interruptible */
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list)
- sb->s_need_sync = 1;
-
-restart:
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (!sb->s_need_sync)
- continue;
- sb->s_need_sync = 0;
- sb->s_count++;
- spin_unlock(&sb_lock);
-
- down_read(&sb->s_umount);
- if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
- __sync_filesystem(sb, wait);
- up_read(&sb->s_umount);
-
- /* restart only when sb is no longer on the list */
- spin_lock(&sb_lock);
- if (__put_super_and_need_restart(sb))
- goto restart;
- }
- spin_unlock(&sb_lock);
- mutex_unlock(&mutex);
+ iterate_supers(sync_one_sb, &wait);
}
/*
@@ -190,7 +158,6 @@ EXPORT_SYMBOL(file_fsync);
/**
* vfs_fsync_range - helper to sync a range of data & metadata to disk
* @file: file to sync
- * @dentry: dentry of @file
* @start: offset in bytes of the beginning of data range to sync
* @end: offset in bytes of the end of data range (inclusive)
* @datasync: perform only datasync
@@ -198,32 +165,13 @@ EXPORT_SYMBOL(file_fsync);
* Write back data in range @start..@end and metadata for @file to disk. If
* @datasync is set only metadata needed to access modified file data is
* written.
- *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set. This can only happen when the filesystem
- * implements the export_operations API.
*/
-int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
- loff_t end, int datasync)
+int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
- const struct file_operations *fop;
- struct address_space *mapping;
+ struct address_space *mapping = file->f_mapping;
int err, ret;
- /*
- * Get mapping and operations from the file in case we have
- * as file, or get the default values for them in case we
- * don't have a struct file available. Damn nfsd..
- */
- if (file) {
- mapping = file->f_mapping;
- fop = file->f_op;
- } else {
- mapping = dentry->d_inode->i_mapping;
- fop = dentry->d_inode->i_fop;
- }
-
- if (!fop || !fop->fsync) {
+ if (!file->f_op || !file->f_op->fsync) {
ret = -EINVAL;
goto out;
}
@@ -235,7 +183,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
* livelocks in fsync_buffers_list().
*/
mutex_lock(&mapping->host->i_mutex);
- err = fop->fsync(file, dentry, datasync);
+ err = file->f_op->fsync(file, file->f_path.dentry, datasync);
if (!ret)
ret = err;
mutex_unlock(&mapping->host->i_mutex);
@@ -248,19 +196,14 @@ EXPORT_SYMBOL(vfs_fsync_range);
/**
* vfs_fsync - perform a fsync or fdatasync on a file
* @file: file to sync
- * @dentry: dentry of @file
* @datasync: only perform a fdatasync operation
*
* Write back data and metadata for @file to disk. If @datasync is
* set only metadata needed to access modified file data is written.
- *
- * In case this function is called from nfsd @file may be %NULL and
- * only @dentry is set. This can only happen when the filesystem
- * implements the export_operations API.
*/
-int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int vfs_fsync(struct file *file, int datasync)
{
- return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
+ return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
}
EXPORT_SYMBOL(vfs_fsync);
@@ -271,7 +214,7 @@ static int do_fsync(unsigned int fd, int datasync)
file = fget(fd);
if (file) {
- ret = vfs_fsync(file, file->f_path.dentry, datasync);
+ ret = vfs_fsync(file, datasync);
fput(file);
}
return ret;
@@ -299,8 +242,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count)
{
if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
return 0;
- return vfs_fsync_range(file, file->f_path.dentry, pos,
- pos + count - 1,
+ return vfs_fsync_range(file, pos, pos + count - 1,
(file->f_flags & __O_SYNC) ? 0 : 1);
}
EXPORT_SYMBOL(generic_write_sync);
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 241e9765cfa..bbd69bdb0fa 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -159,15 +159,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
*sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
dirty_sb(sb);
-
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current_fsgid();
-
- inode->i_uid = current_fsuid();
+ inode_init_owner(inode, dir, mode);
inode->i_ino = fs16_to_cpu(sbi, ino);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
inode->i_blocks = 0;
@@ -176,7 +168,6 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
insert_inode_hash(inode);
mark_inode_dirty(inode);
- inode->i_mode = mode; /* for sysv_write_inode() */
sysv_write_inode(inode, 0); /* ensure inode not allocated again */
mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
/* That's it. */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 401e503d44a..87ebcce7221 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
*/
inode->i_flags |= (S_NOCMTIME);
- inode->i_uid = current_fsuid();
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- inode->i_gid = current_fsgid();
- inode->i_mode = mode;
+ inode_init_owner(inode, dir, mode);
inode->i_mtime = inode->i_atime = inode->i_ctime =
ubifs_current_time(inode);
inode->i_mapping->nrpages = 0;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index fb68c9cd0c3..2b5586c7f02 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -124,15 +124,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
udf_updated_lvid(sb);
}
mutex_unlock(&sbi->s_alloc_mutex);
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else {
- inode->i_gid = current_fsgid();
- }
+
+ inode_init_owner(inode, dir, mode);
iinfo->i_location.logicalBlockNum = block;
iinfo->i_location.partitionReferenceNum =
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 75816025f95..585f733615d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -579,7 +579,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
inode->i_data.a_ops = &udf_aops;
inode->i_op = &udf_file_inode_operations;
inode->i_fop = &udf_file_operations;
- inode->i_mode = mode;
mark_inode_dirty(inode);
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -627,7 +626,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
goto out;
iinfo = UDF_I(inode);
- inode->i_uid = current_fsuid();
init_special_inode(inode, mode, rdev);
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
if (!fi) {
@@ -674,7 +672,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
goto out;
err = -EIO;
- inode = udf_new_inode(dir, S_IFDIR, &err);
+ inode = udf_new_inode(dir, S_IFDIR | mode, &err);
if (!inode)
goto out;
@@ -697,9 +695,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
brelse(fibh.sbh);
- inode->i_mode = S_IFDIR | mode;
- if (dir->i_mode & S_ISGID)
- inode->i_mode |= S_ISGID;
mark_inode_dirty(inode);
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -912,7 +907,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
dquot_initialize(dir);
lock_kernel();
- inode = udf_new_inode(dir, S_IFLNK, &err);
+ inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
if (!inode)
goto out;
@@ -923,7 +918,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
}
iinfo = UDF_I(inode);
- inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_data.a_ops = &udf_symlink_aops;
inode->i_op = &udf_symlink_inode_operations;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 230ecf60802..3a959d55084 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -303,15 +303,7 @@ cg_found:
sb->s_dirt = 1;
inode->i_ino = cg * uspi->s_ipg + bit;
- inode->i_mode = mode;
- inode->i_uid = current_fsuid();
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
- } else
- inode->i_gid = current_fsgid();
-
+ inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_generation = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
diff --git a/fs/xattr.c b/fs/xattr.c
index 46f87e828b4..01bb8135e14 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix)
/*
* Find the xattr_handler with the matching prefix.
*/
-static struct xattr_handler *
-xattr_resolve_name(struct xattr_handler **handlers, const char **name)
+static const struct xattr_handler *
+xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
{
- struct xattr_handler *handler;
+ const struct xattr_handler *handler;
if (!*name)
return NULL;
@@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name)
ssize_t
generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
{
- struct xattr_handler *handler;
+ const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
if (!handler)
@@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
ssize_t
generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
- struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
+ const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
unsigned int size = 0;
if (!buffer) {
@@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
int
generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
{
- struct xattr_handler *handler;
+ const struct xattr_handler *handler;
if (size == 0)
value = ""; /* empty EA, do not remove */
@@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
int
generic_removexattr(struct dentry *dentry, const char *name)
{
- struct xattr_handler *handler;
+ const struct xattr_handler *handler;
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
if (!handler)
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index a7bc925c4d6..9f769b5b38f 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
return error;
}
-struct xattr_handler xfs_xattr_acl_access_handler = {
+const struct xattr_handler xfs_xattr_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
.flags = ACL_TYPE_ACCESS,
.get = xfs_xattr_acl_get,
.set = xfs_xattr_acl_set,
};
-struct xattr_handler xfs_xattr_acl_default_handler = {
+const struct xattr_handler xfs_xattr_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
.get = xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index e9002513e08..f24dbe5efde 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -725,7 +725,8 @@ void
xfs_blkdev_issue_flush(
xfs_buftarg_t *buftarg)
{
- blkdev_issue_flush(buftarg->bt_bdev, NULL);
+ blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
}
STATIC void
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b..519618e9279 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern const struct export_operations xfs_export_operations;
-extern struct xattr_handler *xfs_xattr_handlers[];
+extern const struct xattr_handler *xfs_xattr_handlers[];
extern const struct quotactl_ops xfs_quotactl_operations;
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index fa01b9daba6..87d3e03878c 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
(void *)value, size, xflags);
}
-static struct xattr_handler xfs_xattr_user_handler = {
+static const struct xattr_handler xfs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.flags = 0, /* no flags implies user namespace */
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
-static struct xattr_handler xfs_xattr_trusted_handler = {
+static const struct xattr_handler xfs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.flags = ATTR_ROOT,
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
-static struct xattr_handler xfs_xattr_security_handler = {
+static const struct xattr_handler xfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.flags = ATTR_SECURE,
.get = xfs_xattr_get,
.set = xfs_xattr_set,
};
-struct xattr_handler *xfs_xattr_handlers[] = {
+const struct xattr_handler *xfs_xattr_handlers[] = {
&xfs_xattr_user_handler,
&xfs_xattr_trusted_handler,
&xfs_xattr_security_handler,
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index d13eeba2c8f..0135e2a669d 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
extern int posix_acl_access_exists(struct inode *inode);
extern int posix_acl_default_exists(struct inode *inode);
-extern struct xattr_handler xfs_xattr_acl_access_handler;
-extern struct xattr_handler xfs_xattr_acl_default_handler;
+extern const struct xattr_handler xfs_xattr_acl_access_handler;
+extern const struct xattr_handler xfs_xattr_acl_default_handler;
#else
# define xfs_check_acl NULL
# define xfs_get_acl(inode, type) NULL