diff options
Diffstat (limited to 'fs')
172 files changed, 2550 insertions, 2321 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 0ba2db44e0b..4331b3b5ee1 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -256,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) return ERR_PTR(-ENOMEM); } - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/Makefile b/fs/Makefile index 97f340f14ba..e6ec1d309b1 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ - stack.o fs_struct.o + stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o diff --git a/fs/afs/dir.c b/fs/afs/dir.c index adc1cb771b5..b42d5cc1d6d 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -189,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index, struct key *key) { struct page *page; - struct file file = { - .private_data = key, - }; - _enter("{%lu},%lu", dir->i_ino, index); - page = read_mapping_page(dir->i_mapping, index, &file); + page = read_cache_page(dir->i_mapping, index, afs_page_filler, key); if (!IS_ERR(page)) { kmap(page); if (!PageChecked(page)) diff --git a/fs/afs/file.c b/fs/afs/file.c index 0df9bc2b724..14d89fa58fe 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page, #endif /* - * AFS read page from file, directory or symlink + * read page from file, directory or symlink, given a key to use */ -static int afs_readpage(struct file *file, struct page *page) +int afs_page_filler(void *data, struct page *page) { - struct afs_vnode *vnode; - struct inode *inode; - struct key *key; + struct inode *inode = page->mapping->host; + struct afs_vnode *vnode = AFS_FS_I(inode); + struct key *key = data; size_t len; off_t offset; int ret; - inode = page->mapping->host; - - if (file) { - key = file->private_data; - ASSERT(key != NULL); - } else { - key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - goto error_nokey; - } - } - _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); - vnode = AFS_FS_I(inode); - BUG_ON(!PageLocked(page)); ret = -ESTALE; @@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page) unlock_page(page); } - if (!file) - key_put(key); _leave(" = 0"); return 0; error: SetPageError(page); unlock_page(page); - if (!file) - key_put(key); -error_nokey: _leave(" = %d", ret); return ret; } /* + * read page from file, directory or symlink, given a file to nominate the key + * to be used + */ +static int afs_readpage(struct file *file, struct page *page) +{ + struct key *key; + int ret; + + if (file) { + key = file->private_data; + ASSERT(key != NULL); + ret = afs_page_filler(key, page); + } else { + struct inode *inode = page->mapping->host; + key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + } else { + ret = afs_page_filler(key, page); + key_put(key); + } + } + return ret; +} + +/* * read a set of pages */ static int afs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { + struct key *key = file->private_data; struct afs_vnode *vnode; int ret = 0; - _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); + _enter("{%d},{%lu},,%d", + key_serial(key), mapping->host->i_ino, nr_pages); + + ASSERT(key != NULL); vnode = AFS_FS_I(mapping->host); if (vnode->flags & AFS_VNODE_DELETED) { @@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping, } /* load the missing pages from the network */ - ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); + ret = read_cache_pages(mapping, pages, afs_page_filler, key); _leave(" = %d [netting]", ret); return ret; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a10f2582844..807f284cc75 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -494,6 +494,7 @@ extern const struct file_operations afs_file_operations; extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); +extern int afs_page_filler(void *, struct page *); /* * flock.c diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index b3feddc4f7d..a9e23039ea3 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60; */ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) { - struct file file = { - .private_data = key, - }; struct page *page; size_t size; char *buf; @@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); /* read the contents of the symlink into the pagecache */ - page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file); + page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, + afs_page_filler, key); if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index e4b75d6eda8..9bd4b3876c9 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void) * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; - inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 1e41aadb106..8f73841fc97 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, } set_bit(ino, info->si_imap); info->si_freei--; - inode->i_uid = current_fsuid(); - inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid(); + inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; - inode->i_mode = mode; inode->i_ino = ino; BFS_I(inode)->i_dsk_ino = ino; BFS_I(inode)->i_sblock = 0; diff --git a/fs/block_dev.c b/fs/block_dev.c index 6dcee88c2e5..26e5f502662 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -245,37 +245,14 @@ struct super_block *freeze_bdev(struct block_device *bdev) sb = get_active_super(bdev); if (!sb) goto out; - if (sb->s_flags & MS_RDONLY) { - sb->s_frozen = SB_FREEZE_TRANS; - up_write(&sb->s_umount); + error = freeze_super(sb); + if (error) { + deactivate_super(sb); + bdev->bd_fsfreeze_count--; mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } - - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - sync_filesystem(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - deactivate_locked_super(sb); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } + return ERR_PTR(error); } - up_write(&sb->s_umount); - + deactivate_super(sb); out: sync_blockdev(bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); @@ -296,40 +273,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) mutex_lock(&bdev->bd_fsfreeze_mutex); if (!bdev->bd_fsfreeze_count) - goto out_unlock; + goto out; error = 0; if (--bdev->bd_fsfreeze_count > 0) - goto out_unlock; + goto out; if (!sb) - goto out_unlock; - - BUG_ON(sb->s_bdev != bdev); - down_write(&sb->s_umount); - if (sb->s_flags & MS_RDONLY) - goto out_unfrozen; - - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - -out_unfrozen: - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + goto out; - if (sb) - deactivate_locked_super(sb); -out_unlock: + error = thaw_super(sb); + if (error) { + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } +out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return 0; } @@ -417,7 +376,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) */ mutex_unlock(&bd_inode->i_mutex); - error = blkdev_issue_flush(bdev, NULL); + error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); if (error == -EOPNOTSUPP) error = 0; @@ -668,41 +627,209 @@ void bd_forget(struct inode *inode) iput(bdev->bd_inode); } -int bd_claim(struct block_device *bdev, void *holder) +/** + * bd_may_claim - test whether a block device can be claimed + * @bdev: block device of interest + * @whole: whole block device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Test whther @bdev can be claimed by @holder. + * + * CONTEXT: + * spin_lock(&bdev_lock). + * + * RETURNS: + * %true if @bdev can be claimed, %false otherwise. + */ +static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, + void *holder) { - int res; - spin_lock(&bdev_lock); - - /* first decide result */ if (bdev->bd_holder == holder) - res = 0; /* already a holder */ + return true; /* already a holder */ else if (bdev->bd_holder != NULL) - res = -EBUSY; /* held by someone else */ + return false; /* held by someone else */ else if (bdev->bd_contains == bdev) - res = 0; /* is a whole device which isn't held */ + return true; /* is a whole device which isn't held */ - else if (bdev->bd_contains->bd_holder == bd_claim) - res = 0; /* is a partition of a device that is being partitioned */ - else if (bdev->bd_contains->bd_holder != NULL) - res = -EBUSY; /* is a partition of a held device */ + else if (whole->bd_holder == bd_claim) + return true; /* is a partition of a device that is being partitioned */ + else if (whole->bd_holder != NULL) + return false; /* is a partition of a held device */ else - res = 0; /* is a partition of an un-held device */ + return true; /* is a partition of an un-held device */ +} + +/** + * bd_prepare_to_claim - prepare to claim a block device + * @bdev: block device of interest + * @whole: the whole device containing @bdev, may equal @bdev + * @holder: holder trying to claim @bdev + * + * Prepare to claim @bdev. This function fails if @bdev is already + * claimed by another holder and waits if another claiming is in + * progress. This function doesn't actually claim. On successful + * return, the caller has ownership of bd_claiming and bd_holder[s]. + * + * CONTEXT: + * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab + * it multiple times. + * + * RETURNS: + * 0 if @bdev can be claimed, -EBUSY otherwise. + */ +static int bd_prepare_to_claim(struct block_device *bdev, + struct block_device *whole, void *holder) +{ +retry: + /* if someone else claimed, fail */ + if (!bd_may_claim(bdev, whole, holder)) + return -EBUSY; + + /* if someone else is claiming, wait for it to finish */ + if (whole->bd_claiming && whole->bd_claiming != holder) { + wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); + DEFINE_WAIT(wait); + + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&bdev_lock); + schedule(); + finish_wait(wq, &wait); + spin_lock(&bdev_lock); + goto retry; + } + + /* yay, all mine */ + return 0; +} - /* now impose change */ - if (res==0) { +/** + * bd_start_claiming - start claiming a block device + * @bdev: block device of interest + * @holder: holder trying to claim @bdev + * + * @bdev is about to be opened exclusively. Check @bdev can be opened + * exclusively and mark that an exclusive open is in progress. Each + * successful call to this function must be matched with a call to + * either bd_claim() or bd_abort_claiming(). If this function + * succeeds, the matching bd_claim() is guaranteed to succeed. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to the block device containing @bdev on success, ERR_PTR() + * value on failure. + */ +static struct block_device *bd_start_claiming(struct block_device *bdev, + void *holder) +{ + struct gendisk *disk; + struct block_device *whole; + int partno, err; + + might_sleep(); + + /* + * @bdev might not have been initialized properly yet, look up + * and grab the outer block device the hard way. + */ + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + return ERR_PTR(-ENXIO); + + whole = bdget_disk(disk, 0); + put_disk(disk); + if (!whole) + return ERR_PTR(-ENOMEM); + + /* prepare to claim, if successful, mark claiming in progress */ + spin_lock(&bdev_lock); + + err = bd_prepare_to_claim(bdev, whole, holder); + if (err == 0) { + whole->bd_claiming = holder; + spin_unlock(&bdev_lock); + return whole; + } else { + spin_unlock(&bdev_lock); + bdput(whole); + return ERR_PTR(err); + } +} + +/* releases bdev_lock */ +static void __bd_abort_claiming(struct block_device *whole, void *holder) +{ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); + + spin_unlock(&bdev_lock); + bdput(whole); +} + +/** + * bd_abort_claiming - abort claiming a block device + * @whole: whole block device returned by bd_start_claiming() + * @holder: holder trying to claim @bdev + * + * Abort a claiming block started by bd_start_claiming(). Note that + * @whole is not the block device to be claimed but the whole device + * returned by bd_start_claiming(). + * + * CONTEXT: + * Grabs and releases bdev_lock. + */ +static void bd_abort_claiming(struct block_device *whole, void *holder) +{ + spin_lock(&bdev_lock); + __bd_abort_claiming(whole, holder); /* releases bdev_lock */ +} + +/** + * bd_claim - claim a block device + * @bdev: block device to claim + * @holder: holder trying to claim @bdev + * + * Try to claim @bdev which must have been opened successfully. This + * function may be called with or without preceding + * blk_start_claiming(). In the former case, this function is always + * successful and terminates the claiming block. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 if successful, -EBUSY if @bdev is already claimed. + */ +int bd_claim(struct block_device *bdev, void *holder) +{ + struct block_device *whole = bdev->bd_contains; + int res; + + might_sleep(); + + spin_lock(&bdev_lock); + + res = bd_prepare_to_claim(bdev, whole, holder); + if (res == 0) { /* note that for a whole device bd_holders * will be incremented twice, and bd_holder will * be set to bd_claim before being set to holder */ - bdev->bd_contains->bd_holders ++; - bdev->bd_contains->bd_holder = bd_claim; + whole->bd_holders++; + whole->bd_holder = bd_claim; bdev->bd_holders++; bdev->bd_holder = holder; } - spin_unlock(&bdev_lock); + + if (whole->bd_claiming) + __bd_abort_claiming(whole, holder); /* releases bdev_lock */ + else + spin_unlock(&bdev_lock); + return res; } - EXPORT_SYMBOL(bd_claim); void bd_release(struct block_device *bdev) @@ -1316,6 +1443,7 @@ EXPORT_SYMBOL(blkdev_get); static int blkdev_open(struct inode * inode, struct file * filp) { + struct block_device *whole = NULL; struct block_device *bdev; int res; @@ -1338,22 +1466,25 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (bdev == NULL) return -ENOMEM; + if (filp->f_mode & FMODE_EXCL) { + whole = bd_start_claiming(bdev, filp); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + filp->f_mapping = bdev->bd_inode->i_mapping; res = blkdev_get(bdev, filp->f_mode); - if (res) - return res; - if (filp->f_mode & FMODE_EXCL) { - res = bd_claim(bdev, filp); - if (res) - goto out_blkdev_put; + if (whole) { + if (res == 0) + BUG_ON(bd_claim(bdev, filp) != 0); + else + bd_abort_claiming(whole, filp); } - return 0; - - out_blkdev_put: - blkdev_put(bdev, filp->f_mode); return res; } @@ -1564,27 +1695,34 @@ EXPORT_SYMBOL(lookup_bdev); */ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) { - struct block_device *bdev; - int error = 0; + struct block_device *bdev, *whole; + int error; bdev = lookup_bdev(path); if (IS_ERR(bdev)) return bdev; + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return whole; + } + error = blkdev_get(bdev, mode); if (error) - return ERR_PTR(error); + goto out_abort_claiming; + error = -EACCES; if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) - goto blkdev_put; - error = bd_claim(bdev, holder); - if (error) - goto blkdev_put; + goto out_blkdev_put; + BUG_ON(bd_claim(bdev, holder) != 0); return bdev; - -blkdev_put: + +out_blkdev_put: blkdev_put(bdev, mode); +out_abort_claiming: + bd_abort_claiming(whole, holder); return ERR_PTR(error); } diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6ef7b26724e..8d432cd9d58 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -282,14 +282,14 @@ int btrfs_acl_chmod(struct inode *inode) return ret; } -struct xattr_handler btrfs_xattr_acl_default_handler = { +const struct xattr_handler btrfs_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = btrfs_xattr_acl_get, .set = btrfs_xattr_acl_set, }; -struct xattr_handler btrfs_xattr_acl_access_handler = { +const struct xattr_handler btrfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = btrfs_xattr_acl_get, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b34d32fdaae..c6a4f459ad7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1589,7 +1589,7 @@ static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); } static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2bfdc641d4e..d601629b85d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4121,16 +4121,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret != 0) goto fail; - inode->i_uid = current_fsuid(); - - if (dir && (dir->i_mode & S_ISGID)) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = objectid; inode_set_bytes(inode, 0); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 193b58f7d3f..59acd3eb288 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -282,7 +282,7 @@ err: * List of handlers for synthetic system.* attributes. All real ondisk * attributes are handled directly. */ -struct xattr_handler *btrfs_xattr_handlers[] = { +const struct xattr_handler *btrfs_xattr_handlers[] = { #ifdef CONFIG_BTRFS_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 721efa0346e..7a43fd640bb 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -21,9 +21,9 @@ #include <linux/xattr.h> -extern struct xattr_handler btrfs_xattr_acl_access_handler; -extern struct xattr_handler btrfs_xattr_acl_default_handler; -extern struct xattr_handler *btrfs_xattr_handlers[]; +extern const struct xattr_handler btrfs_xattr_acl_access_handler; +extern const struct xattr_handler btrfs_xattr_acl_default_handler; +extern const struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); diff --git a/fs/buffer.c b/fs/buffer.c index c9c266db062..e8aa7081d25 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev) return; invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } EXPORT_SYMBOL(invalidate_bdev); @@ -560,26 +561,17 @@ repeat: return err; } -static void do_thaw_all(struct work_struct *work) +static void do_thaw_one(struct super_block *sb, void *unused) { - struct super_block *sb; char b[BDEVNAME_SIZE]; + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); +} - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) - printk(KERN_WARNING "Emergency Thaw on %s\n", - bdevname(sb->s_bdev, b)); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); +static void do_thaw_all(struct work_struct *work) +{ + iterate_supers(do_thaw_one, NULL); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a9005d862ed..d9c60b84949 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; int rc = 0; struct page **pages; - struct pagevec pvec; loff_t offset; u64 len; @@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc < 0) goto out; - /* set uptodate and add to lru in pagevec-sized chunks */ - pagevec_init(&pvec, 0); for (; !list_empty(page_list) && len > 0; rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { struct page *page = @@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, zero_user_segment(page, s, PAGE_CACHE_SIZE); } - if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { + if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { page_cache_release(page); dout("readpages %p add_to_page_cache failed %p\n", inode, page); @@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); - if (pagevec_add(&pvec, page) == 0) - pagevec_lru_add_file(&pvec); /* add to lru */ + page_cache_release(page); } - pagevec_lru_add_file(&pvec); rc = 0; out: @@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req, ceph_release_pages(req->r_pages, req->r_num_pages); if (req->r_pages_from_pool) mempool_free(req->r_pages, - ceph_client(inode->i_sb)->wb_pagevec_pool); + ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); else kfree(req->r_pages); ceph_osdc_put_request(req); diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index 818afe72e6c..9f46de2ba7a 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -150,7 +150,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac, ret = ac->ops->build_request(ac, p + sizeof(u32), end); if (ret < 0) { - pr_err("error %d building request\n", ret); + pr_err("error %d building auth method %s request\n", ret, + ac->ops->name); return ret; } dout(" built request %d bytes\n", ret); @@ -216,8 +217,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, if (ac->protocol != protocol) { ret = ceph_auth_init_protocol(ac, protocol); if (ret) { - pr_err("error %d on auth protocol %d init\n", - ret, protocol); + pr_err("error %d on auth method %s init\n", + ret, ac->ops->name); goto out; } } @@ -229,7 +230,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, if (ret == -EAGAIN) { return ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { - pr_err("authentication error %d\n", ret); + pr_err("auth method '%s' error %d\n", ac->ops->name, ret); return ret; } return 0; diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h index ca4f57cfb26..4429a707c02 100644 --- a/fs/ceph/auth.h +++ b/fs/ceph/auth.h @@ -15,6 +15,8 @@ struct ceph_auth_client; struct ceph_authorizer; struct ceph_auth_client_ops { + const char *name; + /* * true if we are authenticated and can connect to * services. diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c index 8cd9e3af07f..24407c11929 100644 --- a/fs/ceph/auth_none.c +++ b/fs/ceph/auth_none.c @@ -94,6 +94,7 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, } static const struct ceph_auth_client_ops ceph_auth_none_ops = { + .name = "none", .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index fee5a08da88..7b206231566 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -127,7 +127,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, int ret; char *dbuf; char *ticket_buf; - u8 struct_v; + u8 reply_struct_v; dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); if (!dbuf) @@ -139,14 +139,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, goto out_dbuf; ceph_decode_need(&p, end, 1 + sizeof(u32), bad); - struct_v = ceph_decode_8(&p); - if (struct_v != 1) + reply_struct_v = ceph_decode_8(&p); + if (reply_struct_v != 1) goto bad; num = ceph_decode_32(&p); dout("%d tickets\n", num); while (num--) { int type; - u8 struct_v; + u8 tkt_struct_v, blob_struct_v; struct ceph_x_ticket_handler *th; void *dp, *dend; int dlen; @@ -165,8 +165,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, type = ceph_decode_32(&p); dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); - struct_v = ceph_decode_8(&p); - if (struct_v != 1) + tkt_struct_v = ceph_decode_8(&p); + if (tkt_struct_v != 1) goto bad; th = get_ticket_handler(ac, type); @@ -186,8 +186,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, dend = dbuf + dlen; dp = dbuf; - struct_v = ceph_decode_8(&dp); - if (struct_v != 1) + tkt_struct_v = ceph_decode_8(&dp); + if (tkt_struct_v != 1) goto bad; memcpy(&old_key, &th->session_key, sizeof(old_key)); @@ -224,7 +224,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, tpend = tp + dlen; dout(" ticket blob is %d bytes\n", dlen); ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); - struct_v = ceph_decode_8(&tp); + blob_struct_v = ceph_decode_8(&tp); new_secret_id = ceph_decode_64(&tp); ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); if (ret) @@ -618,6 +618,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, static const struct ceph_auth_client_ops ceph_x_ops = { + .name = "x", .is_authenticated = ceph_x_is_authenticated, .build_request = ceph_x_build_request, .handle_reply = ceph_x_handle_reply, diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d9400534b27..0dd0b81e64f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap) { struct ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; - struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; int removed = 0; dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); @@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); - if (IS_ERR(msg)) - return PTR_ERR(msg); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); + if (!msg) + return -ENOMEM; msg->hdr.tid = cpu_to_le64(flush_tid); @@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) */ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) { - struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct inode *inode = &ci->vfs_inode; int was = ci->i_dirty_caps; int dirty = 0; @@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) static int __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session) { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing; @@ -1663,7 +1665,7 @@ ack: static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, unsigned *flush_tid) { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int unlock_session = session ? 0 : 1; int flushing = 0; @@ -1716,10 +1718,9 @@ out_unlocked: static int caps_are_flushed(struct inode *inode, unsigned tid) { struct ceph_inode_info *ci = ceph_inode(inode); - int dirty, i, ret = 1; + int i, ret = 1; spin_lock(&inode->i_lock); - dirty = __ceph_caps_dirty(ci); for (i = 0; i < CEPH_CAP_BITS; i++) if ((ci->i_flushing_caps & (1 << i)) && ci->i_cap_flush_tid[i] <= tid) { @@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } else { - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(inode->i_sb)->mdsc; spin_lock(&inode->i_lock); if (__ceph_caps_dirty(ci)) @@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, __releases(inode->i_lock) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index 0c2241ef365..3b9eeed097b 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h @@ -19,7 +19,7 @@ * Ceph release version */ #define CEPH_VERSION_MAJOR 0 -#define CEPH_VERSION_MINOR 19 +#define CEPH_VERSION_MINOR 20 #define CEPH_VERSION_PATCH 0 #define _CEPH_STRINGIFY(x) #x @@ -36,7 +36,7 @@ * client-facing protocol. */ #define CEPH_OSD_PROTOCOL 8 /* cluster internal */ -#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ +#define CEPH_MDS_PROTOCOL 12 /* cluster internal */ #define CEPH_MON_PROTOCOL 5 /* cluster internal */ #define CEPH_OSDC_PROTOCOL 24 /* server/client */ #define CEPH_MDSC_PROTOCOL 32 /* server/client */ @@ -53,8 +53,18 @@ /* * feature bits */ -#define CEPH_FEATURE_SUPPORTED 0 -#define CEPH_FEATURE_REQUIRED 0 +#define CEPH_FEATURE_UID 1 +#define CEPH_FEATURE_NOSRCADDR 2 +#define CEPH_FEATURE_FLOCK 4 + +#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK +#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID +#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR +#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR /* @@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); #define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_CEPHX 0x2 +#define CEPH_AUTH_UID_DEFAULT ((__u64) -1) + /********************************************* * message layer @@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); #define CEPH_MSG_CLIENT_SNAP 0x312 #define CEPH_MSG_CLIENT_CAPRELEASE 0x313 +/* pool ops */ +#define CEPH_MSG_POOLOP_REPLY 48 +#define CEPH_MSG_POOLOP 49 + + /* osd */ #define CEPH_MSG_OSD_MAP 41 #define CEPH_MSG_OSD_OP 42 #define CEPH_MSG_OSD_OPREPLY 43 +/* pool operations */ +enum { + POOL_OP_CREATE = 0x01, + POOL_OP_DELETE = 0x02, + POOL_OP_AUID_CHANGE = 0x03, + POOL_OP_CREATE_SNAP = 0x11, + POOL_OP_DELETE_SNAP = 0x12, + POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, + POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, +}; + struct ceph_mon_request_header { __le64 have_version; __le16 session_mon; @@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply { struct ceph_statfs st; } __attribute__ ((packed)); +const char *ceph_pool_op_name(int op); + +struct ceph_mon_poolop { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 pool; + __le32 op; + __le64 auid; + __le64 snapid; + __le32 name_len; +} __attribute__ ((packed)); + +struct ceph_mon_poolop_reply { + struct ceph_mon_request_header monhdr; + struct ceph_fsid fsid; + __le32 reply_code; + __le32 epoch; + char has_data; + char data[0]; +} __attribute__ ((packed)); + +struct ceph_mon_unmanaged_snap { + __le64 snapid; +} __attribute__ ((packed)); + struct ceph_osd_getmap { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; @@ -308,6 +361,7 @@ union ceph_mds_request_args { struct { __le32 frag; /* which dir fragment */ __le32 max_entries; /* how many dentries to grab */ + __le32 max_bytes; } __attribute__ ((packed)) readdir; struct { __le32 mode; diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c index 8e4be6a80c6..7503aee828c 100644 --- a/fs/ceph/ceph_strings.c +++ b/fs/ceph/ceph_strings.c @@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type) case CEPH_ENTITY_TYPE_OSD: return "osd"; case CEPH_ENTITY_TYPE_MON: return "mon"; case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_ADMIN: return "admin"; case CEPH_ENTITY_TYPE_AUTH: return "auth"; default: return "unknown"; } @@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op) case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; case CEPH_OSD_OP_RMXATTR: return "rmxattr"; + case CEPH_OSD_OP_CMPXATTR: return "cmpxattr"; case CEPH_OSD_OP_PULL: return "pull"; case CEPH_OSD_OP_PUSH: return "push"; @@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o) } return "???"; } + +const char *ceph_pool_op_name(int op) +{ + switch (op) { + case POOL_OP_CREATE: return "create"; + case POOL_OP_DELETE: return "delete"; + case POOL_OP_AUID_CHANGE: return "auid change"; + case POOL_OP_CREATE_SNAP: return "create snap"; + case POOL_OP_DELETE_SNAP: return "delete snap"; + case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; + case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; + } + return "???"; +} diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f7048da92ac..3be33fb066c 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p) static int monc_show(struct seq_file *s, void *p) { struct ceph_client *client = s->private; - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct ceph_mon_client *monc = &client->monc; struct rb_node *rp; @@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p) if (monc->want_next_osdmap) seq_printf(s, "want next osdmap\n"); - for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { - req = rb_entry(rp, struct ceph_mon_statfs_request, node); - seq_printf(s, "%lld statfs\n", req->tid); + for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { + __u16 op; + req = rb_entry(rp, struct ceph_mon_generic_request, node); + op = le16_to_cpu(req->request->hdr.type); + if (op == CEPH_MSG_STATFS) + seq_printf(s, "%lld statfs\n", req->tid); + else + seq_printf(s, "%lld unknown\n", req->tid); } mutex_unlock(&monc->mutex); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 650d2db5ed2..4fd30900eff 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry) return -ENOMEM; /* oh well */ spin_lock(&dentry->d_lock); - if (dentry->d_fsdata) /* lost a race */ + if (dentry->d_fsdata) { + /* lost a race */ + kmem_cache_free(ceph_dentry_cachep, di); goto out_unlock; + } di->dentry = dentry; di->lease_session = NULL; dentry->d_fsdata = di; @@ -125,7 +128,8 @@ more: dentry = list_entry(p, struct dentry, d_u.d_child); di = ceph_dentry(dentry); while (1) { - dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, + dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, + d_unhashed(dentry) ? "!hashed" : "hashed", parent->d_subdirs.prev, parent->d_subdirs.next); if (p == &parent->d_subdirs) { fi->at_end = 1; @@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; const int max_entries = client->mount_args->max_readdir; + const int max_bytes = client->mount_args->max_readdir_bytes; dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); if (fi->at_end) @@ -312,6 +317,7 @@ more: req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.max_entries = cpu_to_le32(max_entries); + req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes); req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { @@ -335,7 +341,7 @@ more: if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 0; + fi->next_offset = 2; } else { rinfo = &req->r_reply_info; err = note_last_dentry(fi, @@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct inode *parent = dentry->d_parent->d_inode; /* .snap dir? */ @@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, !is_root_ceph_dentry(dir, dentry) && (ci->i_ceph_flags & CEPH_I_COMPLETE) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { - di->offset = ci->i_max_offset++; spin_unlock(&dir->i_lock); dout(" dir %p complete, -ENOENT\n", dir); d_add(dentry, NULL); @@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ - new_dentry->d_time = jiffies; - ceph_dentry(new_dentry)->lease_shared_gen = 0; + ceph_invalidate_dentry_lease(new_dentry); } ceph_mdsc_put_request(req); return err; } +/* + * Ensure a dentry lease will no longer revalidate. + */ +void ceph_invalidate_dentry_lease(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_time = jiffies; + ceph_dentry(dentry)->lease_shared_gen = 0; + spin_unlock(&dentry->d_lock); +} /* * Check if dentry lease is valid. If not, delete the lease. Try to @@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir = dentry->d_parent->d_inode; - dout("d_revalidate %p '%.*s' inode %p\n", dentry, - dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, + dentry->d_name.len, dentry->d_name.name, dentry->d_inode, + ceph_dentry(dentry)->offset); /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { @@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, struct ceph_inode_info *ci = ceph_inode(inode); int left; - if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) + if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) return -EISDIR; if (!cf->dir_info) { @@ -1152,7 +1167,7 @@ void ceph_dentry_lru_add(struct dentry *dn) dout("dentry_lru_add %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_add_tail(&di->lru, &mdsc->dentry_lru); mdsc->num_dentry++; @@ -1165,10 +1180,10 @@ void ceph_dentry_lru_touch(struct dentry *dn) struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_mds_client *mdsc; - dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, - dn->d_name.len, dn->d_name.name); + dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, + dn->d_name.len, dn->d_name.name, di->offset); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_move_tail(&di->lru, &mdsc->dentry_lru); spin_unlock(&mdsc->dentry_lru_lock); @@ -1183,7 +1198,7 @@ void ceph_dentry_lru_del(struct dentry *dn) dout("dentry_lru_del %p %p '%.*s'\n", di, dn, dn->d_name.len, dn->d_name.name); if (di) { - mdsc = &ceph_client(dn->d_sb)->mdsc; + mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_del_init(&di->lru); mdsc->num_dentry--; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 9d67572fb32..17447644d67 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, return ERR_PTR(-ESTALE); dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", fh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); @@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, static struct dentry *__cfh_to_dentry(struct super_block *sb, struct ceph_nfs_confh *cfh) { - struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; struct inode *inode; struct dentry *dentry; struct ceph_vino vino; @@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, } dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", cfh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { @@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, return ERR_PTR(-ESTALE); dentry = d_obtain_alias(inode); - if (!dentry) { + if (IS_ERR(dentry)) { pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", cfh->ino, inode); iput(inode); - return ERR_PTR(-ENOMEM); + return dentry; } err = ceph_init_dentry(dentry); if (err < 0) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ed6f19721d6..6512b6701b9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages) /* * allocate a vector new pages */ -static struct page **alloc_page_vector(int num_pages) +struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) { struct page **pages; int i; - pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + pages = kmalloc(sizeof(*pages) * num_pages, flags); if (!pages) return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); + pages[i] = __page_cache_alloc(flags); if (pages[i] == NULL) { ceph_release_page_vector(pages, i); return ERR_PTR(-ENOMEM); @@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, * in sequence. */ } else { - pages = alloc_page_vector(num_pages); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); } if (IS_ERR(pages)) return PTR_ERR(pages); @@ -649,8 +649,8 @@ more: do_sync, ci->i_truncate_seq, ci->i_truncate_size, &mtime, false, 2); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; num_pages = calc_pages_for(pos, len); @@ -668,7 +668,7 @@ more: truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_CACHE_SIZE-1)); } else { - pages = alloc_page_vector(num_pages); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; loff_t endoff = pos + iov->iov_len; int got = 0; int ret, err; @@ -844,8 +844,7 @@ retry_snap: if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, file->f_path.dentry, - pos, pos + ret - 1, 1); + err = vfs_fsync_range(file, pos, pos + ret - 1, 1); if (err < 0) ret = err; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 85b4d2ffdeb..a81b8b662c7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - &ceph_client(ci->vfs_inode.i_sb)->mdsc; + &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", realm); @@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode, memcpy(ci->i_xattrs.blob->vec.iov_base, iinfo->xattr_data, iinfo->xattr_len); ci->i_xattrs.version = le64_to_cpu(info->xattr_version); + xattr_blob = NULL; } inode->i_mapping->a_ops = &ceph_aops; inode->i_mapping->backing_dev_info = - &ceph_client(inode->i_sb)->backing_dev_info; + &ceph_sb_to_client(inode->i_sb)->backing_dev_info; switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode, /* set dir completion flag? */ if (ci->i_files == 0 && ci->i_subdirs == 0 && ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { + (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { dout(" marking %p complete (empty)\n", inode); ci->i_ceph_flags |= CEPH_I_COMPLETE; ci->i_max_offset = 2; } /* it may be better to set st_size in getattr instead? */ - if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) + if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) inode->i_size = ci->i_rbytes; break; default: @@ -802,6 +804,37 @@ out_unlock: } /* + * Set dentry's directory position based on the current dir's max, and + * order it in d_subdirs, so that dcache_readdir behaves. + */ +static void ceph_set_dentry_offset(struct dentry *dn) +{ + struct dentry *dir = dn->d_parent; + struct inode *inode = dn->d_parent->d_inode; + struct ceph_dentry_info *di; + + BUG_ON(!inode); + + di = ceph_dentry(dn); + + spin_lock(&inode->i_lock); + if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) { + spin_unlock(&inode->i_lock); + return; + } + di->offset = ceph_inode(inode)->i_max_offset++; + spin_unlock(&inode->i_lock); + + spin_lock(&dcache_lock); + spin_lock(&dn->d_lock); + list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); + dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, + dn->d_u.d_child.prev, dn->d_u.d_child.next); + spin_unlock(&dn->d_lock); + spin_unlock(&dcache_lock); +} + +/* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. * @@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, { struct dentry *realdn; + BUG_ON(dn->d_inode); + /* dn must be unhashed */ if (!d_unhashed(dn)) d_drop(dn); @@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, dn = realdn; } else { BUG_ON(!ceph_dentry(dn)); - dout("dn %p attached to %p ino %llx.%llx\n", dn, dn->d_inode, ceph_vinop(dn->d_inode)); } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); + ceph_set_dentry_offset(dn); out: return dn; } /* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dn->d_parent->d_inode; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - di = ceph_dentry(dn); - - spin_lock(&inode->i_lock); - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&inode->i_lock); - - spin_lock(&dcache_lock); - spin_lock(&dn->d_lock); - list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dcache_lock); -} - -/* * Incorporate results into the local cache. This is either just * one inode, or a directory, dentry, and possibly linked-to inode (e.g., * after a lookup). @@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, if (!rinfo->head->is_target && !rinfo->head->is_dentry) { dout("fill_trace reply is empty!\n"); - if (rinfo->head->result == 0 && req->r_locked_dir) { - struct ceph_inode_info *ci = - ceph_inode(req->r_locked_dir); - dout(" clearing %p complete (empty trace)\n", - req->r_locked_dir); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - } + if (rinfo->head->result == 0 && req->r_locked_dir) + ceph_invalidate_dir_request(req); return 0; } @@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, req->r_old_dentry->d_name.len, req->r_old_dentry->d_name.name, dn, dn->d_name.len, dn->d_name.name); + /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ - dn->d_time = jiffies; - ceph_dentry(dn)->lease_shared_gen = 0; + ceph_invalidate_dentry_lease(dn); + /* take overwritten dentry's readdir offset */ + dout("dn %p gets %p offset %lld (old offset %lld)\n", + req->r_old_dentry, dn, ceph_dentry(dn)->offset, + ceph_dentry(req->r_old_dentry)->offset); ceph_dentry(req->r_old_dentry)->offset = ceph_dentry(dn)->offset; + dn = req->r_old_dentry; /* use old_dentry */ in = dn->d_inode; } @@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - ceph_set_dentry_offset(dn); igrab(in); } else if (ceph_ino(in) == vino.ino && ceph_snap(in) == vino.snap) { @@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, err = PTR_ERR(dn); goto done; } - ceph_set_dentry_offset(dn); req->r_dentry = dn; /* may have spliced */ igrab(in); rinfo->head->is_dentry = 1; /* fool notrace handlers */ @@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); - if (queue_work(ceph_client(inode->i_sb)->trunc_wq, + if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); igrab(inode); @@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) struct inode *parent_inode = dentry->d_parent->d_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; - struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; int issued; int release = 0, dirtied = 0; int mask = 0; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 8a5bcae6284..d085f07756b 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_ioctl_dataloc dl; struct inode *inode = file->f_dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; + struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; u64 len = 1, olen; u64 tmp; struct ceph_object_layout ol; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 24561a557e0..885aa5710cf 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -40,7 +40,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); -const static struct ceph_connection_operations mds_con_ops; +static const struct ceph_connection_operations mds_con_ops; /* @@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) struct ceph_msg *msg; struct ceph_mds_session_head *h; - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); - if (IS_ERR(msg)) { + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); + if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } h = msg->front.iov_base; h->op = cpu_to_le32(op); @@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc, struct ceph_msg *msg; int mstate; int mds = session->s_mds; - int err = 0; /* wait for mds to go active? */ mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); @@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc, /* send connect message */ msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); - if (IS_ERR(msg)) { - err = PTR_ERR(msg); - goto out; - } + if (!msg) + return -ENOMEM; ceph_con_send(&session->s_con, msg); - -out: return 0; } @@ -804,12 +799,49 @@ out: } static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, - void *arg) + void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); + int drop = 0; + dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); - ceph_remove_cap(cap); + spin_lock(&inode->i_lock); + __ceph_remove_cap(cap); + if (!__ceph_is_any_real_caps(ci)) { + struct ceph_mds_client *mdsc = + &ceph_sb_to_client(inode->i_sb)->mdsc; + + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + pr_info(" dropping dirty %s state for %p %lld\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_ino(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + drop = 1; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_info(" dropping dirty+flushing %s state for %p %lld\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_ino(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + drop = 1; + } + if (drop && ci->i_wrbuffer_ref) { + pr_info(" dropping dirty data for %p %lld\n", + inode, ceph_ino(inode)); + ci->i_wrbuffer_ref = 0; + ci->i_wrbuffer_ref_head = 0; + drop++; + } + spin_unlock(&mdsc->cap_dirty_lock); + } + spin_unlock(&inode->i_lock); + while (drop--) + iput(inode); return 0; } @@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session) dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, NULL); BUG_ON(session->s_nr_caps > 0); + BUG_ON(!list_empty(&session->s_cap_flushing)); cleanup_cap_releases(session); } @@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, ceph_mds_state_name(state)); msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, ++session->s_renew_seq); - if (IS_ERR(msg)) - return PTR_ERR(msg); + if (!msg) + return -ENOMEM; ceph_con_send(&session->s_con, msg); return 0; } @@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_msg *msg; - int err = 0; dout("request_close_session mds%d state %s seq %lld\n", session->s_mds, session_state_name(session->s_state), session->s_seq); msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); - if (IS_ERR(msg)) - err = PTR_ERR(msg); - else - ceph_con_send(&session->s_con, msg); - return err; + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; } /* @@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc, while (session->s_num_cap_releases < session->s_nr_caps + extra) { spin_unlock(&session->s_cap_lock); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - 0, 0, NULL); + GFP_NOFS); if (!msg) goto out_unlocked; dout("add_cap_releases %p msg %p now %d\n", session, msg, @@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg *msg; dout("send_cap_releases mds%d\n", session->s_mds); - while (1) { - spin_lock(&session->s_cap_lock); - if (list_empty(&session->s_cap_releases_done)) - break; + spin_lock(&session->s_cap_lock); + while (!list_empty(&session->s_cap_releases_done)) { msg = list_first_entry(&session->s_cap_releases_done, struct ceph_msg, list_head); list_del_init(&msg->list_head); @@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc, msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); dout("send_cap_releases mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); + spin_lock(&session->s_cap_lock); } spin_unlock(&session->s_cap_lock); } +static void discard_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_msg *msg; + struct ceph_mds_cap_release *head; + unsigned num; + + dout("discard_cap_releases mds%d\n", session->s_mds); + spin_lock(&session->s_cap_lock); + + /* zero out the in-progress message */ + msg = list_first_entry(&session->s_cap_releases, + struct ceph_msg, list_head); + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); + head->num = cpu_to_le32(0); + session->s_num_cap_releases += num; + + /* requeue completed messages */ + while (!list_empty(&session->s_cap_releases_done)) { + msg = list_first_entry(&session->s_cap_releases_done, + struct ceph_msg, list_head); + list_del_init(&msg->list_head); + + head = msg->front.iov_base; + num = le32_to_cpu(head->num); + dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, + num); + session->s_num_cap_releases += num; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + list_add(&msg->list_head, &session->s_cap_releases); + } + + spin_unlock(&session->s_cap_lock); +} + /* * requests */ @@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) if (!req) return ERR_PTR(-ENOMEM); + mutex_init(&req->r_fill_mutex); req->r_started = jiffies; req->r_resend_mds = -1; INIT_LIST_HEAD(&req->r_unsafe_dir_item); @@ -1251,7 +1320,7 @@ retry: len += 1 + temp->d_name.len; temp = temp->d_parent; if (temp == NULL) { - pr_err("build_path_dentry corrupt dentry %p\n", dentry); + pr_err("build_path corrupt dentry %p\n", dentry); return ERR_PTR(-EINVAL); } } @@ -1267,7 +1336,7 @@ retry: struct inode *inode = temp->d_inode; if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { - dout("build_path_dentry path+%d: %p SNAPDIR\n", + dout("build_path path+%d: %p SNAPDIR\n", pos, temp); } else if (stop_on_nosnap && inode && ceph_snap(inode) == CEPH_NOSNAP) { @@ -1278,20 +1347,18 @@ retry: break; strncpy(path + pos, temp->d_name.name, temp->d_name.len); - dout("build_path_dentry path+%d: %p '%.*s'\n", - pos, temp, temp->d_name.len, path + pos); } if (pos) path[--pos] = '/'; temp = temp->d_parent; if (temp == NULL) { - pr_err("build_path_dentry corrupt dentry\n"); + pr_err("build_path corrupt dentry\n"); kfree(path); return ERR_PTR(-EINVAL); } } if (pos != 0) { - pr_err("build_path_dentry did not end path lookup where " + pr_err("build_path did not end path lookup where " "expected, namelen is %d, pos is %d\n", len, pos); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not @@ -1303,7 +1370,7 @@ retry: *base = ceph_ino(temp->d_inode); *plen = len; - dout("build_path_dentry on %p %d built %llx '%.*s'\n", + dout("build_path on %p %d built %llx '%.*s'\n", dentry, atomic_read(&dentry->d_count), *base, len, path); return path; } @@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_old_dentry_drop) len += req->r_old_dentry->d_name.len; - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); - if (IS_ERR(msg)) + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); + if (!msg) { + msg = ERR_PTR(-ENOMEM); goto out_free2; + } msg->hdr.tid = cpu_to_le64(req->r_tid); @@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, } msg = create_request_message(mdsc, req, mds); if (IS_ERR(msg)) { - req->r_reply = ERR_PTR(PTR_ERR(msg)); + req->r_err = PTR_ERR(msg); complete_request(mdsc, req); - return -PTR_ERR(msg); + return PTR_ERR(msg); } req->r_request = msg; @@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = -EAGAIN; - if (req->r_reply) + if (req->r_err || req->r_got_result) goto out; if (req->r_timeout && @@ -1609,7 +1678,7 @@ out: return err; finish: - req->r_reply = ERR_PTR(err); + req->r_err = err; complete_request(mdsc, req); goto out; } @@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc, /* * Wake up threads with requests pending for @mds, so that they can - * resubmit their requests to a possibly different mds. If @all is set, - * wake up if their requests has been forwarded to @mds, too. + * resubmit their requests to a possibly different mds. */ -static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) +static void kick_requests(struct ceph_mds_client *mdsc, int mds) { struct ceph_mds_request *req; struct rb_node *p; @@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, __register_request(mdsc, req, dir); __do_request(mdsc, req); - /* wait */ - if (!req->r_reply) { - mutex_unlock(&mdsc->mutex); - if (req->r_timeout) { - err = (long)wait_for_completion_interruptible_timeout( - &req->r_completion, req->r_timeout); - if (err == 0) - req->r_reply = ERR_PTR(-EIO); - else if (err < 0) - req->r_reply = ERR_PTR(err); - } else { - err = wait_for_completion_interruptible( - &req->r_completion); - if (err) - req->r_reply = ERR_PTR(err); - } - mutex_lock(&mdsc->mutex); + if (req->r_err) { + err = req->r_err; + __unregister_request(mdsc, req); + dout("do_request early error %d\n", err); + goto out; } - if (IS_ERR(req->r_reply)) { - err = PTR_ERR(req->r_reply); - req->r_reply = NULL; + /* wait */ + mutex_unlock(&mdsc->mutex); + dout("do_request waiting\n"); + if (req->r_timeout) { + err = (long)wait_for_completion_interruptible_timeout( + &req->r_completion, req->r_timeout); + if (err == 0) + err = -EIO; + } else { + err = wait_for_completion_interruptible(&req->r_completion); + } + dout("do_request waited, got %d\n", err); + mutex_lock(&mdsc->mutex); - if (err == -ERESTARTSYS) { - /* aborted */ - req->r_aborted = true; + /* only abort if we didn't race with a real reply */ + if (req->r_got_result) { + err = le32_to_cpu(req->r_reply_info.head->result); + } else if (err < 0) { + dout("aborted request %lld with %d\n", req->r_tid, err); - if (req->r_locked_dir && - (req->r_op & CEPH_MDS_OP_WRITE)) { - struct ceph_inode_info *ci = - ceph_inode(req->r_locked_dir); + /* + * ensure we aren't running concurrently with + * ceph_fill_trace or ceph_readdir_prepopulate, which + * rely on locks (dir mutex) held by our caller. + */ + mutex_lock(&req->r_fill_mutex); + req->r_err = err; + req->r_aborted = true; + mutex_unlock(&req->r_fill_mutex); - dout("aborted, clearing I_COMPLETE on %p\n", - req->r_locked_dir); - spin_lock(&req->r_locked_dir->i_lock); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - spin_unlock(&req->r_locked_dir->i_lock); - } - } else { - /* clean up this request */ - __unregister_request(mdsc, req); - if (!list_empty(&req->r_unsafe_item)) - list_del_init(&req->r_unsafe_item); - complete(&req->r_safe_completion); - } - } else if (req->r_err) { - err = req->r_err; + if (req->r_locked_dir && + (req->r_op & CEPH_MDS_OP_WRITE)) + ceph_invalidate_dir_request(req); } else { - err = le32_to_cpu(req->r_reply_info.head->result); + err = req->r_err; } - mutex_unlock(&mdsc->mutex); +out: + mutex_unlock(&mdsc->mutex); dout("do_request %p done, result %d\n", req, err); return err; } /* + * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS + * namespace request. + */ +void ceph_invalidate_dir_request(struct ceph_mds_request *req) +{ + struct inode *inode = req->r_locked_dir; + struct ceph_inode_info *ci = ceph_inode(inode); + + dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode); + spin_lock(&inode->i_lock); + ci->i_ceph_flags &= ~CEPH_I_COMPLETE; + ci->i_release_count++; + spin_unlock(&inode->i_lock); + + if (req->r_dentry) + ceph_invalidate_dentry_lease(req->r_dentry); + if (req->r_old_dentry) + ceph_invalidate_dentry_lease(req->r_old_dentry); +} + +/* * Handle mds reply. * * We take the session mutex and parse and process the reply immediately. @@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } + if (req->r_got_safe && !head->safe) { + pr_warning("got unsafe after safe on %llu from mds%d\n", + tid, mds); + mutex_unlock(&mdsc->mutex); + goto out; + } result = le32_to_cpu(head->result); @@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&mdsc->mutex); goto out; } - } - - BUG_ON(req->r_reply); - - if (!head->safe) { + } else { req->r_got_unsafe = true; list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); } @@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } /* insert trace into our cache */ + mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); if (err == 0) { if (result == 0 && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(&req->r_caps_reservation); } + mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); out_err: - if (err) { - req->r_err = err; + mutex_lock(&mdsc->mutex); + if (!req->r_aborted) { + if (err) { + req->r_err = err; + } else { + req->r_reply = msg; + ceph_msg_get(msg); + req->r_got_result = true; + } } else { - req->r_reply = msg; - ceph_msg_get(msg); + dout("reply arrived after request %lld was aborted\n", tid); } + mutex_unlock(&mdsc->mutex); add_cap_releases(mdsc, req->r_session, -1); mutex_unlock(&session->s_mutex); @@ -1984,6 +2077,8 @@ static void handle_session(struct ceph_mds_session *session, switch (op) { case CEPH_SESSION_OPEN: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect success\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_OPEN; renewed_caps(mdsc, session, 0); wake = 1; @@ -1997,10 +2092,12 @@ static void handle_session(struct ceph_mds_session *session, break; case CEPH_SESSION_CLOSE: + if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) + pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ complete(&mdsc->session_close_waiters); - kick_requests(mdsc, mds, 0); /* cur only */ + kick_requests(mdsc, mds); break; case CEPH_SESSION_STALE: @@ -2132,54 +2229,44 @@ out: * * called with mdsc->mutex held. */ -static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) +static void send_mds_reconnect(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) { - struct ceph_mds_session *session = NULL; struct ceph_msg *reply; struct rb_node *p; + int mds = session->s_mds; int err = -ENOMEM; struct ceph_pagelist *pagelist; - pr_info("reconnect to recovering mds%d\n", mds); + pr_info("mds%d reconnect start\n", mds); pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); if (!pagelist) goto fail_nopagelist; ceph_pagelist_init(pagelist); - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); + if (!reply) goto fail_nomsg; - } - - /* find session */ - session = __ceph_lookup_mds_session(mdsc, mds); - mutex_unlock(&mdsc->mutex); /* drop lock for duration */ - if (session) { - mutex_lock(&session->s_mutex); + mutex_lock(&session->s_mutex); + session->s_state = CEPH_MDS_SESSION_RECONNECTING; + session->s_seq = 0; - session->s_state = CEPH_MDS_SESSION_RECONNECTING; - session->s_seq = 0; + ceph_con_open(&session->s_con, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); - ceph_con_open(&session->s_con, - ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); - - /* replay unsafe requests */ - replay_unsafe_requests(mdsc, session); - } else { - dout("no session for mds%d, will send short reconnect\n", - mds); - } + /* replay unsafe requests */ + replay_unsafe_requests(mdsc, session); down_read(&mdsc->snap_rwsem); - if (!session) - goto send; dout("session %p state %s\n", session, session_state_name(session->s_state)); + /* drop old cap expires; we're about to reestablish that state */ + discard_cap_releases(mdsc, session); + /* traverse this session's caps */ err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); if (err) @@ -2208,36 +2295,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) goto fail; } -send: reply->pagelist = pagelist; reply->hdr.data_len = cpu_to_le32(pagelist->length); reply->nr_pages = calc_pages_for(0, pagelist->length); ceph_con_send(&session->s_con, reply); - session->s_state = CEPH_MDS_SESSION_OPEN; mutex_unlock(&session->s_mutex); mutex_lock(&mdsc->mutex); __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); - ceph_put_mds_session(session); - up_read(&mdsc->snap_rwsem); - mutex_lock(&mdsc->mutex); return; fail: ceph_msg_put(reply); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); fail_nomsg: ceph_pagelist_release(pagelist); kfree(pagelist); fail_nopagelist: pr_err("error %d preparing reconnect for mds%d\n", err, mds); - mutex_lock(&mdsc->mutex); return; } @@ -2290,7 +2370,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } /* kick any requests waiting on the recovering mds */ - kick_requests(mdsc, i, 1); + kick_requests(mdsc, i); } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } @@ -2299,22 +2379,21 @@ static void check_new_map(struct ceph_mds_client *mdsc, * send reconnect? */ if (s->s_state == CEPH_MDS_SESSION_RESTARTING && - newstate >= CEPH_MDS_STATE_RECONNECT) - send_mds_reconnect(mdsc, i); + newstate >= CEPH_MDS_STATE_RECONNECT) { + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + mutex_lock(&mdsc->mutex); + } /* - * kick requests on any mds that has gone active. - * - * kick requests on cur or forwarder: we may have sent - * the request to mds1, mds1 told us it forwarded it - * to mds2, but then we learn mds1 failed and can't be - * sure it successfully forwarded our request before - * it died. + * kick request on any mds that has gone active. */ if (oldstate < CEPH_MDS_STATE_ACTIVE && newstate >= CEPH_MDS_STATE_ACTIVE) { - pr_info("mds%d reconnect completed\n", s->s_mds); - kick_requests(mdsc, i, 1); + if (oldstate != CEPH_MDS_STATE_CREATING && + oldstate != CEPH_MDS_STATE_STARTING) + pr_info("mds%d recovery completed\n", s->s_mds); + kick_requests(mdsc, i); ceph_kick_flushing_caps(mdsc, s); wake_up_session_caps(s, 1); } @@ -2457,8 +2536,8 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, dnamelen = dentry->d_name.len; len += dnamelen; - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); - if (IS_ERR(msg)) + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); + if (!msg) return; lease = msg->front.iov_base; lease->action = action; @@ -2603,7 +2682,9 @@ static void delayed_work(struct work_struct *work) else ceph_con_keepalive(&s->s_con); add_cap_releases(mdsc, s, -1); - send_cap_releases(mdsc, s); + if (s->s_state == CEPH_MDS_SESSION_OPEN || + s->s_state == CEPH_MDS_SESSION_HUNG) + send_cap_releases(mdsc, s); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); @@ -2620,6 +2701,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) mdsc->client = client; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); + if (mdsc->mdsmap == NULL) + return -ENOMEM; + init_completion(&mdsc->safe_umount_waiters); init_completion(&mdsc->session_close_waiters); INIT_LIST_HEAD(&mdsc->waiting_for_map); @@ -2645,6 +2729,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) init_waitqueue_head(&mdsc->cap_flushing_wq); spin_lock_init(&mdsc->dentry_lru_lock); INIT_LIST_HEAD(&mdsc->dentry_lru); + return 0; } @@ -2740,6 +2825,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; + if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + return; + dout("sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; @@ -2922,9 +3010,10 @@ static void con_put(struct ceph_connection *con) static void peer_reset(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; - pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", - s->s_mds); + pr_warning("mds%d closed our session\n", s->s_mds); + send_mds_reconnect(mdsc, s); } static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) @@ -3031,7 +3120,7 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&mdsc->client->monc); } -const static struct ceph_connection_operations mds_con_ops = { +static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, .dispatch = dispatch, diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 961cc6f6587..d9936c4f121 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -165,6 +165,8 @@ struct ceph_mds_request { struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ struct inode *r_target_inode; /* resulting inode */ + struct mutex r_fill_mutex; + union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ @@ -213,7 +215,7 @@ struct ceph_mds_request { struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; struct list_head r_unsafe_item; /* per-session unsafe list item */ - bool r_got_unsafe, r_got_safe; + bool r_got_unsafe, r_got_safe, r_got_result; bool r_did_prepopulate; u32 r_readdir_offset; @@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode, struct dentry *dn, int mask); +extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); + extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index cd4fadb6491..60b74839ebe 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con); static void con_work(struct work_struct *); static void ceph_fault(struct ceph_connection *con); -const char *ceph_name_type_str(int t) -{ - switch (t) { - case CEPH_ENTITY_TYPE_MON: return "mon"; - case CEPH_ENTITY_TYPE_MDS: return "mds"; - case CEPH_ENTITY_TYPE_OSD: return "osd"; - case CEPH_ENTITY_TYPE_CLIENT: return "client"; - case CEPH_ENTITY_TYPE_ADMIN: return "admin"; - default: return "???"; - } -} - /* * nicely render a sockaddr as a string. */ @@ -340,6 +328,7 @@ static void reset_connection(struct ceph_connection *con) ceph_msg_put(con->out_msg); con->out_msg = NULL; } + con->out_keepalive_pending = false; con->in_seq = 0; con->in_seq_acked = 0; } @@ -357,6 +346,7 @@ void ceph_con_close(struct ceph_connection *con) clear_bit(WRITE_PENDING, &con->state); mutex_lock(&con->mutex); reset_connection(con); + con->peer_global_seq = 0; cancel_delayed_work(&con->work); mutex_unlock(&con->mutex); queue_con(con); @@ -661,7 +651,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr, dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); - con->out_connect.features = CEPH_FEATURE_SUPPORTED; + con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT; con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); @@ -1124,8 +1114,8 @@ static void fail_protocol(struct ceph_connection *con) static int process_connect(struct ceph_connection *con) { - u64 sup_feat = CEPH_FEATURE_SUPPORTED; - u64 req_feat = CEPH_FEATURE_REQUIRED; + u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT; + u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT; u64 server_feat = le64_to_cpu(con->in_reply.features); dout("process_connect on %p tag %d\n", con, (int)con->in_tag); @@ -1233,6 +1223,7 @@ static int process_connect(struct ceph_connection *con) clear_bit(CONNECTING, &con->state); con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; + con->peer_features = server_feat; dout("process_connect got READY gseq %d cseq %d (%d)\n", con->peer_global_seq, le32_to_cpu(con->in_reply.connect_seq), @@ -1402,19 +1393,17 @@ static int read_partial_message(struct ceph_connection *con) con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); if (skip) { /* skip this message */ - dout("alloc_msg returned NULL, skipping message\n"); + dout("alloc_msg said skip message\n"); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; return 0; } - if (IS_ERR(con->in_msg)) { - ret = PTR_ERR(con->in_msg); - con->in_msg = NULL; + if (!con->in_msg) { con->error_msg = "error allocating memory for incoming message"; - return ret; + return -ENOMEM; } m = con->in_msg; m->front.iov_len = 0; /* haven't read it yet */ @@ -1514,14 +1503,14 @@ static void process_message(struct ceph_connection *con) /* if first message, set peer_name */ if (con->peer_name.type == 0) - con->peer_name = msg->hdr.src.name; + con->peer_name = msg->hdr.src; con->in_seq++; mutex_unlock(&con->mutex); dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), - ENTITY_NAME(msg->hdr.src.name), + ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), @@ -1546,7 +1535,6 @@ static int try_write(struct ceph_connection *con) dout("try_write start %p state %lu nref %d\n", con, con->state, atomic_read(&con->nref)); - mutex_lock(&con->mutex); more: dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); @@ -1639,7 +1627,6 @@ do_next: done: ret = 0; out: - mutex_unlock(&con->mutex); dout("try_write done on %p\n", con); return ret; } @@ -1651,7 +1638,6 @@ out: */ static int try_read(struct ceph_connection *con) { - struct ceph_messenger *msgr; int ret = -1; if (!con->sock) @@ -1661,9 +1647,6 @@ static int try_read(struct ceph_connection *con) return 0; dout("try_read start on %p\n", con); - msgr = con->msgr; - - mutex_lock(&con->mutex); more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, @@ -1758,7 +1741,6 @@ more: done: ret = 0; out: - mutex_unlock(&con->mutex); dout("try_read done on %p\n", con); return ret; @@ -1830,6 +1812,8 @@ more: dout("con_work %p start, clearing QUEUED\n", con); clear_bit(QUEUED, &con->state); + mutex_lock(&con->mutex); + if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ dout("con_work CLOSED\n"); con_close_socket(con); @@ -1844,11 +1828,16 @@ more: if (test_and_clear_bit(SOCK_CLOSED, &con->state) || try_read(con) < 0 || try_write(con) < 0) { + mutex_unlock(&con->mutex); backoff = 1; ceph_fault(con); /* error/fault path */ + goto done_unlocked; } done: + mutex_unlock(&con->mutex); + +done_unlocked: clear_bit(BUSY, &con->state); dout("con->state=%lu\n", con->state); if (test_bit(QUEUED, &con->state)) { @@ -1947,7 +1936,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) /* the zero page is needed if a request is "canceled" while the message * is being written over the socket */ - msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); if (!msgr->zero_page) { kfree(msgr); return ERR_PTR(-ENOMEM); @@ -1987,9 +1976,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) } /* set src+dst */ - msg->hdr.src.name = con->msgr->inst.name; - msg->hdr.src.addr = con->msgr->my_enc_addr; - msg->hdr.orig_src = msg->hdr.src; + msg->hdr.src = con->msgr->inst.name; BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); @@ -2083,12 +2070,11 @@ void ceph_con_keepalive(struct ceph_connection *con) * construct a new message with given type, size * the new msg has a ref count of 1. */ -struct ceph_msg *ceph_msg_new(int type, int front_len, - int page_len, int page_off, struct page **pages) +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) { struct ceph_msg *m; - m = kmalloc(sizeof(*m), GFP_NOFS); + m = kmalloc(sizeof(*m), flags); if (m == NULL) goto out; kref_init(&m->kref); @@ -2100,8 +2086,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, m->hdr.version = 0; m->hdr.front_len = cpu_to_le32(front_len); m->hdr.middle_len = 0; - m->hdr.data_len = cpu_to_le32(page_len); - m->hdr.data_off = cpu_to_le16(page_off); + m->hdr.data_len = 0; + m->hdr.data_off = 0; m->hdr.reserved = 0; m->footer.front_crc = 0; m->footer.middle_crc = 0; @@ -2115,11 +2101,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, /* front */ if (front_len) { if (front_len > PAGE_CACHE_SIZE) { - m->front.iov_base = __vmalloc(front_len, GFP_NOFS, + m->front.iov_base = __vmalloc(front_len, flags, PAGE_KERNEL); m->front_is_vmalloc = true; } else { - m->front.iov_base = kmalloc(front_len, GFP_NOFS); + m->front.iov_base = kmalloc(front_len, flags); } if (m->front.iov_base == NULL) { pr_err("msg_new can't allocate %d bytes\n", @@ -2135,19 +2121,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, m->middle = NULL; /* data */ - m->nr_pages = calc_pages_for(page_off, page_len); - m->pages = pages; + m->nr_pages = 0; + m->pages = NULL; m->pagelist = NULL; - dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, - m->nr_pages); + dout("ceph_msg_new %p front %d\n", m, front_len); return m; out2: ceph_msg_put(m); out: - pr_err("msg_new can't create type %d len %d\n", type, front_len); - return ERR_PTR(-ENOMEM); + pr_err("msg_new can't create type %d front %d\n", type, front_len); + return NULL; } /* @@ -2190,29 +2175,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (IS_ERR(msg)) - return msg; - - if (*skip) + if (!msg || *skip) return NULL; } if (!msg) { *skip = 0; - msg = ceph_msg_new(type, front_len, 0, 0, NULL); + msg = ceph_msg_new(type, front_len, GFP_NOFS); if (!msg) { pr_err("unable to allocate msg type %d len %d\n", type, front_len); - return ERR_PTR(-ENOMEM); + return NULL; } } memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); - if (middle_len) { + if (middle_len && !msg->middle) { ret = ceph_alloc_middle(con, msg); - if (ret < 0) { ceph_msg_put(msg); - return msg; + return NULL; } } diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h index a5caf91cc97..00a9430b1ff 100644 --- a/fs/ceph/messenger.h +++ b/fs/ceph/messenger.h @@ -49,10 +49,8 @@ struct ceph_connection_operations { int *skip); }; -extern const char *ceph_name_type_str(int t); - /* use format string %s%d */ -#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) +#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) struct ceph_messenger { struct ceph_entity_inst inst; /* my name+address */ @@ -144,6 +142,7 @@ struct ceph_connection { struct ceph_entity_addr peer_addr; /* peer address */ struct ceph_entity_name peer_name; /* peer name */ struct ceph_entity_addr peer_addr_for_me; + unsigned peer_features; u32 connect_seq; /* identify the most recent connection attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ @@ -158,7 +157,6 @@ struct ceph_connection { struct list_head out_queue; struct list_head out_sent; /* sending or sent but unacked */ u64 out_seq; /* last message queued for send */ - u64 out_seq_sent; /* last message sent */ bool out_keepalive_pending; u64 in_seq, in_seq_acked; /* last message received, acked */ @@ -234,9 +232,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con); extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); extern void ceph_con_put(struct ceph_connection *con); -extern struct ceph_msg *ceph_msg_new(int type, int front_len, - int page_len, int page_off, - struct page **pages); +extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); extern void ceph_msg_kfree(struct ceph_msg *m); diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c index 8fdc011ca95..f6510a476e7 100644 --- a/fs/ceph/mon_client.c +++ b/fs/ceph/mon_client.c @@ -28,7 +28,7 @@ * resend any outstanding requests. */ -const static struct ceph_connection_operations mon_con_ops; +static const struct ceph_connection_operations mon_con_ops; static int __validate_auth(struct ceph_mon_client *monc); @@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) monc->pending_auth = 1; monc->m_auth->front.iov_len = len; monc->m_auth->hdr.front_len = cpu_to_le32(len); + ceph_con_revoke(monc->con, monc->m_auth); ceph_msg_get(monc->m_auth); /* keep our ref */ ceph_con_send(monc->con, monc->m_auth); } @@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc) monc->want_next_osdmap); if ((__sub_expired(monc) && !monc->sub_sent) || monc->want_next_osdmap == 1) { - struct ceph_msg *msg; + struct ceph_msg *msg = monc->m_subscribe; struct ceph_mon_subscribe_item *i; void *p, *end; - msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL); - if (!msg) - return; - p = msg->front.iov_base; - end = p + msg->front.iov_len; + end = p + msg->front_max; dout("__send_subscribe to 'mdsmap' %u+\n", (unsigned)monc->have_mdsmap); @@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc) msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - ceph_con_send(monc->con, msg); + ceph_con_revoke(monc->con, msg); + ceph_con_send(monc->con, ceph_msg_get(msg)); monc->sub_sent = jiffies | 1; /* never 0 */ } @@ -353,14 +351,14 @@ out: /* * statfs */ -static struct ceph_mon_statfs_request *__lookup_statfs( +static struct ceph_mon_generic_request *__lookup_generic_req( struct ceph_mon_client *monc, u64 tid) { - struct ceph_mon_statfs_request *req; - struct rb_node *n = monc->statfs_request_tree.rb_node; + struct ceph_mon_generic_request *req; + struct rb_node *n = monc->generic_request_tree.rb_node; while (n) { - req = rb_entry(n, struct ceph_mon_statfs_request, node); + req = rb_entry(n, struct ceph_mon_generic_request, node); if (tid < req->tid) n = n->rb_left; else if (tid > req->tid) @@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs( return NULL; } -static void __insert_statfs(struct ceph_mon_client *monc, - struct ceph_mon_statfs_request *new) +static void __insert_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *new) { - struct rb_node **p = &monc->statfs_request_tree.rb_node; + struct rb_node **p = &monc->generic_request_tree.rb_node; struct rb_node *parent = NULL; - struct ceph_mon_statfs_request *req = NULL; + struct ceph_mon_generic_request *req = NULL; while (*p) { parent = *p; - req = rb_entry(parent, struct ceph_mon_statfs_request, node); + req = rb_entry(parent, struct ceph_mon_generic_request, node); if (new->tid < req->tid) p = &(*p)->rb_left; else if (new->tid > req->tid) @@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc, } rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &monc->statfs_request_tree); + rb_insert_color(&new->node, &monc->generic_request_tree); +} + +static void release_generic_request(struct kref *kref) +{ + struct ceph_mon_generic_request *req = + container_of(kref, struct ceph_mon_generic_request, kref); + + if (req->reply) + ceph_msg_put(req->reply); + if (req->request) + ceph_msg_put(req->request); +} + +static void put_generic_request(struct ceph_mon_generic_request *req) +{ + kref_put(&req->kref, release_generic_request); +} + +static void get_generic_request(struct ceph_mon_generic_request *req) +{ + kref_get(&req->kref); +} + +static struct ceph_msg *get_generic_reply(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_mon_client *monc = con->private; + struct ceph_mon_generic_request *req; + u64 tid = le64_to_cpu(hdr->tid); + struct ceph_msg *m; + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, tid); + if (!req) { + dout("get_generic_reply %lld dne\n", tid); + *skip = 1; + m = NULL; + } else { + dout("get_generic_reply %lld got %p\n", tid, req->reply); + m = ceph_msg_get(req->reply); + /* + * we don't need to track the connection reading into + * this reply because we only have one open connection + * at a time, ever. + */ + } + mutex_unlock(&monc->mutex); + return m; } static void handle_statfs_reply(struct ceph_mon_client *monc, struct ceph_msg *msg) { - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct ceph_mon_statfs_reply *reply = msg->front.iov_base; - u64 tid; + u64 tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len != sizeof(*reply)) goto bad; - tid = le64_to_cpu(msg->hdr.tid); dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); - req = __lookup_statfs(monc, tid); + req = __lookup_generic_req(monc, tid); if (req) { - *req->buf = reply->st; + *(struct ceph_statfs *)req->buf = reply->st; req->result = 0; + get_generic_request(req); } mutex_unlock(&monc->mutex); - if (req) + if (req) { complete(&req->completion); + put_generic_request(req); + } return; bad: - pr_err("corrupt statfs reply, no tid\n"); + pr_err("corrupt generic reply, no tid\n"); ceph_msg_dump(msg); } /* - * (re)send a statfs request + * Do a synchronous statfs(). */ -static int send_statfs(struct ceph_mon_client *monc, - struct ceph_mon_statfs_request *req) +int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) { - struct ceph_msg *msg; + struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; + int err; - dout("send_statfs tid %llu\n", req->tid); - msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); - if (IS_ERR(msg)) - return PTR_ERR(msg); - req->request = msg; - msg->hdr.tid = cpu_to_le64(req->tid); - h = msg->front.iov_base; + req = kzalloc(sizeof(*req), GFP_NOFS); + if (!req) + return -ENOMEM; + + kref_init(&req->kref); + req->buf = buf; + init_completion(&req->completion); + + err = -ENOMEM; + req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); + if (!req->request) + goto out; + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); + if (!req->reply) + goto out; + + /* fill out request */ + h = req->request->front.iov_base; h->monhdr.have_version = 0; h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; - ceph_con_send(monc->con, msg); - return 0; -} - -/* - * Do a synchronous statfs(). - */ -int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) -{ - struct ceph_mon_statfs_request req; - int err; - - req.buf = buf; - init_completion(&req.completion); - - /* allocate memory for reply */ - err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1); - if (err) - return err; /* register request */ mutex_lock(&monc->mutex); - req.tid = ++monc->last_tid; - req.last_attempt = jiffies; - req.delay = BASE_DELAY_INTERVAL; - __insert_statfs(monc, &req); - monc->num_statfs_requests++; + req->tid = ++monc->last_tid; + req->request->hdr.tid = cpu_to_le64(req->tid); + __insert_generic_request(monc, req); + monc->num_generic_requests++; mutex_unlock(&monc->mutex); /* send request and wait */ - err = send_statfs(monc, &req); - if (!err) - err = wait_for_completion_interruptible(&req.completion); + ceph_con_send(monc->con, ceph_msg_get(req->request)); + err = wait_for_completion_interruptible(&req->completion); mutex_lock(&monc->mutex); - rb_erase(&req.node, &monc->statfs_request_tree); - monc->num_statfs_requests--; - ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1); + rb_erase(&req->node, &monc->generic_request_tree); + monc->num_generic_requests--; mutex_unlock(&monc->mutex); if (!err) - err = req.result; + err = req->result; + +out: + kref_put(&req->kref, release_generic_request); return err; } /* * Resend pending statfs requests. */ -static void __resend_statfs(struct ceph_mon_client *monc) +static void __resend_generic_request(struct ceph_mon_client *monc) { - struct ceph_mon_statfs_request *req; + struct ceph_mon_generic_request *req; struct rb_node *p; - for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { - req = rb_entry(p, struct ceph_mon_statfs_request, node); - send_statfs(monc, req); + for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { + req = rb_entry(p, struct ceph_mon_generic_request, node); + ceph_con_revoke(monc->con, req->request); + ceph_con_send(monc->con, ceph_msg_get(req->request)); } } @@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; - /* msg pools */ - err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, - sizeof(struct ceph_mon_subscribe_ack), 1, false); - if (err < 0) + /* msgs */ + err = -ENOMEM; + monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, + sizeof(struct ceph_mon_subscribe_ack), + GFP_NOFS); + if (!monc->m_subscribe_ack) goto out_monmap; - err = ceph_msgpool_init(&monc->msgpool_statfs_reply, - sizeof(struct ceph_mon_statfs_reply), 0, false); - if (err < 0) - goto out_pool1; - err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); - if (err < 0) - goto out_pool2; - - monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); + + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); + if (!monc->m_subscribe) + goto out_subscribe_ack; + + monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); + if (!monc->m_auth_reply) + goto out_subscribe; + + monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); monc->pending_auth = 0; - if (IS_ERR(monc->m_auth)) { - err = PTR_ERR(monc->m_auth); - monc->m_auth = NULL; - goto out_pool3; - } + if (!monc->m_auth) + goto out_auth_reply; monc->cur_mon = -1; monc->hunting = true; @@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->sub_sent = 0; INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); - monc->statfs_request_tree = RB_ROOT; - monc->num_statfs_requests = 0; + monc->generic_request_tree = RB_ROOT; + monc->num_generic_requests = 0; monc->last_tid = 0; monc->have_mdsmap = 0; @@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) monc->want_next_osdmap = 1; return 0; -out_pool3: - ceph_msgpool_destroy(&monc->msgpool_auth_reply); -out_pool2: - ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); -out_pool1: - ceph_msgpool_destroy(&monc->msgpool_statfs_reply); +out_auth_reply: + ceph_msg_put(monc->m_auth_reply); +out_subscribe: + ceph_msg_put(monc->m_subscribe); +out_subscribe_ack: + ceph_msg_put(monc->m_subscribe_ack); out_monmap: kfree(monc->monmap); out: @@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ceph_auth_destroy(monc->auth); ceph_msg_put(monc->m_auth); - ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); - ceph_msgpool_destroy(&monc->msgpool_statfs_reply); - ceph_msgpool_destroy(&monc->msgpool_auth_reply); + ceph_msg_put(monc->m_auth_reply); + ceph_msg_put(monc->m_subscribe); + ceph_msg_put(monc->m_subscribe_ack); kfree(monc->monmap); } @@ -681,7 +723,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, monc->client->msgr->inst.name.num = monc->auth->global_id; __send_subscribe(monc); - __resend_statfs(monc); + __resend_generic_request(monc); } mutex_unlock(&monc->mutex); } @@ -770,18 +812,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_MON_SUBSCRIBE_ACK: - m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); + m = ceph_msg_get(monc->m_subscribe_ack); break; case CEPH_MSG_STATFS_REPLY: - m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); - break; + return get_generic_reply(con, hdr, skip); case CEPH_MSG_AUTH_REPLY: - m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); + m = ceph_msg_get(monc->m_auth_reply); break; case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: - m = ceph_msg_new(type, front_len, 0, 0, NULL); + m = ceph_msg_new(type, front_len, GFP_NOFS); break; } @@ -826,7 +867,7 @@ out: mutex_unlock(&monc->mutex); } -const static struct ceph_connection_operations mon_con_ops = { +static const struct ceph_connection_operations mon_con_ops = { .get = ceph_con_get, .put = ceph_con_put, .dispatch = dispatch, diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h index b958ad5afa0..174d794321d 100644 --- a/fs/ceph/mon_client.h +++ b/fs/ceph/mon_client.h @@ -2,10 +2,10 @@ #define _FS_CEPH_MON_CLIENT_H #include <linux/completion.h> +#include <linux/kref.h> #include <linux/rbtree.h> #include "messenger.h" -#include "msgpool.h" struct ceph_client; struct ceph_mount_args; @@ -22,7 +22,7 @@ struct ceph_monmap { }; struct ceph_mon_client; -struct ceph_mon_statfs_request; +struct ceph_mon_generic_request; /* @@ -40,17 +40,19 @@ struct ceph_mon_request { }; /* - * statfs() is done a bit differently because we need to get data back + * ceph_mon_generic_request is being used for the statfs and poolop requests + * which are bening done a bit differently because we need to get data back * to the caller */ -struct ceph_mon_statfs_request { +struct ceph_mon_generic_request { + struct kref kref; u64 tid; struct rb_node node; int result; - struct ceph_statfs *buf; + void *buf; struct completion completion; - unsigned long last_attempt, delay; /* jiffies */ struct ceph_msg *request; /* original request */ + struct ceph_msg *reply; /* and reply */ }; struct ceph_mon_client { @@ -61,7 +63,7 @@ struct ceph_mon_client { struct delayed_work delayed_work; struct ceph_auth_client *auth; - struct ceph_msg *m_auth; + struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack; int pending_auth; bool hunting; @@ -70,14 +72,9 @@ struct ceph_mon_client { struct ceph_connection *con; bool have_fsid; - /* msg pools */ - struct ceph_msgpool msgpool_subscribe_ack; - struct ceph_msgpool msgpool_statfs_reply; - struct ceph_msgpool msgpool_auth_reply; - - /* pending statfs requests */ - struct rb_root statfs_request_tree; - int num_statfs_requests; + /* pending generic requests */ + struct rb_root generic_request_tree; + int num_generic_requests; u64 last_tid; /* mds/osd map */ diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c index ca3b44a89f2..dd65a643813 100644 --- a/fs/ceph/msgpool.c +++ b/fs/ceph/msgpool.c @@ -7,180 +7,58 @@ #include "msgpool.h" -/* - * We use msg pools to preallocate memory for messages we expect to - * receive over the wire, to avoid getting ourselves into OOM - * conditions at unexpected times. We take use a few different - * strategies: - * - * - for request/response type interactions, we preallocate the - * memory needed for the response when we generate the request. - * - * - for messages we can receive at any time from the MDS, we preallocate - * a pool of messages we can re-use. - * - * - for writeback, we preallocate some number of messages to use for - * requests and their replies, so that we always make forward - * progress. - * - * The msgpool behaves like a mempool_t, but keeps preallocated - * ceph_msgs strung together on a list_head instead of using a pointer - * vector. This avoids vector reallocation when we adjust the number - * of preallocated items (which happens frequently). - */ +static void *alloc_fn(gfp_t gfp_mask, void *arg) +{ + struct ceph_msgpool *pool = arg; + void *p; + p = ceph_msg_new(0, pool->front_len, gfp_mask); + if (!p) + pr_err("msgpool %s alloc failed\n", pool->name); + return p; +} -/* - * Allocate or release as necessary to meet our target pool size. - */ -static int __fill_msgpool(struct ceph_msgpool *pool) +static void free_fn(void *element, void *arg) { - struct ceph_msg *msg; - - while (pool->num < pool->min) { - dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num, - pool->min); - spin_unlock(&pool->lock); - msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL); - spin_lock(&pool->lock); - if (IS_ERR(msg)) - return PTR_ERR(msg); - msg->pool = pool; - list_add(&msg->list_head, &pool->msgs); - pool->num++; - } - while (pool->num > pool->min) { - msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head); - dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num, - pool->min, msg); - list_del_init(&msg->list_head); - pool->num--; - ceph_msg_kfree(msg); - } - return 0; + ceph_msg_put(element); } int ceph_msgpool_init(struct ceph_msgpool *pool, - int front_len, int min, bool blocking) + int front_len, int size, bool blocking, const char *name) { - int ret; - - dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min); - spin_lock_init(&pool->lock); pool->front_len = front_len; - INIT_LIST_HEAD(&pool->msgs); - pool->num = 0; - pool->min = min; - pool->blocking = blocking; - init_waitqueue_head(&pool->wait); - - spin_lock(&pool->lock); - ret = __fill_msgpool(pool); - spin_unlock(&pool->lock); - return ret; + pool->pool = mempool_create(size, alloc_fn, free_fn, pool); + if (!pool->pool) + return -ENOMEM; + pool->name = name; + return 0; } void ceph_msgpool_destroy(struct ceph_msgpool *pool) { - dout("msgpool_destroy %p\n", pool); - spin_lock(&pool->lock); - pool->min = 0; - __fill_msgpool(pool); - spin_unlock(&pool->lock); + mempool_destroy(pool->pool); } -int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, + int front_len) { - int ret; - - spin_lock(&pool->lock); - dout("msgpool_resv %p delta %d\n", pool, delta); - pool->min += delta; - ret = __fill_msgpool(pool); - spin_unlock(&pool->lock); - return ret; -} - -struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len) -{ - wait_queue_t wait; - struct ceph_msg *msg; - - if (front_len && front_len > pool->front_len) { - pr_err("msgpool_get pool %p need front %d, pool size is %d\n", - pool, front_len, pool->front_len); + if (front_len > pool->front_len) { + pr_err("msgpool_get pool %s need front %d, pool size is %d\n", + pool->name, front_len, pool->front_len); WARN_ON(1); /* try to alloc a fresh message */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; - } - - if (!front_len) - front_len = pool->front_len; - - if (pool->blocking) { - /* mempool_t behavior; first try to alloc */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; + return ceph_msg_new(0, front_len, GFP_NOFS); } - while (1) { - spin_lock(&pool->lock); - if (likely(pool->num)) { - msg = list_entry(pool->msgs.next, struct ceph_msg, - list_head); - list_del_init(&msg->list_head); - pool->num--; - dout("msgpool_get %p got %p, now %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - return msg; - } - pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num, - pool->min, pool->blocking ? "waiting" : "may fail"); - spin_unlock(&pool->lock); - - if (!pool->blocking) { - WARN_ON(1); - - /* maybe we can allocate it now? */ - msg = ceph_msg_new(0, front_len, 0, 0, NULL); - if (!IS_ERR(msg)) - return msg; - - pr_err("msgpool_get %p empty + alloc failed\n", pool); - return ERR_PTR(-ENOMEM); - } - - init_wait(&wait); - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); - schedule(); - finish_wait(&pool->wait, &wait); - } + return mempool_alloc(pool->pool, GFP_NOFS); } void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) { - spin_lock(&pool->lock); - if (pool->num < pool->min) { - /* reset msg front_len; user may have changed it */ - msg->front.iov_len = pool->front_len; - msg->hdr.front_len = cpu_to_le32(pool->front_len); + /* reset msg front_len; user may have changed it */ + msg->front.iov_len = pool->front_len; + msg->hdr.front_len = cpu_to_le32(pool->front_len); - kref_set(&msg->kref, 1); /* retake a single ref */ - list_add(&msg->list_head, &pool->msgs); - pool->num++; - dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - wake_up(&pool->wait); - } else { - dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg, - pool->num, pool->min); - spin_unlock(&pool->lock); - ceph_msg_kfree(msg); - } + kref_init(&msg->kref); /* retake single ref */ } diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h index bc834bfcd72..a362605f936 100644 --- a/fs/ceph/msgpool.h +++ b/fs/ceph/msgpool.h @@ -1,6 +1,7 @@ #ifndef _FS_CEPH_MSGPOOL #define _FS_CEPH_MSGPOOL +#include <linux/mempool.h> #include "messenger.h" /* @@ -8,18 +9,15 @@ * avoid unexpected OOM conditions. */ struct ceph_msgpool { - spinlock_t lock; + const char *name; + mempool_t *pool; int front_len; /* preallocated payload size */ - struct list_head msgs; /* msgs in the pool; each has 1 ref */ - int num, min; /* cur, min # msgs in the pool */ - bool blocking; - wait_queue_head_t wait; }; extern int ceph_msgpool_init(struct ceph_msgpool *pool, - int front_len, int size, bool blocking); + int front_len, int size, bool blocking, + const char *name); extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); -extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta); extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, int front_len); extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h index 8aaab414f3f..892a0298dfd 100644 --- a/fs/ceph/msgr.h +++ b/fs/ceph/msgr.h @@ -50,7 +50,6 @@ struct ceph_entity_name { #define CEPH_ENTITY_TYPE_MDS 0x02 #define CEPH_ENTITY_TYPE_OSD 0x04 #define CEPH_ENTITY_TYPE_CLIENT 0x08 -#define CEPH_ENTITY_TYPE_ADMIN 0x10 #define CEPH_ENTITY_TYPE_AUTH 0x20 #define CEPH_ENTITY_TYPE_ANY 0xFF @@ -120,7 +119,7 @@ struct ceph_msg_connect_reply { /* * message header */ -struct ceph_msg_header { +struct ceph_msg_header_old { __le64 seq; /* message seq# for this session */ __le64 tid; /* transaction id */ __le16 type; /* message type */ @@ -138,6 +137,24 @@ struct ceph_msg_header { __le32 crc; /* header crc32c */ } __attribute__ ((packed)); +struct ceph_msg_header { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 front_len; /* bytes in main payload */ + __le32 middle_len;/* bytes in middle payload */ + __le32 data_len; /* bytes of data payload */ + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + struct ceph_entity_name src; + __le32 reserved; + __le32 crc; /* header crc32c */ +} __attribute__ ((packed)); + #define CEPH_MSG_PRIO_LOW 64 #define CEPH_MSG_PRIO_DEFAULT 127 #define CEPH_MSG_PRIO_HIGH 196 diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index 3514f71ff85..afa7bb3895c 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -16,7 +16,7 @@ #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 -const static struct ceph_connection_operations osd_con_ops; +static const struct ceph_connection_operations osd_con_ops; static int __kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *kickosd); @@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req = kzalloc(sizeof(*req), GFP_NOFS); } if (req == NULL) - return ERR_PTR(-ENOMEM); + return NULL; req->r_osdc = osdc; req->r_mempool = use_mempool; @@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); else msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); - if (IS_ERR(msg)) { + OSD_OPREPLY_FRONT_LEN, GFP_NOFS); + if (!msg) { ceph_osdc_put_request(req); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } req->r_reply = msg; @@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); - if (IS_ERR(msg)) { + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); + if (!msg) { ceph_osdc_put_request(req); - return ERR_PTR(PTR_ERR(msg)); + return NULL; } msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); memset(msg->front.iov_base, 0, msg->front.iov_len); @@ -715,7 +715,7 @@ static void handle_timeout(struct work_struct *work) * should mark the osd as failed and we should find out about * it from an updated osd map. */ - while (!list_empty(&osdc->req_lru)) { + while (timeout && !list_empty(&osdc->req_lru)) { req = list_entry(osdc->req_lru.next, struct ceph_osd_request, r_req_lru_item); @@ -1078,6 +1078,7 @@ done: if (newmap) kick_requests(osdc, NULL); up_read(&osdc->map_sem); + wake_up(&osdc->client->auth_wq); return; bad: @@ -1087,45 +1088,6 @@ bad: return; } - -/* - * A read request prepares specific pages that data is to be read into. - * When a message is being read off the wire, we call prepare_pages to - * find those pages. - * 0 = success, -1 failure. - */ -static int __prepare_pages(struct ceph_connection *con, - struct ceph_msg_header *hdr, - struct ceph_osd_request *req, - u64 tid, - struct ceph_msg *m) -{ - struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; - int ret = -1; - int data_len = le32_to_cpu(hdr->data_len); - unsigned data_off = le16_to_cpu(hdr->data_off); - - int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); - - if (!osd) - return -1; - - osdc = osd->o_osdc; - - dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m, - tid, req->r_num_pages, want); - if (unlikely(req->r_num_pages < want)) - goto out; - m->pages = req->r_pages; - m->nr_pages = req->r_num_pages; - ret = 0; /* success */ -out: - BUG_ON(ret < 0 || m->nr_pages < want); - - return ret; -} - /* * Register request, send initial attempt. */ @@ -1252,11 +1214,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->req_mempool) goto out; - err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); + err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true, + "osd_op"); if (err < 0) goto out_mempool; err = ceph_msgpool_init(&osdc->msgpool_op_reply, - OSD_OPREPLY_FRONT_LEN, 10, true); + OSD_OPREPLY_FRONT_LEN, 10, true, + "osd_op_reply"); if (err < 0) goto out_msgpool; return 0; @@ -1302,8 +1266,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, false, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; /* it may be a short read due to an object boundary */ req->r_pages = pages; @@ -1345,8 +1309,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, snapc, do_sync, truncate_seq, truncate_size, mtime, nofail, 1); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!req) + return -ENOMEM; /* it may be a short write due to an object boundary */ req->r_pages = pages; @@ -1394,7 +1358,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) } /* - * lookup and return message for incoming reply + * lookup and return message for incoming reply. set up reply message + * pages. */ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, @@ -1407,7 +1372,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, int front = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid; - int err; tid = le64_to_cpu(hdr->tid); mutex_lock(&osdc->request_mutex); @@ -1425,13 +1389,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply, req->r_con_filling_msg); ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); ceph_con_put(req->r_con_filling_msg); + req->r_con_filling_msg = NULL; } if (front > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d\n", front, (int)req->r_reply->front.iov_len); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); - if (IS_ERR(m)) + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); + if (!m) goto out; ceph_msg_put(req->r_reply); req->r_reply = m; @@ -1439,12 +1404,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - err = __prepare_pages(con, hdr, req, tid, m); - if (err < 0) { + unsigned data_off = le16_to_cpu(hdr->data_off); + int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); + + if (unlikely(req->r_num_pages < want)) { + pr_warning("tid %lld reply %d > expected %d pages\n", + tid, want, m->nr_pages); *skip = 1; ceph_msg_put(m); - m = ERR_PTR(err); + m = NULL; + goto out; } + m->pages = req->r_pages; + m->nr_pages = req->r_num_pages; } *skip = 0; req->r_con_filling_msg = ceph_con_get(con); @@ -1466,7 +1438,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, switch (type) { case CEPH_MSG_OSD_MAP: - return ceph_msg_new(type, front, 0, 0, NULL); + return ceph_msg_new(type, front, GFP_NOFS); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: @@ -1552,7 +1524,7 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } -const static struct ceph_connection_operations osd_con_ops = { +static const struct ceph_connection_operations osd_con_ops = { .get = get_osd_con, .put = put_osd_con, .dispatch = dispatch, diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c index 5f8dbf7c745..b6859f47d36 100644 --- a/fs/ceph/pagelist.c +++ b/fs/ceph/pagelist.c @@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl) static int ceph_pagelist_addpage(struct ceph_pagelist *pl) { - struct page *page = alloc_page(GFP_NOFS); + struct page *page = __page_cache_alloc(GFP_NOFS); if (!page) return -ENOMEM; pl->room += PAGE_SIZE; diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index fd56451a871..8fcc023056c 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -101,8 +101,8 @@ struct ceph_pg_pool { __le64 snap_seq; /* seq for per-pool snapshot */ __le32 snap_epoch; /* epoch of last snap */ __le32 num_snaps; - __le32 num_removed_snap_intervals; - __le64 uid; + __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ + __le64 auid; /* who owns the pg */ } __attribute__ ((packed)); /* @@ -208,6 +208,7 @@ enum { /* read */ CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, + CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, /* write */ CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, @@ -305,6 +306,22 @@ enum { #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ #define EBLACKLISTED ESHUTDOWN /* blacklisted */ +/* xattr comparison */ +enum { + CEPH_OSD_CMPXATTR_OP_NOP = 0, + CEPH_OSD_CMPXATTR_OP_EQ = 1, + CEPH_OSD_CMPXATTR_OP_NE = 2, + CEPH_OSD_CMPXATTR_OP_GT = 3, + CEPH_OSD_CMPXATTR_OP_GTE = 4, + CEPH_OSD_CMPXATTR_OP_LT = 5, + CEPH_OSD_CMPXATTR_OP_LTE = 6 +}; + +enum { + CEPH_OSD_CMPXATTR_MODE_STRING = 1, + CEPH_OSD_CMPXATTR_MODE_U64 = 2 +}; + /* * an individual object operation. each may be accompanied by some data * payload @@ -321,6 +338,8 @@ struct ceph_osd_op { struct { __le32 name_len; __le32 value_len; + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ } __attribute__ ((packed)) xattr; struct { __u8 class_len; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index d5114db7045..c0b26b6badb 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; BUG_ON(capsnap->writing); capsnap->size = inode->i_size; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 110857ba926..7c663d9b9f8 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -8,14 +8,11 @@ #include <linux/module.h> #include <linux/mount.h> #include <linux/parser.h> -#include <linux/rwsem.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/statfs.h> #include <linux/string.h> -#include <linux/version.h> -#include <linux/vmalloc.h> #include "decode.h" #include "super.h" @@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) static int ceph_syncfs(struct super_block *sb, int wait) { dout("sync_fs %d\n", wait); - ceph_osdc_sync(&ceph_client(sb)->osdc); - ceph_mdsc_sync(&ceph_client(sb)->mdsc); + ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); + ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); dout("sync_fs %d done\n", wait); return 0; } +static int default_congestion_kb(void) +{ + int congestion_kb; + + /* + * Copied from NFS + * + * congestion size, scale with available memory. + * + * 64MB: 8192k + * 128MB: 11585k + * 256MB: 16384k + * 512MB: 23170k + * 1GB: 32768k + * 2GB: 46340k + * 4GB: 65536k + * 8GB: 92681k + * 16GB: 131072k + * + * This allows larger machines to have larger/more transfers. + * Limit the default to 256M + */ + congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); + if (congestion_kb > 256*1024) + congestion_kb = 256*1024; + + return congestion_kb; +} /** * ceph_show_options - Show mount options in /proc/mounts @@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, ",nocrc"); if (args->flags & CEPH_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); + + if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, ",mount_timeout=%d", args->mount_timeout); + if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl); + if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT) + seq_printf(m, ",osdtimeout=%d", args->osd_timeout); + if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, ",osdkeepalivetimeout=%d", + args->osd_keepalive_timeout); + if (args->wsize) + seq_printf(m, ",wsize=%d", args->wsize); + if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT) + seq_printf(m, ",rsize=%d", args->rsize); + if (args->congestion_kb != default_congestion_kb()) + seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb); + if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) + seq_printf(m, ",caps_wanted_delay_min=%d", + args->caps_wanted_delay_min); + if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) + seq_printf(m, ",caps_wanted_delay_max=%d", + args->caps_wanted_delay_max); + if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) + seq_printf(m, ",cap_release_safety=%d", + args->cap_release_safety); + if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT) + seq_printf(m, ",readdir_max_entries=%d", args->max_readdir); + if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) + seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes); if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_printf(m, ",snapdirname=%s", args->snapdir_name); if (args->name) @@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo) inode_init_once(&ci->vfs_inode); } -static int default_congestion_kb(void) -{ - int congestion_kb; - - /* - * Copied from NFS - * - * congestion size, scale with available memory. - * - * 64MB: 8192k - * 128MB: 11585k - * 256MB: 16384k - * 512MB: 23170k - * 1GB: 32768k - * 2GB: 46340k - * 4GB: 65536k - * 8GB: 92681k - * 16GB: 131072k - * - * This allows larger machines to have larger/more transfers. - * Limit the default to 256M - */ - congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10); - if (congestion_kb > 256*1024) - congestion_kb = 256*1024; - - return congestion_kb; -} - static int __init init_caches(void) { ceph_inode_cachep = kmem_cache_create("ceph_inode_info", @@ -308,7 +333,9 @@ enum { Opt_osd_idle_ttl, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, + Opt_cap_release_safety, Opt_readdir_max_entries, + Opt_readdir_max_bytes, Opt_congestion_kb, Opt_last_int, /* int args above */ @@ -339,7 +366,9 @@ static match_table_t arg_tokens = { {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_cap_release_safety, "cap_release_safety=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, + {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, @@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; - args->max_readdir = 1024; + args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; + args->max_readdir = CEPH_MAX_READDIR_DEFAULT; + args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; args->congestion_kb = default_congestion_kb(); /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ @@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options, case Opt_readdir_max_entries: args->max_readdir = intval; break; + case Opt_readdir_max_bytes: + args->max_readdir_bytes = intval; + break; case Opt_congestion_kb: args->congestion_kb = intval; break; @@ -682,9 +715,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) /* * true if we have the mon map (and have thus joined the cluster) */ -static int have_mon_map(struct ceph_client *client) +static int have_mon_and_osd_map(struct ceph_client *client) { - return client->monc.monmap && client->monc.monmap->epoch; + return client->monc.monmap && client->monc.monmap->epoch && + client->osdc.osdmap && client->osdc.osdmap->epoch; } /* @@ -762,7 +796,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, if (err < 0) goto out; - while (!have_mon_map(client)) { + while (!have_mon_and_osd_map(client)) { err = -EIO; if (timeout && time_after_eq(jiffies, started + timeout)) goto out; @@ -770,8 +804,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, /* wait */ dout("mount waiting for mon_map\n"); err = wait_event_interruptible_timeout(client->auth_wq, - have_mon_map(client) || (client->auth_err < 0), - timeout); + have_mon_and_osd_map(client) || (client->auth_err < 0), + timeout); if (err == -EINTR || err == -ERESTARTSYS) goto out; if (client->auth_err < 0) { @@ -884,6 +918,8 @@ static int ceph_compare_super(struct super_block *sb, void *data) /* * construct our own bdi so we can control readahead, etc. */ +static atomic_long_t bdi_seq = ATOMIC_INIT(0); + static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) { int err; @@ -893,7 +929,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) client->backing_dev_info.ra_pages = (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT; - err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); + err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", + atomic_long_inc_return(&bdi_seq)); if (!err) sb->s_bdi = &client->backing_dev_info; return err; @@ -932,9 +969,9 @@ static int ceph_get_sb(struct file_system_type *fs_type, goto out; } - if (ceph_client(sb) != client) { + if (ceph_sb_to_client(sb) != client) { ceph_destroy_client(client); - client = ceph_client(sb); + client = ceph_sb_to_client(sb); dout("get_sb got existing client %p\n", client); } else { dout("get_sb using new client %p\n", client); @@ -952,8 +989,7 @@ static int ceph_get_sb(struct file_system_type *fs_type, out_splat: ceph_mdsc_close_sessions(&client->mdsc); - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); goto out_final; out: diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 13513b80d87..3725c9ee9d0 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -52,24 +52,25 @@ struct ceph_mount_args { int sb_flags; + int flags; + struct ceph_fsid fsid; + struct ceph_entity_addr my_addr; int num_mon; struct ceph_entity_addr *mon_addr; - int flags; int mount_timeout; int osd_idle_ttl; - int caps_wanted_delay_min, caps_wanted_delay_max; - struct ceph_fsid fsid; - struct ceph_entity_addr my_addr; - int wsize; - int rsize; /* max readahead */ - int max_readdir; /* max readdir size */ - int congestion_kb; /* max readdir size */ int osd_timeout; int osd_keepalive_timeout; + int wsize; + int rsize; /* max readahead */ + int congestion_kb; /* max writeback in flight */ + int caps_wanted_delay_min, caps_wanted_delay_max; + int cap_release_safety; + int max_readdir; /* max readdir result (entires) */ + int max_readdir_bytes; /* max readdir result (bytes) */ char *snapdir_name; /* default ".snap" */ char *name; char *secret; - int cap_release_safety; }; /* @@ -80,13 +81,14 @@ struct ceph_mount_args { #define CEPH_OSD_KEEPALIVE_DEFAULT 5 #define CEPH_OSD_IDLE_TTL_DEFAULT 60 #define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ +#define CEPH_MAX_READDIR_DEFAULT 1024 +#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" #define CEPH_AUTH_NAME_DEFAULT "guest" - /* * Delay telling the MDS we no longer want caps, in case we reopen * the file. Delay a minimum amount of time, even if we send a cap @@ -96,6 +98,7 @@ struct ceph_mount_args { #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ +#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) /* mount state */ enum { @@ -160,12 +163,6 @@ struct ceph_client { #endif }; -static inline struct ceph_client *ceph_client(struct super_block *sb) -{ - return sb->s_fs_info; -} - - /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -871,6 +868,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); +extern void ceph_invalidate_dentry_lease(struct dentry *dentry); /* * our d_ops vary depending on whether the inode is live, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2845422907f..68aeebc6968 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -7,7 +7,8 @@ static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_SECURITY_PREFIX, + return !strncmp(name, "ceph.", 5) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); @@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { - { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, - { true, "user.ceph.dir.files", ceph_vxattrcb_files}, - { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, - { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, - { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, - { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, - { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, - { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, + { true, "ceph.dir.entries", ceph_vxattrcb_entries}, + { true, "ceph.dir.files", ceph_vxattrcb_files}, + { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs}, + { true, "ceph.dir.rentries", ceph_vxattrcb_rentries}, + { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles}, + { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, + { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes}, + { true, "ceph.dir.rctime", ceph_vxattrcb_rctime}, { true, NULL, NULL } }; @@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_file_vxattrs[] = { - { true, "user.ceph.layout", ceph_vxattrcb_layout}, + { true, "ceph.layout", ceph_vxattrcb_layout}, { NULL, NULL } }; @@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci, ci->i_xattrs.names_size -= xattr->name_len; ci->i_xattrs.vals_size -= xattr->val_len; } - if (!xattr) { - pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n", - &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name, - xattr->val); - return -ENOMEM; - } ci->i_xattrs.names_size += name_len; ci->i_xattrs.vals_size += val_len; if (val) @@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ci->i_xattrs.version, ci->i_xattrs.index_version); if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && - (ci->i_xattrs.index_version > ci->i_xattrs.version)) { + (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { goto list_xattr; } else { spin_unlock(&inode->i_lock); @@ -622,7 +617,7 @@ out: static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); struct inode *parent_inode = dentry->d_parent->d_inode; @@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; err = -ENOMEM; for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); + pages[i] = __page_cache_alloc(GFP_NOFS); if (!pages[i]) { nr_pages = i; goto out; @@ -779,7 +774,7 @@ out: static int ceph_send_removexattr(struct dentry *dentry, const char *name) { - struct ceph_client *client = ceph_client(dentry->d_sb); + struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = &client->mdsc; struct inode *inode = dentry->d_inode; struct inode *parent_inode = dentry->d_parent->d_inode; diff --git a/fs/coda/file.c b/fs/coda/file.c index 4c813f2cdc5..7196077b168 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -217,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); + err = vfs_fsync(host_file, datasync); if ( !err && !datasync ) { lock_kernel(); err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); diff --git a/fs/dcache.c b/fs/dcache.c index f1358e5c3a5..d96047b4a63 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -536,7 +536,7 @@ restart: */ static void prune_dcache(int count) { - struct super_block *sb; + struct super_block *sb, *n; int w_count; int unused = dentry_stat.nr_unused; int prune_ratio; @@ -545,13 +545,14 @@ static void prune_dcache(int count) if (unused == 0 || count == 0) return; spin_lock(&dcache_lock); -restart: if (count >= unused) prune_ratio = 1; else prune_ratio = unused / count; spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_nr_dentry_unused == 0) continue; sb->s_count++; @@ -590,14 +591,10 @@ restart: } spin_lock(&sb_lock); count -= pruned; - /* - * restart only when sb is no longer on the list and - * we have more work to do. - */ - if (__put_super_and_need_restart(sb) && count > 0) { - spin_unlock(&sb_lock); - goto restart; - } + __put_super(sb); + /* more work left to do? */ + if (count <= 0) + break; } spin_unlock(&sb_lock); spin_unlock(&dcache_lock); @@ -1529,6 +1526,7 @@ void d_delete(struct dentry * dentry) spin_lock(&dentry->d_lock); isdir = S_ISDIR(dentry->d_inode->i_mode); if (atomic_read(&dentry->d_count) == 1) { + dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_iput(dentry); fsnotify_nameremove(dentry, isdir); return; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 0120247b41c..8b3ffd5b523 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -384,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; } - simple_set_mnt(mnt, s); - memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); error = mknod_ptmx(s); if (error) - goto out_dput; + goto out_undo_sget; - return 0; + simple_set_mnt(mnt, s); -out_dput: - dput(s->s_root); /* undo dget() in simple_set_mnt() */ + return 0; out_undo_sget: deactivate_locked_super(s); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 31f4b0e6d72..83c4f600786 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -12,7 +12,7 @@ /* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches; -static void drop_pagecache_sb(struct super_block *sb) +static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; @@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb) iput(toput_inode); } -static void drop_pagecache(void) -{ - struct super_block *sb; - - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - drop_pagecache_sb(sb); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); -} - static void drop_slab(void) { int nr_objects; @@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, proc_dointvec_minmax(table, write, buffer, length, ppos); if (write) { if (sysctl_drop_caches & 1) - drop_pagecache(); + iterate_supers(drop_pagecache_sb, NULL); if (sysctl_drop_caches & 2) drop_slab(); } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index bfc2e0f78f0..0032a9f5a3a 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -731,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, struct page *page_for_lower, size_t offset_in_page, size_t size); -int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, - size_t size); +int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size); int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode); int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, pgoff_t page_index, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); -struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index); int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, struct user_namespace *user_ns); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index e7440a6f5eb..3bdddbcc785 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -276,9 +276,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) { - return vfs_fsync(ecryptfs_file_to_lower(file), - ecryptfs_dentry_to_lower(dentry), - datasync); + return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } static int ecryptfs_fasync(int fd, struct file *file, int flag) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e2d4418affa..65dee2f336a 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -142,19 +142,10 @@ out: static int grow_file(struct dentry *ecryptfs_dentry) { struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; - struct file fake_file; - struct ecryptfs_file_info tmp_file_info; char zero_virt[] = { 0x00 }; int rc = 0; - memset(&fake_file, 0, sizeof(fake_file)); - fake_file.f_path.dentry = ecryptfs_dentry; - memset(&tmp_file_info, 0, sizeof(tmp_file_info)); - ecryptfs_set_file_private(&fake_file, &tmp_file_info); - ecryptfs_set_file_lower( - &fake_file, - ecryptfs_inode_to_private(ecryptfs_inode)->lower_file); - rc = ecryptfs_write(&fake_file, zero_virt, 0, 1); + rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1); i_size_write(ecryptfs_inode, 0); rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |= @@ -784,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, { int rc = 0; struct inode *inode = dentry->d_inode; - struct dentry *lower_dentry; - struct file fake_ecryptfs_file; struct ecryptfs_crypt_stat *crypt_stat; loff_t i_size = i_size_read(inode); loff_t lower_size_before_truncate; @@ -796,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, goto out; } crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; - /* Set up a fake ecryptfs file, this is used to interface with - * the file in the underlying filesystem so that the - * truncation has an effect there as well. */ - memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file)); - fake_ecryptfs_file.f_path.dentry = dentry; - /* Released at out_free: label */ - ecryptfs_set_file_private(&fake_ecryptfs_file, - kmem_cache_alloc(ecryptfs_file_info_cache, - GFP_KERNEL)); - if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) { - rc = -ENOMEM; - goto out; - } - lower_dentry = ecryptfs_dentry_to_lower(dentry); - ecryptfs_set_file_lower( - &fake_ecryptfs_file, - ecryptfs_inode_to_private(dentry->d_inode)->lower_file); /* Switch on growing or shrinking file */ if (ia->ia_size > i_size) { char zero[] = { 0x00 }; @@ -822,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, * this triggers code that will fill in 0's throughout * the intermediate portion of the previous end of the * file and the new and of the file */ - rc = ecryptfs_write(&fake_ecryptfs_file, zero, + rc = ecryptfs_write(inode, zero, (ia->ia_size - 1), 1); } else { /* ia->ia_size < i_size_read(inode) */ /* We're chopping off all the pages down to the page @@ -835,10 +807,10 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { rc = vmtruncate(inode, ia->ia_size); if (rc) - goto out_free; + goto out; lower_ia->ia_size = ia->ia_size; lower_ia->ia_valid |= ATTR_SIZE; - goto out_free; + goto out; } if (num_zeros) { char *zeros_virt; @@ -846,16 +818,16 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, zeros_virt = kzalloc(num_zeros, GFP_KERNEL); if (!zeros_virt) { rc = -ENOMEM; - goto out_free; + goto out; } - rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, + rc = ecryptfs_write(inode, zeros_virt, ia->ia_size, num_zeros); kfree(zeros_virt); if (rc) { printk(KERN_ERR "Error attempting to zero out " "the remainder of the end page on " "reducing truncate; rc = [%d]\n", rc); - goto out_free; + goto out; } } vmtruncate(inode, ia->ia_size); @@ -864,7 +836,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, printk(KERN_ERR "Problem with " "ecryptfs_write_inode_size_to_metadata; " "rc = [%d]\n", rc); - goto out_free; + goto out; } /* We are reducing the size of the ecryptfs file, and need to * know if we need to reduce the size of the lower file. */ @@ -878,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, } else lower_ia->ia_valid &= ~ATTR_SIZE; } -out_free: - if (ecryptfs_file_to_private(&fake_ecryptfs_file)) - kmem_cache_free(ecryptfs_file_info_cache, - ecryptfs_file_to_private(&fake_ecryptfs_file)); out: return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 760983d0f25..cbd4e18adb2 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -281,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat( * * Returns zero on success; non-zero on error */ -static int ecryptfs_parse_options(struct super_block *sb, char *options) +static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) { char *p; int rc = 0; @@ -293,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) int fn_cipher_key_bytes; int fn_cipher_key_bytes_set = 0; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; + &sbi->mount_crypt_stat; substring_t args[MAX_OPT_ARGS]; int token; char *sig_src; @@ -483,68 +483,7 @@ out: } struct kmem_cache *ecryptfs_sb_info_cache; - -/** - * ecryptfs_fill_super - * @sb: The ecryptfs super block - * @raw_data: The options passed to mount - * @silent: Not used but required by function prototype - * - * Sets up what we can of the sb, rest is done in ecryptfs_read_super - * - * Returns zero on success; non-zero otherwise - */ -static int -ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) -{ - struct ecryptfs_sb_info *esi; - int rc = 0; - - /* Released in ecryptfs_put_super() */ - ecryptfs_set_superblock_private(sb, - kmem_cache_zalloc(ecryptfs_sb_info_cache, - GFP_KERNEL)); - esi = ecryptfs_superblock_to_private(sb); - if (!esi) { - ecryptfs_printk(KERN_WARNING, "Out of memory\n"); - rc = -ENOMEM; - goto out; - } - - rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); - if (rc) - goto out; - - sb->s_bdi = &esi->bdi; - sb->s_op = &ecryptfs_sops; - /* Released through deactivate_super(sb) from get_sb_nodev */ - sb->s_root = d_alloc(NULL, &(const struct qstr) { - .hash = 0,.name = "/",.len = 1}); - if (!sb->s_root) { - ecryptfs_printk(KERN_ERR, "d_alloc failed\n"); - rc = -ENOMEM; - goto out; - } - sb->s_root->d_op = &ecryptfs_dops; - sb->s_root->d_sb = sb; - sb->s_root->d_parent = sb->s_root; - /* Released in d_release when dput(sb->s_root) is called */ - /* through deactivate_super(sb) from get_sb_nodev() */ - ecryptfs_set_dentry_private(sb->s_root, - kmem_cache_zalloc(ecryptfs_dentry_info_cache, - GFP_KERNEL)); - if (!ecryptfs_dentry_to_private(sb->s_root)) { - ecryptfs_printk(KERN_ERR, - "dentry_info_cache alloc failed\n"); - rc = -ENOMEM; - goto out; - } - rc = 0; -out: - /* Should be able to rely on deactivate_super called from - * get_sb_nodev */ - return rc; -} +static struct file_system_type ecryptfs_fs_type; /** * ecryptfs_read_super @@ -565,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name) ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); goto out; } + if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { + rc = -EINVAL; + printk(KERN_ERR "Mount on filesystem of type " + "eCryptfs explicitly disallowed due to " + "known incompatibilities\n"); + goto out_free; + } ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; sb->s_blocksize = path.dentry->d_sb->s_blocksize; @@ -588,11 +534,8 @@ out: * @dev_name: The path to mount over * @raw_data: The options passed into the kernel * - * The whole ecryptfs_get_sb process is broken into 4 functions: + * The whole ecryptfs_get_sb process is broken into 3 functions: * ecryptfs_parse_options(): handle options passed to ecryptfs, if any - * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block - * with as much information as it can before needing - * the lower filesystem. * ecryptfs_read_super(): this accesses the lower filesystem and uses * ecryptfs_interpose to perform most of the linking * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) @@ -601,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { + struct super_block *s; + struct ecryptfs_sb_info *sbi; + struct ecryptfs_dentry_info *root_info; + const char *err = "Getting sb failed"; int rc; - struct super_block *sb; - rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt); - if (rc < 0) { - printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc); + sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); + if (!sbi) { + rc = -ENOMEM; goto out; } - sb = mnt->mnt_sb; - rc = ecryptfs_parse_options(sb, raw_data); + + rc = ecryptfs_parse_options(sbi, raw_data); if (rc) { - printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc); - goto out_abort; + err = "Error parsing options"; + goto out; + } + + s = sget(fs_type, NULL, set_anon_super, NULL); + if (IS_ERR(s)) { + rc = PTR_ERR(s); + goto out; } - rc = ecryptfs_read_super(sb, dev_name); + + s->s_flags = flags; + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); if (rc) { - printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc); - goto out_abort; + deactivate_locked_super(s); + goto out; } - goto out; -out_abort: - dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */ - deactivate_locked_super(sb); + + ecryptfs_set_superblock_private(s, sbi); + s->s_bdi = &sbi->bdi; + + /* ->kill_sb() will take care of sbi after that point */ + sbi = NULL; + s->s_op = &ecryptfs_sops; + + rc = -ENOMEM; + s->s_root = d_alloc(NULL, &(const struct qstr) { + .hash = 0,.name = "/",.len = 1}); + if (!s->s_root) { + deactivate_locked_super(s); + goto out; + } + s->s_root->d_op = &ecryptfs_dops; + s->s_root->d_sb = s; + s->s_root->d_parent = s->s_root; + + root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); + if (!root_info) { + deactivate_locked_super(s); + goto out; + } + /* ->kill_sb() will take care of root_info */ + ecryptfs_set_dentry_private(s->s_root, root_info); + s->s_flags |= MS_ACTIVE; + rc = ecryptfs_read_super(s, dev_name); + if (rc) { + deactivate_locked_super(s); + err = "Reading sb failed"; + goto out; + } + simple_set_mnt(mnt, s); + return 0; + out: + if (sbi) { + ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); + kmem_cache_free(ecryptfs_sb_info_cache, sbi); + } + printk(KERN_ERR "%s; rc = [%d]\n", err, rc); return rc; } @@ -633,11 +624,16 @@ out: * @sb: The ecryptfs super block * * Used to bring the superblock down and free the private data. - * Private data is free'd in ecryptfs_put_super() */ static void ecryptfs_kill_block_super(struct super_block *sb) { - generic_shutdown_super(sb); + struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); + kill_anon_super(sb); + if (!sb_info) + return; + ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); + bdi_destroy(&sb_info->bdi); + kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } static struct file_system_type ecryptfs_fs_type = { diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 2ee9a3a7b68..b1d82756544 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -44,17 +44,9 @@ * Returns locked and up-to-date page (if ok), with increased * refcnt. */ -struct page *ecryptfs_get_locked_page(struct file *file, loff_t index) +struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) { - struct dentry *dentry; - struct inode *inode; - struct address_space *mapping; - struct page *page; - - dentry = file->f_path.dentry; - inode = dentry->d_inode; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, index, (void *)file); + struct page *page = read_mapping_page(inode->i_mapping, index, NULL); if (!IS_ERR(page)) lock_page(page); return page; @@ -198,7 +190,7 @@ out: static int ecryptfs_readpage(struct file *file, struct page *page) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; int rc = 0; if (!crypt_stat @@ -300,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file, if (!PageUptodate(page)) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private( - file->f_path.dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(mapping->host)->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { @@ -487,7 +478,7 @@ static int ecryptfs_write_end(struct file *file, unsigned to = from + copied; struct inode *ecryptfs_inode = mapping->host; struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; int rc; if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 0cc4fafd655..db184ef15d3 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, /** * ecryptfs_write - * @ecryptfs_file: The eCryptfs file into which to write + * @ecryptfs_inode: The eCryptfs file into which to write * @data: Virtual address where data to write is located * @offset: Offset in the eCryptfs file at which to begin writing the * data from @data @@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, * * Returns zero on success; non-zero otherwise */ -int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, +int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { struct page *ecryptfs_page; struct ecryptfs_crypt_stat *crypt_stat; - struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; char *ecryptfs_page_virt; loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); loff_t data_offset = 0; @@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, if (num_bytes > total_remaining_zeros) num_bytes = total_remaining_zeros; } - ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, ecryptfs_page_idx); if (IS_ERR(ecryptfs_page)) { rc = PTR_ERR(ecryptfs_page); @@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, int ecryptfs_read(char *data, loff_t offset, size_t size, struct file *ecryptfs_file) { + struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; struct page *ecryptfs_page; char *ecryptfs_page_virt; - loff_t ecryptfs_file_size = - i_size_read(ecryptfs_file->f_dentry->d_inode); + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); loff_t data_offset = 0; loff_t pos; int rc = 0; @@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size, if (num_bytes > total_remaining_bytes) num_bytes = total_remaining_bytes; - ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, + ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode, ecryptfs_page_idx); if (IS_ERR(ecryptfs_page)) { rc = PTR_ERR(ecryptfs_page); diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 0c0ae491d23..0435886e4a9 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -109,27 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode) } /** - * ecryptfs_put_super - * @sb: Pointer to the ecryptfs super block - * - * Final actions when unmounting a file system. - * This will handle deallocation and release of our private data. - */ -static void ecryptfs_put_super(struct super_block *sb) -{ - struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); - - lock_kernel(); - - ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); - bdi_destroy(&sb_info->bdi); - kmem_cache_free(ecryptfs_sb_info_cache, sb_info); - ecryptfs_set_superblock_private(sb, NULL); - - unlock_kernel(); -} - -/** * ecryptfs_statfs * @sb: The ecryptfs super block * @buf: The struct kstatfs to fill in with stats @@ -203,7 +182,6 @@ const struct super_operations ecryptfs_sops = { .alloc_inode = ecryptfs_alloc_inode, .destroy_inode = ecryptfs_destroy_inode, .drop_inode = generic_delete_inode, - .put_super = ecryptfs_put_super, .statfs = ecryptfs_statfs, .remount_fs = NULL, .clear_inode = ecryptfs_clear_inode, diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 76d2a79ef93..d7c6afa7975 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1123,16 +1123,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) sbi = sb->s_fs_info; sb->s_dirt = 1; - inode->i_uid = current->cred->fsuid; - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - inode->i_gid = current->cred->fsgid; - } - inode->i_mode = mode; - + inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index a99e54318c3..ca7e2a0ed98 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -420,7 +420,7 @@ release_and_out: return error; } -struct xattr_handler ext2_xattr_acl_access_handler = { +const struct xattr_handler ext2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ext2_xattr_list_acl_access, @@ -428,7 +428,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = { .set = ext2_xattr_set_acl, }; -struct xattr_handler ext2_xattr_acl_default_handler = { +const struct xattr_handler ext2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ext2_xattr_list_acl_default, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index f0c5286f934..938dbc739d0 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -549,16 +549,12 @@ got: sb->s_dirt = 1; mark_buffer_dirty(bh2); - inode->i_uid = current_fsuid(); - if (test_opt (sb, GRPID)) + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 3b96045a00c..7c3915780b1 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *, static struct mb_cache *ext2_xattr_cache; -static struct xattr_handler *ext2_xattr_handler_map[] = { +static const struct xattr_handler *ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, @@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = { #endif }; -struct xattr_handler *ext2_xattr_handlers[] = { +const struct xattr_handler *ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL @@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = { NULL }; -static inline struct xattr_handler * +static inline const struct xattr_handler * ext2_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) handler = ext2_xattr_handler_map[name_index]; @@ -298,7 +298,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", /* list the attribute names */ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); entry = EXT2_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext2_xattr_handler(entry->e_name_index); if (handler) { diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index bf8175b2ced..a1a1c218461 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -55,11 +55,11 @@ struct ext2_xattr_entry { # ifdef CONFIG_EXT2_FS_XATTR -extern struct xattr_handler ext2_xattr_user_handler; -extern struct xattr_handler ext2_xattr_trusted_handler; -extern struct xattr_handler ext2_xattr_acl_access_handler; -extern struct xattr_handler ext2_xattr_acl_default_handler; -extern struct xattr_handler ext2_xattr_security_handler; +extern const struct xattr_handler ext2_xattr_user_handler; +extern const struct xattr_handler ext2_xattr_trusted_handler; +extern const struct xattr_handler ext2_xattr_acl_access_handler; +extern const struct xattr_handler ext2_xattr_acl_default_handler; +extern const struct xattr_handler ext2_xattr_security_handler; extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); @@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *); extern int init_ext2_xattr(void); extern void exit_ext2_xattr(void); -extern struct xattr_handler *ext2_xattr_handlers[]; +extern const struct xattr_handler *ext2_xattr_handlers[]; # else /* CONFIG_EXT2_FS_XATTR */ diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index b118c6383c6..3004e15d5da 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -67,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir) return err; } -struct xattr_handler ext2_xattr_security_handler = { +const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext2_xattr_security_list, .get = ext2_xattr_security_get, diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 2a26d71f477..667e46a8d62 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name, value, size, flags); } -struct xattr_handler ext2_xattr_trusted_handler = { +const struct xattr_handler ext2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ext2_xattr_trusted_list, .get = ext2_xattr_trusted_get, diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 3f6caf3684b..099d20f4716 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext2_xattr_user_handler = { +const struct xattr_handler ext2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext2_xattr_user_list, .get = ext2_xattr_user_get, diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 82ba3415866..01552abbca3 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -456,7 +456,7 @@ release_and_out: return error; } -struct xattr_handler ext3_xattr_acl_access_handler = { +const struct xattr_handler ext3_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ext3_xattr_list_acl_access, @@ -464,7 +464,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = { .set = ext3_xattr_set_acl, }; -struct xattr_handler ext3_xattr_acl_default_handler = { +const struct xattr_handler ext3_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ext3_xattr_list_acl_default, diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index 26289e8f416..fcf7487734b 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -90,6 +90,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) * storage */ if (needs_barrier) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); return ret; } diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 0d0e97ed3ff..498021eb88f 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -538,16 +538,13 @@ got: if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - inode->i_uid = current_fsuid(); - if (test_opt (sb, GRPID)) - inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { + + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; /* This is the optimal IO size (for stat), not the fs block size */ diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 534a94c3a93..71fb8d65e54 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer, static struct mb_cache *ext3_xattr_cache; -static struct xattr_handler *ext3_xattr_handler_map[] = { +static const struct xattr_handler *ext3_xattr_handler_map[] = { [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, @@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = { #endif }; -struct xattr_handler *ext3_xattr_handlers[] = { +const struct xattr_handler *ext3_xattr_handlers[] = { &ext3_xattr_user_handler, &ext3_xattr_trusted_handler, #ifdef CONFIG_EXT3_FS_POSIX_ACL @@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = { NULL }; -static inline struct xattr_handler * +static inline const struct xattr_handler * ext3_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) handler = ext3_xattr_handler_map[name_index]; @@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext3_xattr_handler(entry->e_name_index); if (handler) { diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 148a4dfc82a..377fe720116 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -58,11 +58,11 @@ struct ext3_xattr_entry { # ifdef CONFIG_EXT3_FS_XATTR -extern struct xattr_handler ext3_xattr_user_handler; -extern struct xattr_handler ext3_xattr_trusted_handler; -extern struct xattr_handler ext3_xattr_acl_access_handler; -extern struct xattr_handler ext3_xattr_acl_default_handler; -extern struct xattr_handler ext3_xattr_security_handler; +extern const struct xattr_handler ext3_xattr_user_handler; +extern const struct xattr_handler ext3_xattr_trusted_handler; +extern const struct xattr_handler ext3_xattr_acl_access_handler; +extern const struct xattr_handler ext3_xattr_acl_default_handler; +extern const struct xattr_handler ext3_xattr_security_handler; extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); @@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *); extern int init_ext3_xattr(void); extern void exit_ext3_xattr(void); -extern struct xattr_handler *ext3_xattr_handlers[]; +extern const struct xattr_handler *ext3_xattr_handlers[]; # else /* CONFIG_EXT3_FS_XATTR */ diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 3af91f476df..03a99bfc59f 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -69,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) return err; } -struct xattr_handler ext3_xattr_security_handler = { +const struct xattr_handler ext3_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext3_xattr_security_list, .get = ext3_xattr_security_get, diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index e5562845ed9..dc8edda9ffe 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name, value, size, flags); } -struct xattr_handler ext3_xattr_trusted_handler = { +const struct xattr_handler ext3_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ext3_xattr_trusted_list, .get = ext3_xattr_trusted_get, diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 3bcfe9ee0a6..7a321974d58 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext3_xattr_user_handler = { +const struct xattr_handler ext3_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext3_xattr_user_list, .get = ext3_xattr_user_get, diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 8a2a29d35a6..feaf498feaa 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -454,7 +454,7 @@ release_and_out: return error; } -struct xattr_handler ext4_xattr_acl_access_handler = { +const struct xattr_handler ext4_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ext4_xattr_list_acl_access, @@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = { .set = ext4_xattr_set_acl, }; -struct xattr_handler ext4_xattr_acl_default_handler = { +const struct xattr_handler ext4_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ext4_xattr_list_acl_default, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0d0c3239c1c..ef3d980e67c 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -100,9 +100,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) if (ext4_should_writeback_data(inode) && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 57f6eef6ccd..1a0e183a2f0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -979,16 +979,12 @@ got: atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); } - inode->i_uid = current_fsuid(); - if (test_opt(sb, GRPID)) + if (test_opt(sb, GRPID)) { + inode->i_mode = mode; + inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; - else if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b4c5aa8489d..2de0e951508 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer, static struct mb_cache *ext4_xattr_cache; -static struct xattr_handler *ext4_xattr_handler_map[] = { +static const struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, @@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = { #endif }; -struct xattr_handler *ext4_xattr_handlers[] = { +const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = { NULL }; -static inline struct xattr_handler * +static inline const struct xattr_handler * ext4_xattr_handler(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; @@ -332,7 +332,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { - struct xattr_handler *handler = + const struct xattr_handler *handler = ext4_xattr_handler(entry->e_name_index); if (handler) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 8ede88b18c2..518e96e4390 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -65,11 +65,11 @@ struct ext4_xattr_entry { # ifdef CONFIG_EXT4_FS_XATTR -extern struct xattr_handler ext4_xattr_user_handler; -extern struct xattr_handler ext4_xattr_trusted_handler; -extern struct xattr_handler ext4_xattr_acl_access_handler; -extern struct xattr_handler ext4_xattr_acl_default_handler; -extern struct xattr_handler ext4_xattr_security_handler; +extern const struct xattr_handler ext4_xattr_user_handler; +extern const struct xattr_handler ext4_xattr_trusted_handler; +extern const struct xattr_handler ext4_xattr_acl_access_handler; +extern const struct xattr_handler ext4_xattr_acl_default_handler; +extern const struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); @@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, extern int init_ext4_xattr(void); extern void exit_ext4_xattr(void); -extern struct xattr_handler *ext4_xattr_handlers[]; +extern const struct xattr_handler *ext4_xattr_handlers[]; # else /* CONFIG_EXT4_FS_XATTR */ diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 8b145e98df0..9b21268e121 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -69,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) return err; } -struct xattr_handler ext4_xattr_security_handler = { +const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ext4_xattr_security_list, .get = ext4_xattr_security_get, diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 15b50edc658..37e6ebca2cc 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext4_xattr_trusted_handler = { +const struct xattr_handler ext4_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ext4_xattr_trusted_list, .get = ext4_xattr_trusted_get, diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index c4ce05746ce..98c375352d0 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ext4_xattr_user_handler = { +const struct xattr_handler ext4_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ext4_xattr_user_list, .get = ext4_xattr_user_get, diff --git a/fs/fcntl.c b/fs/fcntl.c index 0a140741b39..f74d270ba15 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -14,6 +14,7 @@ #include <linux/dnotify.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/pipe_fs_i.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/signal.h> @@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, case F_NOTIFY: err = fcntl_dirnotify(fd, filp, arg); break; + case F_SETPIPE_SZ: + case F_GETPIPE_SZ: + err = pipe_fcntl(filp, cmd, arg); + break; default: break; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b37f7cea4d..5c4161f1fd9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -42,9 +42,10 @@ struct wb_writeback_args { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; - int for_kupdate:1; - int range_cyclic:1; - int for_background:1; + unsigned int for_kupdate:1; + unsigned int range_cyclic:1; + unsigned int for_background:1; + unsigned int sb_pinned:1; }; /* @@ -192,7 +193,8 @@ static void bdi_wait_on_work_clear(struct bdi_work *work) } static void bdi_alloc_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_args *args) + struct wb_writeback_args *args, + int wait) { struct bdi_work *work; @@ -204,6 +206,8 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, if (work) { bdi_work_init(work, args); bdi_queue_work(bdi, work); + if (wait) + bdi_wait_on_work_clear(work); } else { struct bdi_writeback *wb = &bdi->wb; @@ -230,6 +234,11 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, + /* + * Setting sb_pinned is not necessary for WB_SYNC_ALL, but + * lets make it explicitly clear. + */ + .sb_pinned = 1, }; struct bdi_work work; @@ -245,21 +254,23 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, * @bdi: the backing device to write from * @sb: write inodes from this super_block * @nr_pages: the number of pages to write + * @sb_locked: caller already holds sb umount sem. * * Description: * This does WB_SYNC_NONE opportunistic writeback. The IO is only * started when this function returns, we make no guarentees on - * completion. Caller need not hold sb s_umount semaphore. + * completion. Caller specifies whether sb umount sem is held already or not. * */ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages) + long nr_pages, int sb_locked) { struct wb_writeback_args args = { .sb = sb, .sync_mode = WB_SYNC_NONE, .nr_pages = nr_pages, .range_cyclic = 1, + .sb_pinned = sb_locked, }; /* @@ -271,7 +282,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, args.for_background = 1; } - bdi_alloc_queue_work(bdi, &args); + bdi_alloc_queue_work(bdi, &args, sb_locked); } /* @@ -452,11 +463,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) BUG_ON(inode->i_state & I_SYNC); - /* Set I_SYNC, reset I_DIRTY */ - dirty = inode->i_state & I_DIRTY; + /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; - inode->i_state &= ~I_DIRTY; - + inode->i_state &= ~I_DIRTY_PAGES; spin_unlock(&inode_lock); ret = do_writepages(mapping, wbc); @@ -472,6 +481,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } + /* + * Some filesystems may redirty the inode during the writeback + * due to delalloc, clear dirty metadata flags right before + * write_inode() + */ + spin_lock(&inode_lock); + dirty = inode->i_state & I_DIRTY; + inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + spin_unlock(&inode_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -577,7 +595,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, /* * Caller must already hold the ref for this */ - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { WARN_ON(!rwsem_is_locked(&sb->s_umount)); return SB_NOT_PINNED; } @@ -751,6 +769,7 @@ static long wb_writeback(struct bdi_writeback *wb, .for_kupdate = args->for_kupdate, .for_background = args->for_background, .range_cyclic = args->range_cyclic, + .sb_pinned = args->sb_pinned, }; unsigned long oldest_jif; long wrote = 0; @@ -852,6 +871,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) unsigned long expired; long nr_pages; + /* + * When set to zero, disable periodic writeback + */ + if (!dirty_writeback_interval) + return 0; + expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); if (time_before(jiffies, expired)) @@ -887,6 +912,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) while ((work = get_next_work_item(bdi, wb)) != NULL) { struct wb_writeback_args args = work->args; + int post_clear; /* * Override sync mode, in case we must wait for completion @@ -894,11 +920,13 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; + post_clear = WB_SYNC_ALL || args.sb_pinned; + /* * If this isn't a data integrity operation, just notify * that we have seen this work and we are now starting it. */ - if (args.sync_mode == WB_SYNC_NONE) + if (!post_clear) wb_clear_pending(wb, work); wrote += wb_writeback(wb, &args); @@ -907,7 +935,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * This is a data integrity writeback, so only do the * notification when we have completed the work. */ - if (args.sync_mode == WB_SYNC_ALL) + if (post_clear) wb_clear_pending(wb, work); } @@ -947,8 +975,17 @@ int bdi_writeback_task(struct bdi_writeback *wb) break; } - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); - schedule_timeout_interruptible(wait_jiffies); + if (dirty_writeback_interval) { + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout_interruptible(wait_jiffies); + } else { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty_careful(&wb->bdi->work_list) && + !kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + try_to_freeze(); } @@ -974,7 +1011,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages) if (!bdi_has_dirty_io(bdi)) continue; - bdi_alloc_queue_work(bdi, &args); + bdi_alloc_queue_work(bdi, &args, 0); } rcu_read_unlock(); @@ -1183,6 +1220,18 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } +static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) +{ + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; + + nr_to_write = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked); +} + /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock @@ -1194,18 +1243,23 @@ static void wait_sb_inodes(struct super_block *sb) */ void writeback_inodes_sb(struct super_block *sb) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - long nr_to_write; - - nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - - bdi_start_writeback(sb->s_bdi, sb, nr_to_write); + __writeback_inodes_sb(sb, 0); } EXPORT_SYMBOL(writeback_inodes_sb); /** + * writeback_inodes_sb_locked - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Like writeback_inodes_sb(), except the caller already holds the + * sb umount sem. + */ +void writeback_inodes_sb_locked(struct super_block *sb) +{ + __writeback_inodes_sb(sb, 1); +} + +/** * writeback_inodes_sb_if_idle - start writeback if none underway * @sb: the superblock * diff --git a/fs/generic_acl.c b/fs/generic_acl.c index fe5df545765..99800e56415 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -201,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask) return -EAGAIN; } -struct xattr_handler generic_acl_access_handler = { +const struct xattr_handler generic_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = generic_acl_list, @@ -209,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = { .set = generic_acl_set, }; -struct xattr_handler generic_acl_default_handler = { +const struct xattr_handler generic_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = generic_acl_list, diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 87ee309d4c2..9fb76b0a048 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -335,7 +335,7 @@ out: return error; } -struct xattr_handler gfs2_xattr_system_handler = { +const struct xattr_handler gfs2_xattr_system_handler = { .prefix = XATTR_SYSTEM_PREFIX, .flags = GFS2_EATYPE_SYS, .get = gfs2_xattr_system_get, diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 9306a2e6620..b522b0cb39e 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -19,6 +19,6 @@ extern int gfs2_check_acl(struct inode *inode, int mask); extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); -extern struct xattr_handler gfs2_xattr_system_handler; +extern const struct xattr_handler gfs2_xattr_system_handler; #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 8bce73ed4d8..117fa4171f6 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, if ((start + nr_sects) != blk) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | + BLKDEV_IFL_BARRIER); if (rv) goto fail; nr_sects = 0; @@ -869,7 +870,7 @@ start_new_extent: } if (nr_sects) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); if (rv) goto fail; } diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 3df60f2d84e..a0464680af0 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; extern const struct super_operations gfs2_super_ops; extern const struct dentry_operations gfs2_dops; -extern struct xattr_handler *gfs2_xattr_handlers[]; +extern const struct xattr_handler *gfs2_xattr_handlers[]; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index c2ebdf2c01d..82f93da00d1 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1535,21 +1535,21 @@ out_alloc: return error; } -static struct xattr_handler gfs2_xattr_user_handler = { +static const struct xattr_handler gfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = GFS2_EATYPE_USR, .get = gfs2_xattr_get, .set = gfs2_xattr_set, }; -static struct xattr_handler gfs2_xattr_security_handler = { +static const struct xattr_handler gfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = GFS2_EATYPE_SECURITY, .get = gfs2_xattr_get, .set = gfs2_xattr_set, }; -struct xattr_handler *gfs2_xattr_handlers[] = { +const struct xattr_handler *gfs2_xattr_handlers[] = { &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, &gfs2_xattr_system_handler, diff --git a/fs/inode.c b/fs/inode.c index 258ec22bb29..2bee20ae3d6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -286,11 +286,9 @@ static void init_once(void *foo) */ void __iget(struct inode *inode) { - if (atomic_read(&inode->i_count)) { - atomic_inc(&inode->i_count); + if (atomic_inc_return(&inode->i_count) != 1) return; - } - atomic_inc(&inode->i_count); + if (!(inode->i_state & (I_DIRTY|I_SYNC))) list_move(&inode->i_list, &inode_in_use); inodes_stat.nr_unused--; @@ -1608,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_ino); } EXPORT_SYMBOL(init_special_inode); + +/** + * Init uid,gid,mode for new inode according to posix standards + * @inode: New inode + * @dir: Directory inode + * @mode: mode of the new inode + */ +void inode_init_owner(struct inode *inode, const struct inode *dir, + mode_t mode) +{ + inode->i_uid = current_fsuid(); + if (dir && dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +} +EXPORT_SYMBOL(inode_init_owner); diff --git a/fs/internal.h b/fs/internal.h index 8a03a5447bd..6b706bc60a6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -87,6 +87,8 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); +extern void __put_super(struct super_block *sb); +extern void put_super(struct super_block *sb); /* * open.c diff --git a/fs/ioctl.c b/fs/ioctl.c index 7faefb4da93..2d140a71386 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -525,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp) if (sb->s_op->freeze_fs == NULL) return -EOPNOTSUPP; - /* If a blockdevice-backed filesystem isn't specified, return. */ - if (sb->s_bdev == NULL) - return -EINVAL; - /* Freeze */ - sb = freeze_bdev(sb->s_bdev); - if (IS_ERR(sb)) - return PTR_ERR(sb); - return 0; + return freeze_super(sb); } static int ioctl_fsthaw(struct file *filp) @@ -543,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ - if (sb->s_bdev == NULL) - return -EINVAL; - /* Thaw */ - return thaw_bdev(sb->s_bdev, sb); + return thaw_super(sb); } /* diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 30beb11ef92..076d1cc44f9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) */ if ((journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); if (!(journal->j_flags & JBD2_ABORT)) jbd2_journal_update_superblock(journal, 1); return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 671da7fb7ff..75716d3d2be 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -717,7 +717,8 @@ start_journal_io: if (commit_transaction->t_flushed_data_blocks && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, @@ -727,7 +728,8 @@ start_journal_io: if (err) __jbd2_journal_abort_hard(journal); if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_dev, NULL); + blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); } err = journal_finish_inode_data_buffers(journal, commit_transaction); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 7cdc3196476..a33aab6b5e6 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -419,7 +419,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name, return rc; } -struct xattr_handler jffs2_acl_access_xattr_handler = { +const struct xattr_handler jffs2_acl_access_xattr_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_access_listxattr, @@ -427,7 +427,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = { .set = jffs2_acl_setxattr, }; -struct xattr_handler jffs2_acl_default_xattr_handler = { +const struct xattr_handler jffs2_acl_default_xattr_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = jffs2_acl_default_listxattr, diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index f0ba63e3c36..5e42de8d954 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); -extern struct xattr_handler jffs2_acl_access_xattr_handler; -extern struct xattr_handler jffs2_acl_default_xattr_handler; +extern const struct xattr_handler jffs2_acl_access_xattr_handler; +extern const struct xattr_handler jffs2_acl_default_xattr_handler; #else diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index eaccee05858..239f51216a6 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list, return retlen; } -struct xattr_handler jffs2_security_xattr_handler = { +const struct xattr_handler jffs2_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = jffs2_security_listxattr, .set = jffs2_security_setxattr, diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 9e75c62c85d..a2d58c96f1b 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) * is an implementation of setxattr handler on jffs2. * -------------------------------------------------- */ -struct xattr_handler *jffs2_xattr_handlers[] = { +const struct xattr_handler *jffs2_xattr_handlers[] = { &jffs2_user_xattr_handler, #ifdef CONFIG_JFFS2_FS_SECURITY &jffs2_security_xattr_handler, @@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = { NULL }; -static struct xattr_handler *xprefix_to_handler(int xprefix) { - struct xattr_handler *ret; +static const struct xattr_handler *xprefix_to_handler(int xprefix) { + const struct xattr_handler *ret; switch (xprefix) { case JFFS2_XPREFIX_USER: @@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) struct jffs2_inode_cache *ic = f->inocache; struct jffs2_xattr_ref *ref, **pref; struct jffs2_xattr_datum *xd; - struct xattr_handler *xhandle; + const struct xattr_handler *xhandle; ssize_t len, rc; int retry = 0; diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index 6e3b5ddfb7a..cf4f5759b42 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, const char *buffer, size_t size, int flags); -extern struct xattr_handler *jffs2_xattr_handlers[]; -extern struct xattr_handler jffs2_user_xattr_handler; -extern struct xattr_handler jffs2_trusted_xattr_handler; +extern const struct xattr_handler *jffs2_xattr_handlers[]; +extern const struct xattr_handler jffs2_user_xattr_handler; +extern const struct xattr_handler jffs2_trusted_xattr_handler; extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #define jffs2_getxattr generic_getxattr @@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #ifdef CONFIG_JFFS2_FS_SECURITY extern int jffs2_init_security(struct inode *inode, struct inode *dir); -extern struct xattr_handler jffs2_security_xattr_handler; +extern const struct xattr_handler jffs2_security_xattr_handler; #else #define jffs2_init_security(inode,dir) (0) #endif /* CONFIG_JFFS2_FS_SECURITY */ diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 3e5a5e356e0..1c868194c50 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list, return retlen; } -struct xattr_handler jffs2_trusted_xattr_handler = { +const struct xattr_handler jffs2_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = jffs2_trusted_listxattr, .set = jffs2_trusted_setxattr, diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 8544af67dff..916b5c96603 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list, return retlen; } -struct xattr_handler jffs2_user_xattr_handler = { +const struct xattr_handler jffs2_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .list = jffs2_user_listxattr, .set = jffs2_user_setxattr, diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 829921b6776..2686531e235 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) goto fail_unlock; } - inode->i_uid = current_fsuid(); - if (parent->i_mode & S_ISGID) { - inode->i_gid = parent->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - + inode_init_owner(inode, parent, mode); /* * New inodes need to save sane values on disk when * uid & gid mount options are used @@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode) if (rc) goto fail_drop; - inode->i_mode = mode; /* inherit flags from parent */ jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; @@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) if (S_ISLNK(mode)) jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); } - jfs_inode->mode2 |= mode; + jfs_inode->mode2 |= inode->i_mode; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 755a92e8daa..f602e230e16 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -358,14 +358,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode) inode->i_mode = mode; logfs_set_ino_generation(sb, inode); - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } - + inode_init_owner(inode, dir, mode); logfs_inode_setops(inode); insert_inode_hash(inode); diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 6ac693faae4..482779fe4e7 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode) clear_inode(inode); /* clear in-memory copy */ } -struct inode * minix_new_inode(const struct inode * dir, int * error) +struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) { struct super_block *sb = dir->i_sb; struct minix_sb_info *sbi = minix_sb(sb); @@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error) iput(inode); return NULL; } - inode->i_uid = current_fsuid(); - inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid(); + inode_init_owner(inode, dir, mode); inode->i_ino = j; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; inode->i_blocks = 0; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 9dcf95b4211..111f34ee9e3 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -46,7 +46,7 @@ struct minix_sb_info { extern struct inode *minix_iget(struct super_block *, unsigned long); extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); -extern struct inode * minix_new_inode(const struct inode * dir, int * error); +extern struct inode * minix_new_inode(const struct inode *, int, int *); extern void minix_free_inode(struct inode * inode); extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); extern int minix_new_block(struct inode * inode); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 32b131cd612..e20ee85955d 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_ if (!old_valid_dev(rdev)) return -EINVAL; - inode = minix_new_inode(dir, &error); + inode = minix_new_inode(dir, mode, &error); if (inode) { - inode->i_mode = mode; minix_set_inode(inode, rdev); mark_inode_dirty(inode); error = add_nondir(dentry, inode); @@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry, if (i > dir->i_sb->s_blocksize) goto out; - inode = minix_new_inode(dir, &err); + inode = minix_new_inode(dir, S_IFLNK | 0777, &err); if (!inode) goto out; - inode->i_mode = S_IFLNK | 0777; minix_set_inode(inode, 0); err = page_symlink(inode, symname, i); if (err) @@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) inode_inc_link_count(dir); - inode = minix_new_inode(dir, &err); + inode = minix_new_inode(dir, mode, &err); if (!inode) goto out_dir; - inode->i_mode = S_IFDIR | mode; - if (dir->i_mode & S_ISGID) - inode->i_mode |= S_ISGID; minix_set_inode(inode, 0); inode_inc_link_count(inode); diff --git a/fs/namei.c b/fs/namei.c index b86b96fe1dc..48e1f60520e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -523,9 +523,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd) static inline void path_to_nameidata(struct path *path, struct nameidata *nd) { dput(nd->path.dentry); - if (nd->path.mnt != path->mnt) + if (nd->path.mnt != path->mnt) { mntput(nd->path.mnt); - nd->path.mnt = path->mnt; + nd->path.mnt = path->mnt; + } nd->path.dentry = path->dentry; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 7a9ae3254a4..7e26caab2a2 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -44,8 +44,7 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC /* Globals */ -static struct path rec_dir; -static int rec_dir_init = 0; +static struct file *rec_file; static int nfs4_save_creds(const struct cred **original_creds) @@ -117,33 +116,28 @@ out_no_tfm: return status; } -static void -nfsd4_sync_rec_dir(void) -{ - vfs_fsync(NULL, rec_dir.dentry, 0); -} - int nfsd4_create_clid_dir(struct nfs4_client *clp) { const struct cred *original_cred; char *dname = clp->cl_recdir; - struct dentry *dentry; + struct dentry *dir, *dentry; int status; dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); - if (!rec_dir_init || clp->cl_firststate) + if (!rec_file || clp->cl_firststate) return 0; status = nfs4_save_creds(&original_cred); if (status < 0) return status; + dir = rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&rec_dir.dentry->d_inode->i_mutex); + mutex_lock(&dir->d_inode->i_mutex); - dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1); + dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; @@ -153,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); goto out_put; } - status = mnt_want_write(rec_dir.mnt); + status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out_put; - status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU); - mnt_drop_write(rec_dir.mnt); + status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); + mnt_drop_write(rec_file->f_path.mnt); out_put: dput(dentry); out_unlock: - mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); + mutex_unlock(&dir->d_inode->i_mutex); if (status == 0) { clp->cl_firststate = 1; - nfsd4_sync_rec_dir(); + vfs_fsync(rec_file, 0); } nfs4_reset_creds(original_cred); dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); @@ -206,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) struct dentry *dentry; int status; - if (!rec_dir_init) + if (!rec_file) return 0; status = nfs4_save_creds(&original_cred); if (status < 0) return status; - filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, + filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY, current_cred()); status = PTR_ERR(filp); if (IS_ERR(filp)) @@ -250,13 +244,14 @@ out: static int nfsd4_unlink_clid_dir(char *name, int namlen) { - struct dentry *dentry; + struct dentry *dir, *dentry; int status; dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); - mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(name, rec_dir.dentry, namlen); + dir = rec_file->f_path.dentry; + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; @@ -264,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen) status = -ENOENT; if (!dentry->d_inode) goto out; - status = vfs_rmdir(rec_dir.dentry->d_inode, dentry); + status = vfs_rmdir(dir->d_inode, dentry); out: dput(dentry); out_unlock: - mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); + mutex_unlock(&dir->d_inode->i_mutex); return status; } @@ -278,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) const struct cred *original_cred; int status; - if (!rec_dir_init || !clp->cl_firststate) + if (!rec_file || !clp->cl_firststate) return; - status = mnt_want_write(rec_dir.mnt); + status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out; clp->cl_firststate = 0; @@ -293,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); nfs4_reset_creds(original_cred); if (status == 0) - nfsd4_sync_rec_dir(); - mnt_drop_write(rec_dir.mnt); + vfs_fsync(rec_file, 0); + mnt_drop_write(rec_file->f_path.mnt); out: if (status) printk("NFSD: Failed to remove expired client state directory" @@ -323,19 +318,19 @@ void nfsd4_recdir_purge_old(void) { int status; - if (!rec_dir_init) + if (!rec_file) return; - status = mnt_want_write(rec_dir.mnt); + status = mnt_want_write(rec_file->f_path.mnt); if (status) goto out; - status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old); + status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old); if (status == 0) - nfsd4_sync_rec_dir(); - mnt_drop_write(rec_dir.mnt); + vfs_fsync(rec_file, 0); + mnt_drop_write(rec_file->f_path.mnt); out: if (status) printk("nfsd4: failed to purge old clients from recovery" - " directory %s\n", rec_dir.dentry->d_name.name); + " directory %s\n", rec_file->f_path.dentry->d_name.name); } static int @@ -355,10 +350,13 @@ int nfsd4_recdir_load(void) { int status; - status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir); + if (!rec_file) + return 0; + + status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir); if (status) printk("nfsd4: failed loading clients from recovery" - " directory %s\n", rec_dir.dentry->d_name.name); + " directory %s\n", rec_file->f_path.dentry->d_name.name); return status; } @@ -375,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname) printk("NFSD: Using %s as the NFSv4 state recovery directory\n", rec_dirname); - BUG_ON(rec_dir_init); + BUG_ON(rec_file); status = nfs4_save_creds(&original_cred); if (status < 0) { @@ -385,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname) return; } - status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, - &rec_dir); - if (status) + rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0); + if (IS_ERR(rec_file)) { printk("NFSD: unable to find recovery directory %s\n", rec_dirname); + rec_file = NULL; + } - if (!status) - rec_dir_init = 1; nfs4_reset_creds(original_cred); } void nfsd4_shutdown_recdir(void) { - if (!rec_dir_init) + if (!rec_file) return; - rec_dir_init = 0; - path_put(&rec_dir); + fput(rec_file); + rec_file = NULL; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 23c06f77f4c..ebbf3b6b245 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -999,7 +999,7 @@ static int wait_for_concurrent_writes(struct file *file) if (inode->i_state & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); - err = vfs_fsync(file, file->f_path.dentry, 0); + err = vfs_fsync(file, 0); } last_ino = inode->i_ino; last_dev = inode->i_sb->s_dev; @@ -1175,8 +1175,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { - int err2 = vfs_fsync_range(file, file->f_path.dentry, - offset, end, 0); + int err2 = vfs_fsync_range(file, offset, end, 0); if (err2 != -EINVAL) err = nfserrno(err2); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 5e226d4b41d..39e038ac8fc 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -280,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode) /* reference count of i_bh inherits from nilfs_mdt_read_block() */ atomic_inc(&sbi->s_inodes_count); - - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index a756168a21c..8c1097327ab 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -674,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS, - DISCARD_FL_BARRIER); + BLKDEV_IFL_BARRIER); if (ret < 0) return ret; nblocks = 0; @@ -684,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, DISCARD_FL_BARRIER); + GFP_NOFS, BLKDEV_IFL_BARRIER); return ret; } diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index 40b1cf914cc..27b75ebc746 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c @@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch); int pin_inotify_watch(struct inotify_watch *watch) { struct super_block *sb = watch->inode->i_sb; - spin_lock(&sb_lock); - if (sb->s_count >= S_BIAS) { - atomic_inc(&sb->s_active); - spin_unlock(&sb_lock); + if (atomic_inc_not_zero(&sb->s_active)) { atomic_inc(&watch->count); return 1; } - spin_unlock(&sb_lock); return 0; } @@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch); * done. Cleanup is just deactivate_super(). However, that leaves a messy * case - what if we *are* racing with umount() and active references to * superblock can't be acquired anymore? We can bump ->s_count, grab - * ->s_umount, which will almost certainly wait until the superblock is shut - * down and the watch in question is pining for fjords. That's fine, but - * there is a problem - we might have hit the window between ->s_active - * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock - * is past the point of no return and is heading for shutdown) and the - * moment when deactivate_super() acquires ->s_umount. We could just do - * drop_super() yield() and retry, but that's rather antisocial and this - * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having - * found that we'd got there first (i.e. that ->s_root is non-NULL) we know - * that we won't race with inotify_umount_inodes(). So we could grab a - * reference to watch and do the rest as above, just with drop_super() instead - * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we - * could grab ->s_umount. So the watch could've been gone already. - * - * That still can be dealt with - we need to save watch->wd, do idr_find() - * and compare its result with our pointer. If they match, we either have - * the damn thing still alive or we'd lost not one but two races at once, - * the watch had been killed and a new one got created with the same ->wd - * at the same address. That couldn't have happened in inotify_destroy(), - * but inotify_rm_wd() could run into that. Still, "new one got created" - * is not a problem - we have every right to kill it or leave it alone, - * whatever's more convenient. - * - * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as - * "grab it and kill it" check. If it's been our original watch, we are - * fine, if it's a newcomer - nevermind, just pretend that we'd won the - * race and kill the fscker anyway; we are safe since we know that its - * superblock won't be going away. + * ->s_umount, which will wait until the superblock is shut down and the + * watch in question is pining for fjords. * * And yes, this is far beyond mere "not very pretty"; so's the entire * concept of inotify to start with. @@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch); * Called with ih->mutex held, drops it. Possible return values: * 0 - nothing to do, it has died * 1 - remove it, drop the reference and deactivate_super() - * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid - * that variant, since it involved a lot of PITA, but that's the best that - * could've been done. */ static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) { struct super_block *sb = watch->inode->i_sb; - s32 wd = watch->wd; - spin_lock(&sb_lock); - if (sb->s_count >= S_BIAS) { - atomic_inc(&sb->s_active); - spin_unlock(&sb_lock); + if (atomic_inc_not_zero(&sb->s_active)) { get_inotify_watch(watch); mutex_unlock(&ih->mutex); return 1; /* the best outcome */ } + spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ down_read(&sb->s_umount); - if (likely(!sb->s_root)) { - /* fs is already shut down; the watch is dead */ - drop_super(sb); - return 0; - } - /* raced with the final deactivate_super() */ - mutex_lock(&ih->mutex); - if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) { - /* the watch is dead */ - mutex_unlock(&ih->mutex); - drop_super(sb); - return 0; - } - /* still alive or freed and reused with the same sb and wd; kill */ - get_inotify_watch(watch); - mutex_unlock(&ih->mutex); - return 2; + /* fs is already shut down; the watch is dead */ + drop_super(sb); + return 0; } -static void unpin_and_kill(struct inotify_watch *watch, int how) +static void unpin_and_kill(struct inotify_watch *watch) { struct super_block *sb = watch->inode->i_sb; put_inotify_watch(watch); - switch (how) { - case 1: - deactivate_super(sb); - break; - case 2: - drop_super(sb); - } + deactivate_super(sb); } /** @@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih) struct list_head *watches; struct super_block *sb; struct inode *inode; - int how; mutex_lock(&ih->mutex); watches = &ih->watches; @@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih) } watch = list_first_entry(watches, struct inotify_watch, h_list); sb = watch->inode->i_sb; - how = pin_to_kill(ih, watch); - if (!how) + if (!pin_to_kill(ih, watch)) continue; inode = watch->inode; @@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih) mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - unpin_and_kill(watch, how); + unpin_and_kill(watch); } /* free this handle: the put matching the get in inotify_init() */ @@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) struct inotify_watch *watch; struct super_block *sb; struct inode *inode; - int how; mutex_lock(&ih->mutex); watch = idr_find(&ih->idr, wd); @@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) return -EINVAL; } sb = watch->inode->i_sb; - how = pin_to_kill(ih, watch); - if (!how) + if (!pin_to_kill(ih, watch)) return 0; inode = watch->inode; @@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - unpin_and_kill(watch, how); + unpin_and_kill(watch); return 0; } diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e13fc9e8fcd..da702294d7e 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -489,7 +489,7 @@ cleanup: return ret; } -struct xattr_handler ocfs2_xattr_acl_access_handler = { +const struct xattr_handler ocfs2_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .list = ocfs2_xattr_list_acl_access, @@ -497,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = { .set = ocfs2_xattr_set_acl, }; -struct xattr_handler ocfs2_xattr_acl_default_handler = { +const struct xattr_handler ocfs2_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = ocfs2_xattr_list_acl_default, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index db5dd3ed4df..f171b51a74f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) inode->i_nlink = 2; else inode->i_nlink = 1; - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); dquot_initialize(inode); return inode; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 98ee6c44102..e97b34842cf 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -97,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), }; -struct xattr_handler *ocfs2_xattr_handlers[] = { +const struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, &ocfs2_xattr_acl_access_handler, &ocfs2_xattr_acl_default_handler, @@ -106,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = { NULL }; -static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { +static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ocfs2_xattr_acl_access_handler, @@ -540,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, static inline const char *ocfs2_xattr_prefix(int name_index) { - struct xattr_handler *handler = NULL; + const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < OCFS2_XATTR_MAX) handler = ocfs2_xattr_handler_map[name_index]; @@ -7213,7 +7213,7 @@ int ocfs2_init_security_set(handle_t *handle, xattr_ac, data_ac); } -struct xattr_handler ocfs2_xattr_security_handler = { +const struct xattr_handler ocfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = ocfs2_xattr_security_list, .get = ocfs2_xattr_security_get, @@ -7257,7 +7257,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ocfs2_xattr_trusted_handler = { +const struct xattr_handler ocfs2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = ocfs2_xattr_trusted_list, .get = ocfs2_xattr_trusted_get, @@ -7313,7 +7313,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, name, value, size, flags); } -struct xattr_handler ocfs2_xattr_user_handler = { +const struct xattr_handler ocfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = ocfs2_xattr_user_list, .get = ocfs2_xattr_user_get, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index abd72a47f52..aa64bb37a65 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info { size_t value_len; }; -extern struct xattr_handler ocfs2_xattr_user_handler; -extern struct xattr_handler ocfs2_xattr_trusted_handler; -extern struct xattr_handler ocfs2_xattr_security_handler; -extern struct xattr_handler ocfs2_xattr_acl_access_handler; -extern struct xattr_handler ocfs2_xattr_acl_default_handler; -extern struct xattr_handler *ocfs2_xattr_handlers[]; +extern const struct xattr_handler ocfs2_xattr_user_handler; +extern const struct xattr_handler ocfs2_xattr_trusted_handler; +extern const struct xattr_handler ocfs2_xattr_security_handler; +extern const struct xattr_handler ocfs2_xattr_acl_access_handler; +extern const struct xattr_handler ocfs2_xattr_acl_default_handler; +extern const struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index b44bb835e8e..089839a6cc6 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -37,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode) goto fail; inode->i_ino = new_block; - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/open.c b/fs/open.c index 74e5cd9f718..5463266db9e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -17,7 +17,6 @@ #include <linux/securebits.h> #include <linux/security.h> #include <linux/mount.h> -#include <linux/vfs.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <asm/uaccess.h> @@ -33,171 +32,6 @@ #include "internal.h" -int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - int retval = -ENODEV; - - if (dentry) { - retval = -ENOSYS; - if (dentry->d_sb->s_op->statfs) { - memset(buf, 0, sizeof(*buf)); - retval = security_sb_statfs(dentry); - if (retval) - return retval; - retval = dentry->d_sb->s_op->statfs(dentry, buf); - if (retval == 0 && buf->f_frsize == 0) - buf->f_frsize = buf->f_bsize; - } - } - return retval; -} - -EXPORT_SYMBOL(vfs_statfs); - -static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) -{ - struct kstatfs st; - int retval; - - retval = vfs_statfs(dentry, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); - else { - if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail | - st.f_bsize | st.f_frsize) & - 0xffffffff00000000ULL) - return -EOVERFLOW; - /* - * f_files and f_ffree may be -1; it's okay to stuff - * that into 32 bits - */ - if (st.f_files != -1 && - (st.f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (st.f_ffree != -1 && - (st.f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } - - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); - } - return 0; -} - -static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) -{ - struct kstatfs st; - int retval; - - retval = vfs_statfs(dentry, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); - else { - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); - } - return 0; -} - -SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) -{ - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct statfs tmp; - error = vfs_statfs_native(path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } - return error; -} - -SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) -{ - struct path path; - long error; - - if (sz != sizeof(*buf)) - return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct statfs64 tmp; - error = vfs_statfs64(path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } - return error; -} - -SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) -{ - struct file * file; - struct statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs_native(file->f_path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: - return error; -} - -SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) -{ - struct file * file; - struct statfs64 tmp; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs64(file->f_path.dentry, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: - return error; -} - int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c index a97b477ac0f..6921e7890be 100644 --- a/fs/partitions/acorn.c +++ b/fs/partitions/acorn.c @@ -70,14 +70,14 @@ struct riscix_record { #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) -static int -riscix_partition(struct parsed_partitions *state, struct block_device *bdev, - unsigned long first_sect, int slot, unsigned long nr_sects) +static int riscix_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) { Sector sect; struct riscix_record *rr; - rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, §); + rr = read_part_sector(state, first_sect, §); if (!rr) return -1; @@ -123,9 +123,9 @@ struct linux_part { #if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ defined(CONFIG_ACORN_PARTITION_ADFS) -static int -linux_partition(struct parsed_partitions *state, struct block_device *bdev, - unsigned long first_sect, int slot, unsigned long nr_sects) +static int linux_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) { Sector sect; struct linux_part *linuxp; @@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, put_partition(state, slot++, first_sect, size); - linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, §); + linuxp = read_part_sector(state, first_sect, §); if (!linuxp) return -1; @@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev, #endif #ifdef CONFIG_ACORN_PARTITION_CUMANA -int -adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_CUMANA(struct parsed_partitions *state) { unsigned long first_sector = 0; unsigned int start_blk = 0; @@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev struct adfs_discrecord *dr; unsigned int nr_sects; - data = read_dev_sector(bdev, start_blk * 2 + 6, §); + data = read_part_sector(state, start_blk * 2 + 6, §); if (!data) return -1; @@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: /* RISCiX - we don't know how to find the next one. */ - slot = riscix_partition(state, bdev, first_sector, - slot, nr_sects); + slot = riscix_partition(state, first_sector, slot, + nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, bdev, first_sector, - slot, nr_sects); + slot = linux_partition(state, first_sector, slot, + nr_sects); break; } put_dev_sector(sect); @@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev * hda1 = ADFS partition on first drive. * hda2 = non-ADFS partition. */ -int -adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_ADFS(struct parsed_partitions *state) { unsigned long start_sect, nr_sects, sectscyl, heads; Sector sect; @@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) unsigned char id; int slot = 1; - data = read_dev_sector(bdev, 6, §); + data = read_part_sector(state, 6, §); if (!data) return -1; @@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) /* * Work out start of non-adfs partition. */ - nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect; + nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; if (start_sect) { switch (id) { #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: case PARTITION_RISCIX_MFM: - slot = riscix_partition(state, bdev, start_sect, - slot, nr_sects); + slot = riscix_partition(state, start_sect, slot, + nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, bdev, start_sect, - slot, nr_sects); + slot = linux_partition(state, start_sect, slot, + nr_sects); break; } } @@ -308,10 +306,11 @@ struct ics_part { __le32 size; }; -static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block) +static int adfspart_check_ICSLinux(struct parsed_partitions *state, + unsigned long block) { Sector sect; - unsigned char *data = read_dev_sector(bdev, block, §); + unsigned char *data = read_part_sector(state, block, §); int result = 0; if (data) { @@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data) * hda2 = ADFS partition 1 on first drive. * ..etc.. */ -int -adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_ICS(struct parsed_partitions *state) { const unsigned char *data; const struct ics_part *p; @@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) /* * Try ICS style partitions - sector 0 contains partition info. */ - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev) * partition is. We must not make this visible * to the filesystem. */ - if (size > 1 && adfspart_check_ICSLinux(bdev, start)) { + if (size > 1 && adfspart_check_ICSLinux(state, start)) { start += 1; size -= 1; } @@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data) * hda2 = ADFS partition 1 on first drive. * ..etc.. */ -int -adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_POWERTEC(struct parsed_partitions *state) { Sector sect; const unsigned char *data; @@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd int slot = 1; int i; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -508,8 +505,7 @@ static const char eesox_name[] = { * 1. The individual ADFS boot block entries that are placed on the disk. * 2. The start address of the next entry. */ -int -adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) +int adfspart_check_EESOX(struct parsed_partitions *state) { Sector sect; const unsigned char *data; @@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) sector_t start = 0; int i, slot = 1; - data = read_dev_sector(bdev, 7, §); + data = read_part_sector(state, 7, §); if (!data) return -1; @@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev) if (i != 0) { sector_t size; - size = get_capacity(bdev->bd_disk); + size = get_capacity(state->bdev->bd_disk); put_partition(state, slot++, start, size - start); printk("\n"); } diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h index 81fd50ecc08..ede82852969 100644 --- a/fs/partitions/acorn.h +++ b/fs/partitions/acorn.h @@ -7,8 +7,8 @@ * format, and everyone stick to it? */ -int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev); -int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev); +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c index 9917a8c360f..ba443d4229f 100644 --- a/fs/partitions/amiga.c +++ b/fs/partitions/amiga.c @@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size) return sum; } -int -amiga_partition(struct parsed_partitions *state, struct block_device *bdev) +int amiga_partition(struct parsed_partitions *state) { Sector sect; unsigned char *data; @@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) goto rdb_done; - data = read_dev_sector(bdev, blk, §); + data = read_part_sector(state, blk, §); if (!data) { if (warn_no_part) printk("Dev %s: unable to read RDB block %d\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } @@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) } printk("Dev %s: RDB in block %d has bad checksum\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); } /* blksize is blocks per 512 byte standard block */ @@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev) put_dev_sector(sect); for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { blk *= blksize; /* Read in terms partition table understands */ - data = read_dev_sector(bdev, blk, §); + data = read_part_sector(state, blk, §); if (!data) { if (warn_no_part) printk("Dev %s: unable to read partition block %d\n", - bdevname(bdev, b), blk); + bdevname(state->bdev, b), blk); res = -1; goto rdb_done; } diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h index 2f3e9ce22d5..d094585cada 100644 --- a/fs/partitions/amiga.h +++ b/fs/partitions/amiga.h @@ -2,5 +2,5 @@ * fs/partitions/amiga.h */ -int amiga_partition(struct parsed_partitions *state, struct block_device *bdev); +int amiga_partition(struct parsed_partitions *state); diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c index 1f3572d5b75..4439ff1b6ce 100644 --- a/fs/partitions/atari.c +++ b/fs/partitions/atari.c @@ -30,7 +30,7 @@ static inline int OK_id(char *s) memcmp (s, "RAW", 3) == 0 ; } -int atari_partition(struct parsed_partitions *state, struct block_device *bdev) +int atari_partition(struct parsed_partitions *state) { Sector sect; struct rootsector *rs; @@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev) int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ #endif - rs = (struct rootsector *) read_dev_sector(bdev, 0, §); + rs = read_part_sector(state, 0, §); if (!rs) return -1; /* Verify this is an Atari rootsector: */ - hd_size = bdev->bd_inode->i_size >> 9; + hd_size = state->bdev->bd_inode->i_size >> 9; if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && @@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev) printk(" XGM<"); partsect = extensect = be32_to_cpu(pi->st); while (1) { - xrs = (struct rootsector *)read_dev_sector(bdev, partsect, §2); + xrs = read_part_sector(state, partsect, §2); if (!xrs) { printk (" block %ld read failed\n", partsect); put_dev_sector(sect); diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h index 63186b00e13..fe2d32a89f3 100644 --- a/fs/partitions/atari.h +++ b/fs/partitions/atari.h @@ -31,4 +31,4 @@ struct rootsector u16 checksum; /* checksum for bootable disks */ } __attribute__((__packed__)); -int atari_partition(struct parsed_partitions *state, struct block_device *bdev); +int atari_partition(struct parsed_partitions *state); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index e238ab23a9e..5dcd4b0c553 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev); int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ -static int (*check_part[])(struct parsed_partitions *, struct block_device *) = { +static int (*check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. @@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev) struct parsed_partitions *state; int i, res, err; - state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL); + state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL); if (!state) return NULL; + state->bdev = bdev; disk_name(hd, 0, state->name); printk(KERN_INFO " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) @@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) i = res = err = 0; while (!res && check_part[i]) { memset(&state->parts, 0, sizeof(state->parts)); - res = check_part[i++](state, bdev); + res = check_part[i++](state); if (res < 0) { /* We have hit an I/O error which we don't report now. * But record it, and let the others do their job. @@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev) } if (res > 0) return state; + if (state->access_beyond_eod) + err = -ENOSPC; if (err) /* The partition is unrecognized. So report I/O errors if there were any */ res = err; @@ -538,12 +541,33 @@ exit: disk_part_iter_exit(&piter); } +static bool disk_unlock_native_capacity(struct gendisk *disk) +{ + const struct block_device_operations *bdops = disk->fops; + + if (bdops->unlock_native_capacity && + !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { + printk(KERN_CONT "enabling native capacity\n"); + bdops->unlock_native_capacity(disk); + disk->flags |= GENHD_FL_NATIVE_CAPACITY; + return true; + } else { + printk(KERN_CONT "truncated\n"); + return false; + } +} + int rescan_partitions(struct gendisk *disk, struct block_device *bdev) { + struct parsed_partitions *state = NULL; struct disk_part_iter piter; struct hd_struct *part; - struct parsed_partitions *state; int p, highest, res; +rescan: + if (state && !IS_ERR(state)) { + kfree(state); + state = NULL; + } if (bdev->bd_part_count) return -EBUSY; @@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) return 0; - if (IS_ERR(state)) /* I/O error reading the partition table */ + if (IS_ERR(state)) { + /* + * I/O error reading the partition table. If any + * partition code tried to read beyond EOD, retry + * after unlocking native capacity. + */ + if (PTR_ERR(state) == -ENOSPC) { + printk(KERN_WARNING "%s: partition table beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } return -EIO; + } + /* + * If any partition code tried to read beyond EOD, try + * unlocking native capacity even if partition table is + * sucessfully read as we could be missing some partitions. + */ + if (state->access_beyond_eod) { + printk(KERN_WARNING + "%s: partition table partially beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto rescan; + } /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); @@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size, from; -try_scan: + size = state->parts[p].size; if (!size) continue; @@ -589,30 +637,21 @@ try_scan: from = state->parts[p].from; if (from >= get_capacity(disk)) { printk(KERN_WARNING - "%s: p%d ignored, start %llu is behind the end of the disk\n", + "%s: p%d start %llu is beyond EOD, ", disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) + goto rescan; continue; } if (from + size > get_capacity(disk)) { - const struct block_device_operations *bdops = disk->fops; - unsigned long long capacity; - printk(KERN_WARNING - "%s: p%d size %llu exceeds device capacity, ", + "%s: p%d size %llu extends beyond EOD, ", disk->disk_name, p, (unsigned long long) size); - if (bdops->set_capacity && - (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { - printk(KERN_CONT "enabling native capacity\n"); - capacity = bdops->set_capacity(disk, ~0ULL); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - if (capacity > get_capacity(disk)) { - set_capacity(disk, capacity); - check_disk_size_change(disk, bdev); - bdev->bd_invalidated = 0; - } - goto try_scan; + if (disk_unlock_native_capacity(disk)) { + /* free state and restart */ + goto rescan; } else { /* * we can not ignore partitions of broken tables @@ -620,7 +659,6 @@ try_scan: * we limit them to the end of the disk to avoid * creating invalid block devices */ - printk(KERN_CONT "limited to end of disk\n"); size = get_capacity(disk) - from; } } diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 98dbe1a8452..52f8bd39939 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -6,6 +6,7 @@ * description. */ struct parsed_partitions { + struct block_device *bdev; char name[BDEVNAME_SIZE]; struct { sector_t from; @@ -14,8 +15,19 @@ struct parsed_partitions { } parts[DISK_MAX_PARTS]; int next; int limit; + bool access_beyond_eod; }; +static inline void *read_part_sector(struct parsed_partitions *state, + sector_t n, Sector *p) +{ + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; + } + return read_dev_sector(state->bdev, n, p); +} + static inline void put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) { diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 91babdae758..9efb2cfe241 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len) * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. */ -static u64 -last_lba(struct block_device *bdev) +static u64 last_lba(struct block_device *bdev) { if (!bdev || !bdev->bd_inode) return 0; @@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr) /** * read_lba(): Read bytes from disk, starting at given LBA - * @bdev + * @state * @lba * @buffer * @size_t * - * Description: Reads @count bytes from @bdev into @buffer. + * Description: Reads @count bytes from @state->bdev into @buffer. * Returns number of bytes read on success, 0 on error. */ -static size_t -read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) +static size_t read_lba(struct parsed_partitions *state, + u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; + struct block_device *bdev = state->bdev; sector_t n = lba * (bdev_logical_block_size(bdev) / 512); - if (!bdev || !buffer || lba > last_lba(bdev)) + if (!buffer || lba > last_lba(bdev)) return 0; while (count) { int copied = 512; Sector sect; - unsigned char *data = read_dev_sector(bdev, n++, §); + unsigned char *data = read_part_sector(state, n++, §); if (!data) break; if (copied > count) @@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) /** * alloc_read_gpt_entries(): reads partition entries from disk - * @bdev + * @state * @gpt - GPT header * * Description: Returns ptes on success, NULL on error. * Allocates space for PTEs based on information found in @gpt. * Notes: remember to free pte when you're done! */ -static gpt_entry * -alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) +static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, + gpt_header *gpt) { size_t count; gpt_entry *pte; - if (!bdev || !gpt) + + if (!gpt) return NULL; count = le32_to_cpu(gpt->num_partition_entries) * @@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) if (!pte) return NULL; - if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), + if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), (u8 *) pte, count) < count) { kfree(pte); @@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) /** * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk - * @bdev + * @state * @lba is the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @bdev. + * and fills a GPT header starting at @ from @state->bdev. * Note: remember to free gpt when finished with it. */ -static gpt_header * -alloc_read_gpt_header(struct block_device *bdev, u64 lba) +static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, + u64 lba) { gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(bdev); - - if (!bdev) - return NULL; + unsigned ssz = bdev_logical_block_size(state->bdev); gpt = kzalloc(ssz, GFP_KERNEL); if (!gpt) return NULL; - if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { + if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { kfree(gpt); gpt=NULL; return NULL; @@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) /** * is_gpt_valid() - tests one GPT header and PTEs for validity - * @bdev + * @state * @lba is the logical block address of the GPT header to test * @gpt is a GPT header ptr, filled on return. * @ptes is a PTEs ptr, filled on return. @@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba) * Description: returns 1 if valid, 0 on error. * If valid, returns pointers to newly allocated GPT header and PTEs. */ -static int -is_gpt_valid(struct block_device *bdev, u64 lba, - gpt_header **gpt, gpt_entry **ptes) +static int is_gpt_valid(struct parsed_partitions *state, u64 lba, + gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; u64 lastlba; - if (!bdev || !gpt || !ptes) + if (!ptes) return 0; - if (!(*gpt = alloc_read_gpt_header(bdev, lba))) + if (!(*gpt = alloc_read_gpt_header(state, lba))) return 0; /* Check the GUID Partition Table signature */ @@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, /* Check the first_usable_lba and last_usable_lba are * within the disk. */ - lastlba = last_lba(bdev); + lastlba = last_lba(state->bdev); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), @@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba, goto fail; } - if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt))) + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ @@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) /** * find_valid_gpt() - Search disk for valid GPT headers and PTEs - * @bdev + * @state * @gpt is a GPT header ptr, filled on return. * @ptes is a PTEs ptr, filled on return. * Description: Returns 1 if valid, 0 on error. @@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) * This protects against devices which misreport their size, and forces * the user to decide to use the Alternate GPT. */ -static int -find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) +static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, + gpt_entry **ptes) { int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; u64 lastlba; - if (!bdev || !gpt || !ptes) + + if (!ptes) return 0; - lastlba = last_lba(bdev); + lastlba = last_lba(state->bdev); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. */ legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); if (legacymbr) { - read_lba(bdev, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); + read_lba(state, 0, (u8 *) legacymbr, + sizeof (*legacymbr)); good_pmbr = is_pmbr_valid(legacymbr); kfree(legacymbr); } @@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) goto fail; } - good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA, + good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); if (good_pgpt) - good_agpt = is_gpt_valid(bdev, + good_agpt = is_gpt_valid(state, le64_to_cpu(pgpt->alternate_lba), &agpt, &aptes); if (!good_agpt && force_gpt) - good_agpt = is_gpt_valid(bdev, lastlba, - &agpt, &aptes); + good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) @@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) } /** - * efi_partition(struct parsed_partitions *state, struct block_device *bdev) + * efi_partition(struct parsed_partitions *state) * @state - * @bdev * * Description: called from check.c, if the disk contains GPT * partitions, sets up partition entries in the kernel. @@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) * 1 if successful * */ -int -efi_partition(struct parsed_partitions *state, struct block_device *bdev) +int efi_partition(struct parsed_partitions *state) { gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; - unsigned ssz = bdev_logical_block_size(bdev) / 512; + unsigned ssz = bdev_logical_block_size(state->bdev) / 512; - if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { + if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); kfree(ptes); return 0; @@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - if (!is_pte_valid(&ptes[i], last_lba(bdev))) + if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) continue; put_partition(state, i+1, start * ssz, size * ssz); @@ -631,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) /* If this is a RAID volume, tell md */ if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) - state->parts[i+1].flags = 1; + state->parts[i + 1].flags = ADDPART_FLAG_RAID; } kfree(ptes); kfree(gpt); diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h index 6998b589abf..b69ab729558 100644 --- a/fs/partitions/efi.h +++ b/fs/partitions/efi.h @@ -110,7 +110,7 @@ typedef struct _legacy_mbr { } __attribute__ ((packed)) legacy_mbr; /* Functions */ -extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int efi_partition(struct parsed_partitions *state); #endif diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index fc71aab0846..3e73de5967f 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) { /* */ -int -ibm_partition(struct parsed_partitions *state, struct block_device *bdev) +int ibm_partition(struct parsed_partitions *state) { + struct block_device *bdev = state->bdev; int blocksize, res; loff_t i_size, offset, size, fmt_size; dasd_information2_t *info; @@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) /* * Get volume label, extract name and type. */ - data = read_dev_sector(bdev, info->label_block*(blocksize/512), §); + data = read_part_sector(state, info->label_block*(blocksize/512), + §); if (data == NULL) goto out_readerr; @@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) */ blk = cchhb2blk(&label->vol.vtoc, geo) + 1; counter = 0; - data = read_dev_sector(bdev, blk * (blocksize/512), - §); + data = read_part_sector(state, blk * (blocksize/512), + §); while (data != NULL) { struct vtoc_format1_label f1; @@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) || f1.DS1FMTID == _ascebc['7'] || f1.DS1FMTID == _ascebc['9']) { blk++; - data = read_dev_sector(bdev, blk * - (blocksize/512), - §); + data = read_part_sector(state, + blk * (blocksize/512), §); continue; } @@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) size * (blocksize >> 9)); counter++; blk++; - data = read_dev_sector(bdev, - blk * (blocksize/512), - §); + data = read_part_sector(state, + blk * (blocksize/512), §); } if (!data) diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h index 31f85a6ac45..08fb0804a81 100644 --- a/fs/partitions/ibm.h +++ b/fs/partitions/ibm.h @@ -1 +1 @@ -int ibm_partition(struct parsed_partitions *, struct block_device *); +int ibm_partition(struct parsed_partitions *); diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c index 176d89bcf12..1cc928bb762 100644 --- a/fs/partitions/karma.c +++ b/fs/partitions/karma.c @@ -9,7 +9,7 @@ #include "check.h" #include "karma.h" -int karma_partition(struct parsed_partitions *state, struct block_device *bdev) +int karma_partition(struct parsed_partitions *state) { int i; int slot = 1; @@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev) } __attribute__((packed)) *label; struct d_partition *p; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h index ecf7d3f2a3d..c764b2e9df2 100644 --- a/fs/partitions/karma.h +++ b/fs/partitions/karma.h @@ -4,5 +4,5 @@ #define KARMA_LABEL_MAGIC 0xAB56 -int karma_partition(struct parsed_partitions *state, struct block_device *bdev); +int karma_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 8652fb99e96..3ceca05b668 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1, /** * ldm_validate_privheads - Compare the primary privhead with its backups - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * @ph1: Memory struct to fill with ph contents * * Read and compare all three privheads from disk. @@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1, * Return: 'true' Success * 'false' Error */ -static bool ldm_validate_privheads (struct block_device *bdev, - struct privhead *ph1) +static bool ldm_validate_privheads(struct parsed_partitions *state, + struct privhead *ph1) { static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; struct privhead *ph[3] = { ph1 }; @@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev, long num_sects; int i; - BUG_ON (!bdev || !ph1); + BUG_ON (!state || !ph1); ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); @@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev, /* Read and parse privheads */ for (i = 0; i < 3; i++) { - data = read_dev_sector (bdev, - ph[0]->config_start + off[i], §); + data = read_part_sector(state, ph[0]->config_start + off[i], + §); if (!data) { ldm_crit ("Disk read failed."); goto out; @@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev, } } - num_sects = bdev->bd_inode->i_size >> 9; + num_sects = state->bdev->bd_inode->i_size >> 9; if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { @@ -397,20 +397,20 @@ out: /** * ldm_validate_tocblocks - Validate the table of contents and its backups - * @bdev: Device holding the LDM Database - * @base: Offset, into @bdev, of the database + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on - * @bdev and return the parsed information into @toc1. + * @state->bdev and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * * Return: 'true' @toc1 contains validated TOCBLOCK info * 'false' @toc1 contents are undefined */ -static bool ldm_validate_tocblocks(struct block_device *bdev, - unsigned long base, struct ldmdb *ldb) +static bool ldm_validate_tocblocks(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) { static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; struct tocblock *tb[4]; @@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev, int i, nr_tbs; bool result = false; - BUG_ON(!bdev || !ldb); + BUG_ON(!state || !ldb); ph = &ldb->ph; tb[0] = &ldb->toc; tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); @@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev, * skip any that fail as long as we get at least one valid TOCBLOCK. */ for (nr_tbs = i = 0; i < 4; i++) { - data = read_dev_sector(bdev, base + off[i], §); + data = read_part_sector(state, base + off[i], §); if (!data) { ldm_error("Disk read failed for TOCBLOCK %d.", i); continue; @@ -473,7 +473,7 @@ err: /** * ldm_validate_vmdb - Read the VMDB and validate it - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * @base: Offset, into @bdev, of the database * @ldb: Cache of the database structures * @@ -483,8 +483,8 @@ err: * Return: 'true' @ldb contains validated VBDB info * 'false' @ldb contents are undefined */ -static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, - struct ldmdb *ldb) +static bool ldm_validate_vmdb(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) { Sector sect; u8 *data; @@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, struct vmdb *vm; struct tocblock *toc; - BUG_ON (!bdev || !ldb); + BUG_ON (!state || !ldb); vm = &ldb->vm; toc = &ldb->toc; - data = read_dev_sector (bdev, base + OFF_VMDB, §); + data = read_part_sector(state, base + OFF_VMDB, §); if (!data) { ldm_crit ("Disk read failed."); return false; @@ -534,21 +534,21 @@ out: /** * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * * This function provides a weak test to decide whether the device is a dynamic * disk or not. It looks for an MS-DOS-style partition table containing at * least one partition of type 0x42 (formerly SFS, now used by Windows for * dynamic disks). * - * N.B. The only possible error can come from the read_dev_sector and that is + * N.B. The only possible error can come from the read_part_sector and that is * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. * - * Return: 'true' @bdev is a dynamic disk - * 'false' @bdev is not a dynamic disk, or an error occurred + * Return: 'true' @state->bdev is a dynamic disk + * 'false' @state->bdev is not a dynamic disk, or an error occurred */ -static bool ldm_validate_partition_table (struct block_device *bdev) +static bool ldm_validate_partition_table(struct parsed_partitions *state) { Sector sect; u8 *data; @@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev) int i; bool result = false; - BUG_ON (!bdev); + BUG_ON(!state); - data = read_dev_sector (bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) { ldm_crit ("Disk read failed."); return false; @@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory - * @bdev: Device holding the LDM Database - * @base: Offset, into @bdev, of the database + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, @@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) * Return: 'true' All the VBLKs were read successfully * 'false' An error occurred */ -static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, - struct ldmdb *ldb) +static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, + struct ldmdb *ldb) { int size, perbuf, skip, finish, s, v, recs; u8 *data = NULL; @@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, bool result = false; LIST_HEAD (frags); - BUG_ON (!bdev || !ldb); + BUG_ON(!state || !ldb); size = ldb->vm.vblk_size; perbuf = 512 / size; @@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, finish = (size * ldb->vm.last_vblk_seq) >> 9; for (s = skip; s < finish; s++) { /* For each sector */ - data = read_dev_sector (bdev, base + OFF_VMDB + s, §); + data = read_part_sector(state, base + OFF_VMDB + s, §); if (!data) { ldm_crit ("Disk read failed."); goto out; @@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh) /** * ldm_partition - Find out whether a device is a dynamic disk and handle it - * @pp: List of the partitions parsed so far - * @bdev: Device holding the LDM Database + * @state: Partition check state including device holding the LDM Database * * This determines whether the device @bdev is a dynamic disk and if so creates * the partitions necessary in the gendisk structure pointed to by @hd. @@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh) * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. * - * Return: 1 Success, @bdev is a dynamic disk and we handled it - * 0 Success, @bdev is not a dynamic disk + * Return: 1 Success, @state->bdev is a dynamic disk and we handled it + * 0 Success, @state->bdev is not a dynamic disk * -1 An error occurred before enough information had been read - * Or @bdev is a dynamic disk, but it may be corrupted + * Or @state->bdev is a dynamic disk, but it may be corrupted */ -int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) +int ldm_partition(struct parsed_partitions *state) { struct ldmdb *ldb; unsigned long base; int result = -1; - BUG_ON (!pp || !bdev); + BUG_ON(!state); /* Look for signs of a Dynamic Disk */ - if (!ldm_validate_partition_table (bdev)) + if (!ldm_validate_partition_table(state)) return 0; ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); @@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) } /* Parse and check privheads. */ - if (!ldm_validate_privheads (bdev, &ldb->ph)) + if (!ldm_validate_privheads(state, &ldb->ph)) goto out; /* Already logged */ /* All further references are relative to base (database start). */ base = ldb->ph.config_start; /* Parse and check tocs and vmdb. */ - if (!ldm_validate_tocblocks (bdev, base, ldb) || - !ldm_validate_vmdb (bdev, base, ldb)) + if (!ldm_validate_tocblocks(state, base, ldb) || + !ldm_validate_vmdb(state, base, ldb)) goto out; /* Already logged */ /* Initialize vblk lists in ldmdb struct */ @@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) INIT_LIST_HEAD (&ldb->v_comp); INIT_LIST_HEAD (&ldb->v_part); - if (!ldm_get_vblks (bdev, base, ldb)) { + if (!ldm_get_vblks(state, base, ldb)) { ldm_crit ("Failed to read the VBLKs from the database."); goto cleanup; } /* Finally, create the data partition devices. */ - if (ldm_create_data_partitions (pp, ldb)) { + if (ldm_create_data_partitions(state, ldb)) { ldm_debug ("Parsed LDM database successfully."); result = 1; } diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h index 30e08e809c1..d1fb50b28d8 100644 --- a/fs/partitions/ldm.h +++ b/fs/partitions/ldm.h @@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */ struct list_head v_part; }; -int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); +int ldm_partition(struct parsed_partitions *state); #endif /* _FS_PT_LDM_H_ */ diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c index d4a0fad3563..74465ff7c26 100644 --- a/fs/partitions/mac.c +++ b/fs/partitions/mac.c @@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len) stg[i] = 0; } -int mac_partition(struct parsed_partitions *state, struct block_device *bdev) +int mac_partition(struct parsed_partitions *state) { int slot = 1; Sector sect; @@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) struct mac_driver_desc *md; /* Get 0th block and look at the first partition map entry. */ - md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, §); + md = read_part_sector(state, 0, §); if (!md) return -1; if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { @@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); - data = read_dev_sector(bdev, secsize/512, §); + data = read_part_sector(state, secsize/512, §); if (!data) return -1; part = (struct mac_partition *) (data + secsize%512); @@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) for (blk = 1; blk <= blocks_in_map; ++blk) { int pos = blk * secsize; put_dev_sector(sect); - data = read_dev_sector(bdev, pos/512, §); + data = read_part_sector(state, pos/512, §); if (!data) return -1; part = (struct mac_partition *) (data + pos%512); @@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) be32_to_cpu(part->block_count) * (secsize/512)); if (!strnicmp(part->type, "Linux_RAID", 10)) - state->parts[slot].flags = 1; + state->parts[slot].flags = ADDPART_FLAG_RAID; #ifdef CONFIG_PPC_PMAC /* * If this is the first bootable partition, tell the @@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) - note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); + note_bootable_part(state->bdev->bd_dev, found_root, + found_root_goodness); #endif put_dev_sector(sect); diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h index bbf26e1386f..3c7d9843638 100644 --- a/fs/partitions/mac.h +++ b/fs/partitions/mac.h @@ -41,4 +41,4 @@ struct mac_driver_desc { /* ... more stuff */ }; -int mac_partition(struct parsed_partitions *state, struct block_device *bdev); +int mac_partition(struct parsed_partitions *state); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 90be97f1f5a..15bfb7b1e04 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p) #define AIX_LABEL_MAGIC2 0xC2 #define AIX_LABEL_MAGIC3 0xD4 #define AIX_LABEL_MAGIC4 0xC1 -static int aix_magic_present(unsigned char *p, struct block_device *bdev) +static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { struct partition *pt = (struct partition *) (p + 0x1be); Sector sect; @@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev) is_extended_partition(pt)) return 0; } - d = read_dev_sector(bdev, 7, §); + d = read_part_sector(state, 7, §); if (d) { if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; @@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev) * only for the actual data partitions. */ -static void -parse_extended(struct parsed_partitions *state, struct block_device *bdev, - sector_t first_sector, sector_t first_size) +static void parse_extended(struct parsed_partitions *state, + sector_t first_sector, sector_t first_size) { struct partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(bdev) / 512; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; int loopct = 0; /* number of links followed without finding a data partition */ int i; @@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, return; if (state->next == state->limit) return; - data = read_dev_sector(bdev, this_sector, §); + data = read_part_sector(state, this_sector, §); if (!data) return; @@ -198,9 +197,8 @@ done: /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. */ -static void -parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_solaris_x86(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_SOLARIS_X86_PARTITION Sector sect; @@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, int i; short max_nparts; - v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, §); + v = read_part_sector(state, offset + 1, §); if (!v) return; if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { @@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ -static void -parse_bsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin, char *flavour, - int max_partitions) +static void parse_bsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin, char *flavour, + int max_partitions) { Sector sect; struct bsd_disklabel *l; struct bsd_partition *p; - l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, §); + l = read_part_sector(state, offset + 1, §); if (!l) return; if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { @@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev, } #endif -static void -parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_freebsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "bsd", BSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); #endif } -static void -parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_netbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "netbsd", BSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); #endif } -static void -parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_openbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL - parse_bsd(state, bdev, offset, size, origin, - "openbsd", OPENBSD_MAXPARTITIONS); + parse_bsd(state, offset, size, origin, "openbsd", + OPENBSD_MAXPARTITIONS); #endif } @@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ -static void -parse_unixware(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_unixware(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_UNIXWARE_DISKLABEL Sector sect; struct unixware_disklabel *l; struct unixware_slice *p; - l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, §); + l = read_part_sector(state, offset + 29, §); if (!l) return; if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || @@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev, * Anand Krishnamurthy <anandk@wiproge.med.ge.com> * Rajeev V. Pillai <rajeevvp@yahoo.com> */ -static void -parse_minix(struct parsed_partitions *state, struct block_device *bdev, - sector_t offset, sector_t size, int origin) +static void parse_minix(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) { #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; @@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev, struct partition *p; int i; - data = read_dev_sector(bdev, offset, §); + data = read_part_sector(state, offset, §); if (!data) return; @@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev, static struct { unsigned char id; - void (*parse)(struct parsed_partitions *, struct block_device *, - sector_t, sector_t, int); + void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); } subtypes[] = { {FREEBSD_PARTITION, parse_freebsd}, {NETBSD_PARTITION, parse_netbsd}, @@ -417,16 +406,16 @@ static struct { {0, NULL}, }; -int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) +int msdos_partition(struct parsed_partitions *state) { - sector_t sector_size = bdev_logical_block_size(bdev) / 512; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; Sector sect; unsigned char *data; struct partition *p; struct fat_boot_sector *fb; int slot; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; if (!msdos_magic_present(data + 510)) { @@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) return 0; } - if (aix_magic_present(data, bdev)) { + if (aix_magic_present(state, data)) { put_dev_sector(sect); printk( " [AIX]"); return 0; @@ -503,13 +492,13 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) put_partition(state, slot, start, n); printk(" <"); - parse_extended(state, bdev, start, size); + parse_extended(state, start, size); printk(" >"); continue; } put_partition(state, slot, start, size); if (SYS_IND(p) == LINUX_RAID_PARTITION) - state->parts[slot].flags = 1; + state->parts[slot].flags = ADDPART_FLAG_RAID; if (SYS_IND(p) == DM6_PARTITION) printk("[DM]"); if (SYS_IND(p) == EZD_PARTITION) @@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) if (!subtypes[n].parse) continue; - subtypes[n].parse(state, bdev, start_sect(p)*sector_size, - nr_sects(p)*sector_size, slot); + subtypes[n].parse(state, start_sect(p) * sector_size, + nr_sects(p) * sector_size, slot); } put_dev_sector(sect); return 1; diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h index 01e5e0b6902..38c781c490b 100644 --- a/fs/partitions/msdos.h +++ b/fs/partitions/msdos.h @@ -4,5 +4,5 @@ #define MSDOS_LABEL_MAGIC 0xAA55 -int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); +int msdos_partition(struct parsed_partitions *state); diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index c05c17bc5df..fc22b85d436 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -10,7 +10,7 @@ #include "check.h" #include "osf.h" -int osf_partition(struct parsed_partitions *state, struct block_device *bdev) +int osf_partition(struct parsed_partitions *state) { int i; int slot = 1; @@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev) } * label; struct d_partition * partition; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h index 427b8eab314..20ed2315ec1 100644 --- a/fs/partitions/osf.h +++ b/fs/partitions/osf.h @@ -4,4 +4,4 @@ #define DISKLABELMAGIC (0x82564557UL) -int osf_partition(struct parsed_partitions *state, struct block_device *bdev); +int osf_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c index ed5ac83fe83..43b1df9aa16 100644 --- a/fs/partitions/sgi.c +++ b/fs/partitions/sgi.c @@ -27,7 +27,7 @@ struct sgi_disklabel { __be32 _unused1; /* Padding */ }; -int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) +int sgi_partition(struct parsed_partitions *state) { int i, csum; __be32 magic; @@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) struct sgi_partition *p; char b[BDEVNAME_SIZE]; - label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, §); + label = read_part_sector(state, 0, §); if (!label) return -1; p = &label->partitions[0]; @@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(bdev, b)); + bdevname(state->bdev, b)); put_dev_sector(sect); return 0; } diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h index 5d5595c0992..b9553ebdd5a 100644 --- a/fs/partitions/sgi.h +++ b/fs/partitions/sgi.h @@ -2,7 +2,7 @@ * fs/partitions/sgi.h */ -extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int sgi_partition(struct parsed_partitions *state); #define SGI_LABEL_MAGIC 0x0be5a941 diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c index c95e6a62c01..a32660e25f7 100644 --- a/fs/partitions/sun.c +++ b/fs/partitions/sun.c @@ -10,7 +10,7 @@ #include "check.h" #include "sun.h" -int sun_partition(struct parsed_partitions *state, struct block_device *bdev) +int sun_partition(struct parsed_partitions *state) { int i; __be16 csum; @@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) int use_vtoc; int nparts; - label = (struct sun_disklabel *)read_dev_sector(bdev, 0, §); + label = read_part_sector(state, 0, §); if (!label) return -1; @@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(bdev, b)); + bdevname(state->bdev, b)); put_dev_sector(sect); return 0; } diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h index 7f864d1f86d..2424baa8319 100644 --- a/fs/partitions/sun.h +++ b/fs/partitions/sun.h @@ -5,4 +5,4 @@ #define SUN_LABEL_MAGIC 0xDABE #define SUN_VTOC_SANITY 0x600DDEEE -int sun_partition(struct parsed_partitions *state, struct block_device *bdev); +int sun_partition(struct parsed_partitions *state); diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c index 4eba27b7864..9030c864428 100644 --- a/fs/partitions/sysv68.c +++ b/fs/partitions/sysv68.c @@ -46,7 +46,7 @@ struct slice { }; -int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) +int sysv68_partition(struct parsed_partitions *state) { int i, slices; int slot = 1; @@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) struct dkblk0 *b; struct slice *slice; - data = read_dev_sector(bdev, 0, §); + data = read_part_sector(state, 0, §); if (!data) return -1; @@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) i = be32_to_cpu(b->dk_ios.ios_slcblk); put_dev_sector(sect); - data = read_dev_sector(bdev, i, §); + data = read_part_sector(state, i, §); if (!data) return -1; diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h index fa733f68431..bf2f5ffa97a 100644 --- a/fs/partitions/sysv68.h +++ b/fs/partitions/sysv68.h @@ -1 +1 @@ -extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); +extern int sysv68_partition(struct parsed_partitions *state); diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c index ec852c11dce..db9eef26036 100644 --- a/fs/partitions/ultrix.c +++ b/fs/partitions/ultrix.c @@ -9,7 +9,7 @@ #include "check.h" #include "ultrix.h" -int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) +int ultrix_partition(struct parsed_partitions *state) { int i; Sector sect; @@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) #define PT_MAGIC 0x032957 /* Partition magic number */ #define PT_VALID 1 /* Indicates if struct is valid */ - data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, §); + data = read_part_sector(state, (16384 - sizeof(*label))/512, §); if (!data) return -1; diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h index a74bf8e2d37..a3cc00b2bde 100644 --- a/fs/partitions/ultrix.h +++ b/fs/partitions/ultrix.h @@ -2,4 +2,4 @@ * fs/partitions/ultrix.h */ -int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); +int ultrix_partition(struct parsed_partitions *state); diff --git a/fs/pipe.c b/fs/pipe.c index 37ba29ff315..d79872eba09 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/log2.h> #include <linux/mount.h> #include <linux/pipe_fs_i.h> #include <linux/uio.h> @@ -18,11 +19,18 @@ #include <linux/pagemap.h> #include <linux/audit.h> #include <linux/syscalls.h> +#include <linux/fcntl.h> #include <asm/uaccess.h> #include <asm/ioctls.h> /* + * The max size that a non-root user is allowed to grow the pipe. Can + * be set by root in /proc/sys/fs/pipe-max-pages + */ +unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; + +/* * We use a start+len construction, which provides full use of the * allocated memory. * -- Florian Coosmann (FGC) @@ -390,7 +398,7 @@ redo: if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); + curbuf = (curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = curbuf; pipe->nrbufs = --bufs; do_wakeup = 1; @@ -472,7 +480,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (pipe->nrbufs && chars != 0) { int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & - (PIPE_BUFFERS-1); + (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + lastbuf; const struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; @@ -518,8 +526,8 @@ redo1: break; } bufs = pipe->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + if (bufs < pipe->buffers) { + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; char *src; @@ -580,7 +588,7 @@ redo2: if (!total_len) break; } - if (bufs < PIPE_BUFFERS) + if (bufs < pipe->buffers) continue; if (filp->f_flags & O_NONBLOCK) { if (!ret) @@ -640,7 +648,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { count += pipe->bufs[buf].len; - buf = (buf+1) & (PIPE_BUFFERS-1); + buf = (buf+1) & (pipe->buffers - 1); } mutex_unlock(&inode->i_mutex); @@ -671,7 +679,7 @@ pipe_poll(struct file *filp, poll_table *wait) } if (filp->f_mode & FMODE_WRITE) { - mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; /* * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). @@ -877,25 +885,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - init_waitqueue_head(&pipe->wait); - pipe->r_counter = pipe->w_counter = 1; - pipe->inode = inode; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + pipe->buffers = PIPE_DEF_BUFFERS; + return pipe; + } + kfree(pipe); } - return pipe; + return NULL; } void __free_pipe_info(struct pipe_inode_info *pipe) { int i; - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) buf->ops->release(pipe, buf); } if (pipe->tmp_page) __free_page(pipe->tmp_page); + kfree(pipe->bufs); kfree(pipe); } @@ -1094,6 +1109,89 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) } /* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +{ + struct pipe_buffer *bufs; + + /* + * Must be a power-of-2 currently + */ + if (!is_power_of_2(arg)) + return -EINVAL; + + /* + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't + * expect a lot of shrink+grow operations, just free and allocate + * again like we would do for growing. If the pipe currently + * contains more buffers than arg, then return busy. + */ + if (arg < pipe->nrbufs) + return -EBUSY; + + bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); + if (unlikely(!bufs)) + return -ENOMEM; + + /* + * The pipe array wraps around, so just start the new one at zero + * and adjust the indexes. + */ + if (pipe->nrbufs) { + const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1); + const unsigned int head = pipe->nrbufs - tail; + + if (head) + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); + if (tail) + memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer)); + } + + pipe->curbuf = 0; + kfree(pipe->bufs); + pipe->bufs = bufs; + pipe->buffers = arg; + return arg; +} + +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pipe_inode_info *pipe; + long ret; + + pipe = file->f_path.dentry->d_inode->i_pipe; + if (!pipe) + return -EBADF; + + mutex_lock(&pipe->inode->i_mutex); + + switch (cmd) { + case F_SETPIPE_SZ: + if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) + return -EINVAL; + /* + * The pipe needs to be at least 2 pages large to + * guarantee POSIX behaviour. + */ + if (arg < 2) + return -EINVAL; + ret = pipe_set_size(pipe, arg); + break; + case F_GETPIPE_SZ: + ret = pipe->buffers; + break; + default: + ret = -EINVAL; + break; + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + +/* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need * any operations on the root directory. However, we need a non-trivial diff --git a/fs/quota/quota.c b/fs/quota/quota.c index cfc78826da9..ce3dfd066f5 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, return security_quotactl(cmd, type, id, sb); } +static void quota_sync_one(struct super_block *sb, void *arg) +{ + if (sb->s_qcop && sb->s_qcop->quota_sync) + sb->s_qcop->quota_sync(sb, *(int *)arg, 1); +} + static int quota_sync_all(int type) { - struct super_block *sb; int ret; if (type >= MAXQUOTAS) return -EINVAL; ret = security_quotactl(Q_SYNC, type, 0, NULL); - if (ret) - return ret; - - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_qcop || !sb->s_qcop->quota_sync) - continue; - - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - sb->s_qcop->quota_sync(sb, type, 1); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); - - return 0; + if (!ret) + iterate_supers(quota_sync_one, &type); + return ret; } static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index f47cd212dee..a5ebae70dc6 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -52,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = { BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, }; -struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) +struct inode *ramfs_get_inode(struct super_block *sb, + const struct inode *dir, int mode, dev_t dev) { struct inode * inode = new_inode(sb); if (inode) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); @@ -95,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) static int ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); + struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); int error = -ENOSPC; if (inode) { - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; @@ -130,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * struct inode *inode; int error = -ENOSPC; - inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); + inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); if (!error) { - if (dir->i_mode & S_ISGID) - inode->i_gid = dir->i_gid; d_instantiate(dentry, inode); dget(dentry); dir->i_mtime = dir->i_ctime = CURRENT_TIME; @@ -241,7 +233,7 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ramfs_ops; sb->s_time_gran = 1; - inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); + inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); if (!inode) { err = -ENOMEM; goto fail; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1d9c12714c5..9977df9f3a5 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -147,7 +147,8 @@ static int reiserfs_sync_file(struct file *filp, barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); if (barrier_done < 0) return barrier_done; return (err < 0) ? -EIO : 0; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index d0c43cb99ff..ee78d4a0086 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -561,23 +561,13 @@ static int drop_new_inode(struct inode *inode) */ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) { - - /* the quota init calls have to know who to charge the quota to, so - ** we have to set uid and gid here - */ - inode->i_uid = current_fsuid(); - inode->i_mode = mode; /* Make inode invalid - just in case we are going to drop it before * the initialization happens */ INODE_PKEY(inode)->k_objectid = 0; - - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } else { - inode->i_gid = current_fsgid(); - } + /* the quota init calls have to know who to charge the quota to, so + ** we have to set uid and gid here + */ + inode_init_owner(inode, dir, mode); dquot_initialize(inode); return 0; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e7cc00e636d..8c4cf273c67 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -723,11 +723,11 @@ out: (handler) = *(handlers)++) /* This is the implementation for the xattr plugin infrastructure */ -static inline struct xattr_handler * -find_xattr_handler_prefix(struct xattr_handler **handlers, +static inline const struct xattr_handler * +find_xattr_handler_prefix(const struct xattr_handler **handlers, const char *name) { - struct xattr_handler *xah; + const struct xattr_handler *xah; if (!handlers) return NULL; @@ -748,7 +748,7 @@ ssize_t reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, size_t size) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); @@ -767,7 +767,7 @@ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); @@ -784,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) @@ -807,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen, size_t size; if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); if (!handler) /* Unsupported xattr name */ @@ -920,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; } #endif /* Actual operations that are exported to VFS-land */ -struct xattr_handler *reiserfs_xattr_handlers[] = { +const struct xattr_handler *reiserfs_xattr_handlers[] = { #ifdef CONFIG_REISERFS_FS_XATTR &reiserfs_xattr_user_handler, &reiserfs_xattr_trusted_handler, diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 9cdb759645a..536d697a8a2 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -500,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list, return size; } -struct xattr_handler reiserfs_posix_acl_access_handler = { +const struct xattr_handler reiserfs_posix_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = posix_acl_get, @@ -520,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list, return size; } -struct xattr_handler reiserfs_posix_acl_default_handler = { +const struct xattr_handler reiserfs_posix_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = posix_acl_get, diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 7271a477c04..237c6928d3c 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -111,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec) sec->value = NULL; } -struct xattr_handler reiserfs_xattr_security_handler = { +const struct xattr_handler reiserfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = security_get, .set = security_set, diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 5b08aaca3da..9883736ce3e 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, return len; } -struct xattr_handler reiserfs_xattr_trusted_handler = { +const struct xattr_handler reiserfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = trusted_get, .set = trusted_set, diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 75d59c49b91..45ae1a00013 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size, return len; } -struct xattr_handler reiserfs_xattr_user_handler = { +const struct xattr_handler reiserfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .get = user_get, .set = user_set, diff --git a/fs/splice.c b/fs/splice.c index 9313b6124a2..ac22b00d86c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, break; } - if (pipe->nrbufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; buf->page = spd->pages[page_nr]; @@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, if (!--spd->nr_pages) break; - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) continue; break; @@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) page_cache_release(spd->pages[i]); } +/* + * Check if we need to grow the arrays holding pages and partial page + * descriptions. + */ +int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return 0; + + spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; + + kfree(spd->pages); + kfree(spd->partial); + return -ENOMEM; +} + +void splice_shrink_spd(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); + kfree(spd->partial); +} + static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, { struct address_space *mapping = in->f_mapping; unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; @@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); + nr_pages = min(req_pages, pipe->buffers); /* * Lookup the (hopefully) full range of pages we need. */ - spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); index += spd.nr_pages; /* @@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unlock_page(page); } - pages[spd.nr_pages++] = page; + spd.pages[spd.nr_pages++] = page; index++; } @@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * this_len is the max we'll use from this page */ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); - page = pages[page_nr]; + page = spd.pages[page_nr]; if (PageReadahead(page)) page_cache_async_readahead(mapping, &in->f_ra, in, @@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = -ENOMEM; break; } - page_cache_release(pages[page_nr]); - pages[page_nr] = page; + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; } /* * page was already under io and is now done, great @@ -451,8 +484,8 @@ fill_it: len = this_len; } - partial[page_nr].offset = loff; - partial[page_nr].len = this_len; + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; len -= this_len; loff = 0; spd.nr_pages++; @@ -464,12 +497,13 @@ fill_it: * we got, 'nr_pages' is how many pages are in the map. */ while (page_nr < nr_pages) - page_cache_release(pages[page_nr++]); + page_cache_release(spd.pages[page_nr++]); in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; if (spd.nr_pages) - return splice_to_pipe(pipe, &spd); + error = splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); return error; } @@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, unsigned int nr_pages; unsigned int nr_freed; size_t offset; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; - struct iovec vec[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; pgoff_t index; ssize_t res; size_t this_len; @@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + res = -ENOMEM; + vec = __vec; + if (pipe->buffers > PIPE_DEF_BUFFERS) { + vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); + if (!vec) + goto shrink_ret; + } + index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { struct page *page; page = alloc_page(GFP_USER); @@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; - pages[i] = page; + spd.pages[i] = page; spd.nr_pages++; len -= this_len; offset = 0; @@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { this_len = min_t(size_t, vec[i].iov_len, res); - partial[i].offset = 0; - partial[i].len = this_len; + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; if (!this_len) { - __free_page(pages[i]); - pages[i] = NULL; + __free_page(spd.pages[i]); + spd.pages[i] = NULL; nr_freed++; } res -= this_len; @@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, if (res > 0) *ppos += res; +shrink_ret: + if (vec != __vec) + kfree(vec); + splice_shrink_spd(pipe, &spd); return res; err: for (i = 0; i < spd.nr_pages; i++) - __free_page(pages[i]); + __free_page(spd.pages[i]); - return error; + res = error; + goto shrink_ret; } EXPORT_SYMBOL(default_file_splice_read); @@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->inode) sd->need_wakeup = true; @@ -1211,7 +1261,7 @@ out_release: * If we did an incomplete transfer we must release * the pipe buffers in question: */ - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) { @@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, */ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, int aligned) + struct partial_page *partial, int aligned, + unsigned int pipe_buffers) { int buffers = 0, error = 0; @@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, break; npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (npages > PIPE_BUFFERS - buffers) - npages = PIPE_BUFFERS - buffers; + if (npages > pipe_buffers - buffers) + npages = pipe_buffers - buffers; error = get_user_pages_fast((unsigned long)base, npages, 0, &pages[buffers]); @@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, * or if we mapped the max number of pages that we have * room for. */ - if (error < npages || buffers == PIPE_BUFFERS) + if (error < npages || buffers == pipe_buffers) break; nr_vecs--; @@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, .ops = &user_page_pipe_buf_ops, .spd_release = spd_release_page, }; + long ret; pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) return -EBADF; - spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, - flags & SPLICE_F_GIFT); + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, flags & SPLICE_F_GIFT, + pipe->buffers); if (spd.nr_pages <= 0) - return spd.nr_pages; + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); - return splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); + return ret; } /* @@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check ->nrbufs without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) return 0; ret = 0; pipe_lock(pipe); - while (pipe->nrbufs >= PIPE_BUFFERS) { + while (pipe->nrbufs >= pipe->buffers) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; @@ -1810,7 +1869,7 @@ retry: * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ - if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { /* Already processed some buffers, break */ if (ret) break; @@ -1831,7 +1890,7 @@ retry: } ibuf = ipipe->bufs + ipipe->curbuf; - nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); obuf = opipe->bufs + nbuf; if (len >= ibuf->len) { @@ -1841,7 +1900,7 @@ retry: *obuf = *ibuf; ibuf->ops = NULL; opipe->nrbufs++; - ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); ipipe->nrbufs--; input_wakeup = true; } else { @@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * If we have iterated all input buffers or ran out of * output room, break. */ - if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) break; - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); /* * Get a reference to this pipe buffer, diff --git a/fs/statfs.c b/fs/statfs.c new file mode 100644 index 00000000000..4ef021f3b61 --- /dev/null +++ b/fs/statfs.c @@ -0,0 +1,196 @@ +#include <linux/syscalls.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <linux/statfs.h> +#include <linux/security.h> +#include <linux/uaccess.h> + +int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + int retval = -ENODEV; + + if (dentry) { + retval = -ENOSYS; + if (dentry->d_sb->s_op->statfs) { + memset(buf, 0, sizeof(*buf)); + retval = security_sb_statfs(dentry); + if (retval) + return retval; + retval = dentry->d_sb->s_op->statfs(dentry, buf); + if (retval == 0 && buf->f_frsize == 0) + buf->f_frsize = buf->f_bsize; + } + } + return retval; +} + +EXPORT_SYMBOL(vfs_statfs); + +static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) +{ + struct kstatfs st; + int retval; + + retval = vfs_statfs(dentry, &st); + if (retval) + return retval; + + if (sizeof(*buf) == sizeof(st)) + memcpy(buf, &st, sizeof(st)); + else { + if (sizeof buf->f_blocks == 4) { + if ((st.f_blocks | st.f_bfree | st.f_bavail | + st.f_bsize | st.f_frsize) & + 0xffffffff00000000ULL) + return -EOVERFLOW; + /* + * f_files and f_ffree may be -1; it's okay to stuff + * that into 32 bits + */ + if (st.f_files != -1 && + (st.f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (st.f_ffree != -1 && + (st.f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + + buf->f_type = st.f_type; + buf->f_bsize = st.f_bsize; + buf->f_blocks = st.f_blocks; + buf->f_bfree = st.f_bfree; + buf->f_bavail = st.f_bavail; + buf->f_files = st.f_files; + buf->f_ffree = st.f_ffree; + buf->f_fsid = st.f_fsid; + buf->f_namelen = st.f_namelen; + buf->f_frsize = st.f_frsize; + memset(buf->f_spare, 0, sizeof(buf->f_spare)); + } + return 0; +} + +static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) +{ + struct kstatfs st; + int retval; + + retval = vfs_statfs(dentry, &st); + if (retval) + return retval; + + if (sizeof(*buf) == sizeof(st)) + memcpy(buf, &st, sizeof(st)); + else { + buf->f_type = st.f_type; + buf->f_bsize = st.f_bsize; + buf->f_blocks = st.f_blocks; + buf->f_bfree = st.f_bfree; + buf->f_bavail = st.f_bavail; + buf->f_files = st.f_files; + buf->f_ffree = st.f_ffree; + buf->f_fsid = st.f_fsid; + buf->f_namelen = st.f_namelen; + buf->f_frsize = st.f_frsize; + memset(buf->f_spare, 0, sizeof(buf->f_spare)); + } + return 0; +} + +SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) +{ + struct path path; + int error; + + error = user_path(pathname, &path); + if (!error) { + struct statfs tmp; + error = vfs_statfs_native(path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_put(&path); + } + return error; +} + +SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) +{ + struct path path; + long error; + + if (sz != sizeof(*buf)) + return -EINVAL; + error = user_path(pathname, &path); + if (!error) { + struct statfs64 tmp; + error = vfs_statfs64(path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_put(&path); + } + return error; +} + +SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) +{ + struct file *file; + struct statfs tmp; + int error; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + error = vfs_statfs_native(file->f_path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +out: + return error; +} + +SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) +{ + struct file *file; + struct statfs64 tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + error = vfs_statfs64(file->f_path.dentry, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +out: + return error; +} + +SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) +{ + struct super_block *s; + struct ustat tmp; + struct kstatfs sbuf; + int err; + + s = user_get_super(new_decode_dev(dev)); + if (!s) + return -EINVAL; + + err = vfs_statfs(s->s_root, &sbuf); + drop_super(s); + if (err) + return err; + + memset(&tmp,0,sizeof(struct ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + + return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0; +} diff --git a/fs/super.c b/fs/super.c index 1527e6a0ee3..69688b15f1f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -22,23 +22,15 @@ #include <linux/module.h> #include <linux/slab.h> -#include <linux/init.h> -#include <linux/smp_lock.h> #include <linux/acct.h> #include <linux/blkdev.h> #include <linux/quotaops.h> -#include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> -#include <linux/syscalls.h> -#include <linux/vfs.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> -#include <linux/kobject.h> #include <linux/mutex.h> -#include <linux/file.h> #include <linux/backing-dev.h> -#include <asm/uaccess.h> #include "internal.h" @@ -93,9 +85,10 @@ static struct super_block *alloc_super(struct file_system_type *type) * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); - s->s_count = S_BIAS; + s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); + lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); mutex_init(&s->s_dquot.dqio_mutex); mutex_init(&s->s_dquot.dqonoff_mutex); init_rwsem(&s->s_dquot.dqptr_sem); @@ -127,39 +120,14 @@ static inline void destroy_super(struct super_block *s) /* Superblock refcounting */ /* - * Drop a superblock's refcount. Returns non-zero if the superblock was - * destroyed. The caller must hold sb_lock. + * Drop a superblock's refcount. The caller must hold sb_lock. */ -static int __put_super(struct super_block *sb) +void __put_super(struct super_block *sb) { - int ret = 0; - if (!--sb->s_count) { + list_del_init(&sb->s_list); destroy_super(sb); - ret = 1; } - return ret; -} - -/* - * Drop a superblock's refcount. - * Returns non-zero if the superblock is about to be destroyed and - * at least is already removed from super_blocks list, so if we are - * making a loop through super blocks then we need to restart. - * The caller must hold sb_lock. - */ -int __put_super_and_need_restart(struct super_block *sb) -{ - /* check for race with generic_shutdown_super() */ - if (list_empty(&sb->s_list)) { - /* super block is removed, need to restart... */ - __put_super(sb); - return 1; - } - /* can't be the last, since s_list is still in use */ - sb->s_count--; - BUG_ON(sb->s_count == 0); - return 0; } /** @@ -178,57 +146,48 @@ void put_super(struct super_block *sb) /** - * deactivate_super - drop an active reference to superblock + * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * - * Drops an active reference to superblock, acquiring a temprory one if - * there is no active references left. In that case we lock superblock, + * Drops an active reference to superblock, converting it into a temprory + * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. + * + * Caller holds exclusive lock on superblock; that lock is released. */ -void deactivate_super(struct super_block *s) +void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; - if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { - s->s_count -= S_BIAS-1; - spin_unlock(&sb_lock); + if (atomic_dec_and_test(&s->s_active)) { vfs_dq_off(s, 0); - down_write(&s->s_umount); fs->kill_sb(s); put_filesystem(fs); put_super(s); + } else { + up_write(&s->s_umount); } } -EXPORT_SYMBOL(deactivate_super); +EXPORT_SYMBOL(deactivate_locked_super); /** - * deactivate_locked_super - drop an active reference to superblock + * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * - * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that - * it does not unlock it until it's all over. As the result, it's safe to - * use to dispose of new superblock on ->get_sb() failure exits - nobody - * will see the sucker until it's all over. Equivalent using up_write + - * deactivate_super is safe for that purpose only if superblock is either - * safe to use or has NULL ->s_root when we unlock. + * Variant of deactivate_locked_super(), except that superblock is *not* + * locked by caller. If we are going to drop the final active reference, + * lock will be acquired prior to that. */ -void deactivate_locked_super(struct super_block *s) +void deactivate_super(struct super_block *s) { - struct file_system_type *fs = s->s_type; - if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { - s->s_count -= S_BIAS-1; - spin_unlock(&sb_lock); - vfs_dq_off(s, 0); - fs->kill_sb(s); - put_filesystem(fs); - put_super(s); - } else { - up_write(&s->s_umount); + if (!atomic_add_unless(&s->s_active, -1, 1)) { + down_write(&s->s_umount); + deactivate_locked_super(s); } } -EXPORT_SYMBOL(deactivate_locked_super); +EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference @@ -243,22 +202,17 @@ EXPORT_SYMBOL(deactivate_locked_super); */ static int grab_super(struct super_block *s) __releases(sb_lock) { + if (atomic_inc_not_zero(&s->s_active)) { + spin_unlock(&sb_lock); + return 1; + } + /* it's going away */ s->s_count++; spin_unlock(&sb_lock); + /* wait for it to die */ down_write(&s->s_umount); - if (s->s_root) { - spin_lock(&sb_lock); - if (s->s_count > S_BIAS) { - atomic_inc(&s->s_active); - s->s_count--; - spin_unlock(&sb_lock); - return 1; - } - spin_unlock(&sb_lock); - } up_write(&s->s_umount); put_super(s); - yield(); return 0; } @@ -321,8 +275,7 @@ void generic_shutdown_super(struct super_block *sb) } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ - list_del_init(&sb->s_list); - list_del(&sb->s_instances); + list_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); } @@ -357,6 +310,7 @@ retry: up_write(&s->s_umount); destroy_super(s); } + down_write(&old->s_umount); return old; } } @@ -408,11 +362,12 @@ EXPORT_SYMBOL(drop_super); */ void sync_supers(void) { - struct super_block *sb; + struct super_block *sb, *n; spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_op->write_super && sb->s_dirt) { sb->s_count++; spin_unlock(&sb_lock); @@ -423,14 +378,43 @@ restart: up_read(&sb->s_umount); spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; + __put_super(sb); } } spin_unlock(&sb_lock); } /** + * iterate_supers - call function for all active superblocks + * @f: function to call + * @arg: argument to pass to it + * + * Scans the superblock list and calls given function, passing it + * locked superblock and given argument. + */ +void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + struct super_block *sb, *n; + + spin_lock(&sb_lock); + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root) + f(sb, arg); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + __put_super(sb); + } + spin_unlock(&sb_lock); +} + +/** * get_super - get the superblock of a device * @bdev: device to get the superblock for * @@ -438,7 +422,7 @@ restart: * mounted on the device given. %NULL is returned if no match is found. */ -struct super_block * get_super(struct block_device *bdev) +struct super_block *get_super(struct block_device *bdev) { struct super_block *sb; @@ -448,17 +432,20 @@ struct super_block * get_super(struct block_device *bdev) spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); + /* still alive? */ if (sb->s_root) return sb; up_read(&sb->s_umount); - /* restart only when sb is no longer on the list */ + /* nope, got unmounted */ spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto rescan; + __put_super(sb); + goto rescan; } } spin_unlock(&sb_lock); @@ -473,7 +460,7 @@ EXPORT_SYMBOL(get_super); * * Scans the superblock list and finds the superblock of the file system * mounted on the device given. Returns the superblock with an active - * reference and s_umount held exclusively or %NULL if none was found. + * reference or %NULL if none was found. */ struct super_block *get_active_super(struct block_device *bdev) { @@ -482,81 +469,49 @@ struct super_block *get_active_super(struct block_device *bdev) if (!bdev) return NULL; +restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_bdev != bdev) + if (list_empty(&sb->s_instances)) continue; - - sb->s_count++; - spin_unlock(&sb_lock); - down_write(&sb->s_umount); - if (sb->s_root) { - spin_lock(&sb_lock); - if (sb->s_count > S_BIAS) { - atomic_inc(&sb->s_active); - sb->s_count--; - spin_unlock(&sb_lock); + if (sb->s_bdev == bdev) { + if (grab_super(sb)) /* drops sb_lock */ return sb; - } - spin_unlock(&sb_lock); + else + goto restart; } - up_write(&sb->s_umount); - put_super(sb); - yield(); - spin_lock(&sb_lock); } spin_unlock(&sb_lock); return NULL; } -struct super_block * user_get_super(dev_t dev) +struct super_block *user_get_super(dev_t dev) { struct super_block *sb; spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; if (sb->s_dev == dev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); + /* still alive? */ if (sb->s_root) return sb; up_read(&sb->s_umount); - /* restart only when sb is no longer on the list */ + /* nope, got unmounted */ spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto rescan; + __put_super(sb); + goto rescan; } } spin_unlock(&sb_lock); return NULL; } -SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) -{ - struct super_block *s; - struct ustat tmp; - struct kstatfs sbuf; - int err = -EINVAL; - - s = user_get_super(new_decode_dev(dev)); - if (s == NULL) - goto out; - err = vfs_statfs(s->s_root, &sbuf); - drop_super(s); - if (err) - goto out; - - memset(&tmp,0,sizeof(struct ustat)); - tmp.f_tfree = sbuf.f_bfree; - tmp.f_tinode = sbuf.f_ffree; - - err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0; -out: - return err; -} - /** * do_remount_sb - asks filesystem to change mount options. * @sb: superblock in question @@ -622,24 +577,24 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) static void do_emergency_remount(struct work_struct *work) { - struct super_block *sb; + struct super_block *sb, *n; spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { + list_for_each_entry_safe(sb, n, &super_blocks, s_list) { + if (list_empty(&sb->s_instances)) + continue; sb->s_count++; spin_unlock(&sb_lock); down_write(&sb->s_umount); if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { /* - * ->remount_fs needs lock_kernel(). - * * What lock protects sb->s_flags?? */ do_remount_sb(sb, MS_RDONLY, NULL, 1); } up_write(&sb->s_umount); - put_super(sb); spin_lock(&sb_lock); + __put_super(sb); } spin_unlock(&sb_lock); kfree(work); @@ -990,6 +945,96 @@ out: EXPORT_SYMBOL_GPL(vfs_kern_mount); +/** + * freeze_super -- lock the filesystem and force it into a consistent state + * @super: the super to lock + * + * Syncs the super to make sure the filesystem is consistent and calls the fs's + * freeze_fs. Subsequent calls to this without first thawing the fs will return + * -EBUSY. + */ +int freeze_super(struct super_block *sb) +{ + int ret; + + atomic_inc(&sb->s_active); + down_write(&sb->s_umount); + if (sb->s_frozen) { + deactivate_locked_super(sb); + return -EBUSY; + } + + if (sb->s_flags & MS_RDONLY) { + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + up_write(&sb->s_umount); + return 0; + } + + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + sync_filesystem(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + if (sb->s_op->freeze_fs) { + ret = sb->s_op->freeze_fs(sb); + if (ret) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + deactivate_locked_super(sb); + return ret; + } + } + up_write(&sb->s_umount); + return 0; +} +EXPORT_SYMBOL(freeze_super); + +/** + * thaw_super -- unlock filesystem + * @sb: the super to thaw + * + * Unlocks the filesystem and marks it writeable again after freeze_super(). + */ +int thaw_super(struct super_block *sb) +{ + int error; + + down_write(&sb->s_umount); + if (sb->s_frozen == SB_UNFROZEN) { + up_write(&sb->s_umount); + return -EINVAL; + } + + if (sb->s_flags & MS_RDONLY) + goto out; + + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + up_write(&sb->s_umount); + return error; + } + } + +out: + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + deactivate_locked_super(sb); + + return 0; +} +EXPORT_SYMBOL(thaw_super); + static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) { int err; diff --git a/fs/sync.c b/fs/sync.c index 92b228176f7..e8cbd415e50 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb(sb); + writeback_inodes_sb_locked(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); @@ -77,50 +77,18 @@ int sync_filesystem(struct super_block *sb) } EXPORT_SYMBOL_GPL(sync_filesystem); +static void sync_one_sb(struct super_block *sb, void *arg) +{ + if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi) + __sync_filesystem(sb, *(int *)arg); +} /* * Sync all the data for all the filesystems (called by sys_sync() and * emergency sync) - * - * This operation is careful to avoid the livelock which could easily happen - * if two or more filesystems are being continuously dirtied. s_need_sync - * is used only here. We set it against all filesystems and then clear it as - * we sync them. So redirtied filesystems are skipped. - * - * But if process A is currently running sync_filesystems and then process B - * calls sync_filesystems as well, process B will set all the s_need_sync - * flags again, which will cause process A to resync everything. Fix that with - * a local mutex. */ static void sync_filesystems(int wait) { - struct super_block *sb; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); /* Could be down_interruptible */ - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) - sb->s_need_sync = 1; - -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_need_sync) - continue; - sb->s_need_sync = 0; - sb->s_count++; - spin_unlock(&sb_lock); - - down_read(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi) - __sync_filesystem(sb, wait); - up_read(&sb->s_umount); - - /* restart only when sb is no longer on the list */ - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); - mutex_unlock(&mutex); + iterate_supers(sync_one_sb, &wait); } /* @@ -190,7 +158,6 @@ EXPORT_SYMBOL(file_fsync); /** * vfs_fsync_range - helper to sync a range of data & metadata to disk * @file: file to sync - * @dentry: dentry of @file * @start: offset in bytes of the beginning of data range to sync * @end: offset in bytes of the end of data range (inclusive) * @datasync: perform only datasync @@ -198,32 +165,13 @@ EXPORT_SYMBOL(file_fsync); * Write back data in range @start..@end and metadata for @file to disk. If * @datasync is set only metadata needed to access modified file data is * written. - * - * In case this function is called from nfsd @file may be %NULL and - * only @dentry is set. This can only happen when the filesystem - * implements the export_operations API. */ -int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, - loff_t end, int datasync) +int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { - const struct file_operations *fop; - struct address_space *mapping; + struct address_space *mapping = file->f_mapping; int err, ret; - /* - * Get mapping and operations from the file in case we have - * as file, or get the default values for them in case we - * don't have a struct file available. Damn nfsd.. - */ - if (file) { - mapping = file->f_mapping; - fop = file->f_op; - } else { - mapping = dentry->d_inode->i_mapping; - fop = dentry->d_inode->i_fop; - } - - if (!fop || !fop->fsync) { + if (!file->f_op || !file->f_op->fsync) { ret = -EINVAL; goto out; } @@ -235,7 +183,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, * livelocks in fsync_buffers_list(). */ mutex_lock(&mapping->host->i_mutex); - err = fop->fsync(file, dentry, datasync); + err = file->f_op->fsync(file, file->f_path.dentry, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); @@ -248,19 +196,14 @@ EXPORT_SYMBOL(vfs_fsync_range); /** * vfs_fsync - perform a fsync or fdatasync on a file * @file: file to sync - * @dentry: dentry of @file * @datasync: only perform a fdatasync operation * * Write back data and metadata for @file to disk. If @datasync is * set only metadata needed to access modified file data is written. - * - * In case this function is called from nfsd @file may be %NULL and - * only @dentry is set. This can only happen when the filesystem - * implements the export_operations API. */ -int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int vfs_fsync(struct file *file, int datasync) { - return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); + return vfs_fsync_range(file, 0, LLONG_MAX, datasync); } EXPORT_SYMBOL(vfs_fsync); @@ -271,7 +214,7 @@ static int do_fsync(unsigned int fd, int datasync) file = fget(fd); if (file) { - ret = vfs_fsync(file, file->f_path.dentry, datasync); + ret = vfs_fsync(file, datasync); fput(file); } return ret; @@ -299,8 +242,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count) { if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) return 0; - return vfs_fsync_range(file, file->f_path.dentry, pos, - pos + count - 1, + return vfs_fsync_range(file, pos, pos + count - 1, (file->f_flags & __O_SYNC) ? 0 : 1); } EXPORT_SYMBOL(generic_write_sync); diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 241e9765cfa..bbd69bdb0fa 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -159,15 +159,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); dirty_sb(sb); - - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - - inode->i_uid = current_fsuid(); + inode_init_owner(inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; inode->i_blocks = 0; @@ -176,7 +168,6 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode) insert_inode_hash(inode); mark_inode_dirty(inode); - inode->i_mode = mode; /* for sysv_write_inode() */ sysv_write_inode(inode, 0); /* ensure inode not allocated again */ mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ /* That's it. */ diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 401e503d44a..87ebcce7221 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, */ inode->i_flags |= (S_NOCMTIME); - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - inode->i_mode = mode; + inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = ubifs_current_time(inode); inode->i_mapping->nrpages = 0; diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index fb68c9cd0c3..2b5586c7f02 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -124,15 +124,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) udf_updated_lvid(sb); } mutex_unlock(&sbi->s_alloc_mutex); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - inode->i_gid = current_fsgid(); - } + + inode_init_owner(inode, dir, mode); iinfo->i_location.logicalBlockNum = block; iinfo->i_location.partitionReferenceNum = diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 75816025f95..585f733615d 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -579,7 +579,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode, inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; - inode->i_mode = mode; mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); @@ -627,7 +626,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode, goto out; iinfo = UDF_I(inode); - inode->i_uid = current_fsuid(); init_special_inode(inode, mode, rdev); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { @@ -674,7 +672,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) goto out; err = -EIO; - inode = udf_new_inode(dir, S_IFDIR, &err); + inode = udf_new_inode(dir, S_IFDIR | mode, &err); if (!inode) goto out; @@ -697,9 +695,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); brelse(fibh.sbh); - inode->i_mode = S_IFDIR | mode; - if (dir->i_mode & S_ISGID) - inode->i_mode |= S_ISGID; mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); @@ -912,7 +907,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, dquot_initialize(dir); lock_kernel(); - inode = udf_new_inode(dir, S_IFLNK, &err); + inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); if (!inode) goto out; @@ -923,7 +918,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, } iinfo = UDF_I(inode); - inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 230ecf60802..3a959d55084 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -303,15 +303,7 @@ cg_found: sb->s_dirt = 1; inode->i_ino = cg * uspi->s_ipg + bit; - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } else - inode->i_gid = current_fsgid(); - + inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; diff --git a/fs/xattr.c b/fs/xattr.c index 46f87e828b4..01bb8135e14 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix) /* * Find the xattr_handler with the matching prefix. */ -static struct xattr_handler * -xattr_resolve_name(struct xattr_handler **handlers, const char **name) +static const struct xattr_handler * +xattr_resolve_name(const struct xattr_handler **handlers, const char **name) { - struct xattr_handler *handler; + const struct xattr_handler *handler; if (!*name) return NULL; @@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name) ssize_t generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) @@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; + const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; unsigned int size = 0; if (!buffer) { @@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct xattr_handler *handler; + const struct xattr_handler *handler; if (size == 0) value = ""; /* empty EA, do not remove */ @@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz int generic_removexattr(struct dentry *dentry, const char *name) { - struct xattr_handler *handler; + const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (!handler) diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index a7bc925c4d6..9f769b5b38f 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, return error; } -struct xattr_handler xfs_xattr_acl_access_handler = { +const struct xattr_handler xfs_xattr_acl_access_handler = { .prefix = POSIX_ACL_XATTR_ACCESS, .flags = ACL_TYPE_ACCESS, .get = xfs_xattr_acl_get, .set = xfs_xattr_acl_set, }; -struct xattr_handler xfs_xattr_acl_default_handler = { +const struct xattr_handler xfs_xattr_acl_default_handler = { .prefix = POSIX_ACL_XATTR_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = xfs_xattr_acl_get, diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index e9002513e08..f24dbe5efde 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -725,7 +725,8 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, + BLKDEV_IFL_WAIT); } STATIC void diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 233d4b9881b..519618e9279 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern const struct export_operations xfs_export_operations; -extern struct xattr_handler *xfs_xattr_handlers[]; +extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c index fa01b9daba6..87d3e03878c 100644 --- a/fs/xfs/linux-2.6/xfs_xattr.c +++ b/fs/xfs/linux-2.6/xfs_xattr.c @@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, (void *)value, size, xflags); } -static struct xattr_handler xfs_xattr_user_handler = { +static const struct xattr_handler xfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = 0, /* no flags implies user namespace */ .get = xfs_xattr_get, .set = xfs_xattr_set, }; -static struct xattr_handler xfs_xattr_trusted_handler = { +static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; -static struct xattr_handler xfs_xattr_security_handler = { +static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; -struct xattr_handler *xfs_xattr_handlers[] = { +const struct xattr_handler *xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index d13eeba2c8f..0135e2a669d 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode); extern int posix_acl_access_exists(struct inode *inode); extern int posix_acl_default_exists(struct inode *inode); -extern struct xattr_handler xfs_xattr_acl_access_handler; -extern struct xattr_handler xfs_xattr_acl_default_handler; +extern const struct xattr_handler xfs_xattr_acl_access_handler; +extern const struct xattr_handler xfs_xattr_acl_default_handler; #else # define xfs_check_acl NULL # define xfs_get_acl(inode, type) NULL |