From bce41d601e58af12cee1398fe836e6b9a8fb5396 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Tue, 10 Jan 2012 15:32:29 +0200 Subject: jffs2: do not initialize variable unnecessarily Remove unnecessary initializer for a local variable. Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- fs/jffs2/erase.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index a01cdad6aad..eafb8d37a6f 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -335,7 +335,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl void *ebuf; uint32_t ofs; size_t retlen; - int ret = -EIO; + int ret; unsigned long *wordebuf; ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen, -- cgit v1.2.3-18-g5258 From 803ab977618eae2b292cda0a97eed75f42250ddf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 24 Jan 2012 11:39:22 +0300 Subject: cifs: NULL dereference on allocation failure We should just return directly here, the goto causes a NULL dereference. Signed-off-by: Dan Carpenter Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 986709a8d90..026d6464335 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3857,10 +3857,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) struct smb_vol *vol_info; vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL); - if (vol_info == NULL) { - tcon = ERR_PTR(-ENOMEM); - goto out; - } + if (vol_info == NULL) + return ERR_PTR(-ENOMEM); vol_info->local_nls = cifs_sb->local_nls; vol_info->linux_uid = fsuid; -- cgit v1.2.3-18-g5258 From 0863b04d1578879173aacbc5c7be749fccb70809 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 21 Jan 2012 11:02:42 -0800 Subject: kernel-doc: fix new warnings in debugfs Fix new kernel-doc warnings: Warning(fs/debugfs/file.c:556): No description found for parameter 'nregs' Warning(fs/debugfs/file.c:556): Excess function parameter 'mregs' description in 'debugfs_print_regs32' Signed-off-by: Randy Dunlap Cc: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index f65d4455c5e..ef023eef046 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_blob); * debugfs_print_regs32 - use seq_print to describe a set of registers * @s: the seq_file structure being used to generate output * @regs: an array if struct debugfs_reg32 structures - * @mregs: the length of the above array + * @nregs: the length of the above array * @base: the base address to be used in reading the registers * @prefix: a string to be prefixed to every output line * -- cgit v1.2.3-18-g5258 From ce597919361dcec97341151690e780eade2a9cf4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 13 Jan 2012 21:32:59 -0800 Subject: sysfs: Complain bitterly about attempts to remove files from nonexistent directories. Recently an OOPS was observed from the usb serial io_ti driver when it tried to remove sysfs directories. Upon investigation it turns out this driver was always buggy and that a recent sysfs change had stopped guarding itself against removing attributes from sysfs directories that had already been removed. :( Historically we have been silent about attempting to files from nonexistent sysfs directories and have politely returned error codes. That has resulted in people writing broken code that ignores the error codes. Issue a kernel WARNING and a stack backtrace to make it clear in no uncertain terms that abusing sysfs is not ok, and the callers need to fix their code. This change transforms the io_ti OOPS into a more comprehensible error message and stack backtrace. Signed-off-by: Eric W. Biederman Reported-by: Wolfgang Frisch Cc: stable Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 6 ++++++ fs/sysfs/inode.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 62f4fb37789..00012e31829 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -493,6 +493,12 @@ int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, const void *ns = NULL; int err; + if (!dir_sd) { + WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n", + kobject_name(kobj)); + return -ENOENT; + } + err = 0; if (!sysfs_ns_type(dir_sd)) goto out; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 4a802b4a905..85eb81683a2 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -318,8 +318,11 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; - if (!dir_sd) + if (!dir_sd) { + WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n", + name); return -ENOENT; + } sysfs_addrm_start(&acxt, dir_sd); -- cgit v1.2.3-18-g5258 From b1375d64c539c5b76794be759b62d3f178e67c32 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: Btrfs: fix uninit warning in backref.c Added initialization with the declaration of ret. It isn't set later on the switch-default branch (which should never be taken). Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b9a843226de..633c701a287 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -297,7 +297,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; int sgn; - int ret; + int ret = 0; if (extent_op && extent_op->update_key) btrfs_disk_key_to_cpu(info_key, &extent_op->key); @@ -392,7 +392,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_key *info_key, int *info_level, struct list_head *prefs) { - int ret; + int ret = 0; int slot; struct extent_buffer *leaf; struct btrfs_key key; -- cgit v1.2.3-18-g5258 From 357b9784b79924a31ccded5d9a0c688f48cc28f2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: Btrfs: make sure a bitmap has enough bytes We have only been checking for min_bytes available in bitmap entries, but we won't successfully setup a bitmap cluster unless it has at least bytes in the bitmap, so in the common case min_bytes is 4k and we want something like 2MB, so if there are a bunch of bitmap entries with less than 2mb's in them, we'll search all them anyway, which is suboptimal. Fix this check. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index efe20032e4a..6e740693234 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2475,7 +2475,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } list_for_each_entry(entry, bitmaps, list) { - if (entry->bytes < min_bytes) + if (entry->bytes < bytes) continue; ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, bytes, cont1_bytes, min_bytes); -- cgit v1.2.3-18-g5258 From 6dd70ce4eb7429c2ba6dd9fa46f78a0a2a254038 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: btrfs: Fix busyloops in transaction waiting code wait_log_commit() and wait_for_writer() were using slightly different conditions for deciding whether they should call schedule() and whether they should continue in the wait loop. Thus it could happen that we busylooped when the first condition was not true while the second one was. That is burning CPU cycles needlessly and is deadly on UP machines... Signed-off-by: Jan Kara Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cb877e0886a..966cc74f5d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1957,7 +1957,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->log_transid < transid + 2 && + } while (root->fs_info->last_trans_log_full_commit != + trans->transid && root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])); return 0; } @@ -1966,7 +1967,8 @@ static int wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (atomic_read(&root->log_writers)) { + while (root->fs_info->last_trans_log_full_commit != + trans->transid && atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); -- cgit v1.2.3-18-g5258 From 8bedd51b6121c4607784d75f852828d25d119c52 Mon Sep 17 00:00:00 2001 From: Mitch Harder Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: Btrfs: Check for NULL page in extent_range_uptodate A user has encountered a NULL pointer kernel oops in btrfs when encountering media errors. The problem has been identified as an unhandled NULL pointer returned from find_get_page(). This modification simply checks for a NULL page, and returns with an error if found (the extent_range_uptodate() function returns 1 on errors). After testing this patch, the user reported that the error with the NULL pointer oops was solved. However, there is still a remaining problem with a thread becoming stuck in wait_on_page_locked(page) in the read_extent_buffer_pages(...) function in extent_io.c for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); wait_on_page_locked(page); if (!PageUptodate(page)) ret = -EIO; } This patch leaves the issue with the locked page yet to be resolved. Signed-off-by: Mitch Harder Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9d09a4f8187..fcf77e1ded4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3909,6 +3909,8 @@ int extent_range_uptodate(struct extent_io_tree *tree, while (start <= end) { index = start >> PAGE_CACHE_SHIFT; page = find_get_page(tree->mapping, index); + if (!page) + return 1; uptodate = PageUptodate(page); page_cache_release(page); if (!uptodate) { -- cgit v1.2.3-18-g5258 From 0b4a9d248f88e6773312f262e8185f23863d984a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: Btrfs: use cluster->window_start when allocating from a cluster bitmap We specifically set window_start in the cluster struct to indicate where the cluster starts in a bitmap, but we've been using min_start to indicate where we're searching from. This is usually the start of the blockgroup, so essentially means we're constantly searching from the start of any bitmap we find, which completely negates all the trouble we go to in order to setup a cluster. So start using window_start to make sure we actually use the area we found. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6e740693234..61447a51f64 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2242,7 +2242,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, if (entry->bitmap) { ret = btrfs_alloc_from_bitmap(block_group, cluster, entry, bytes, - min_start); + cluster->window_start); if (ret == 0) { node = rb_next(&entry->offset_index); if (!node) -- cgit v1.2.3-18-g5258 From 0b485143d835c019cddc45f46e4b3873dcc9aa4e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Thu, 26 Jan 2012 15:01:11 -0500 Subject: Btrfs: fix warning for 32-bit build of fs/btrfs/check-integrity.c There have been 4 warnings on 32-bit build, they are herewith fixed. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ad0b3ba735b..b669a7d8e49 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1662,7 +1662,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, &state->block_hashtable); if (NULL != block) { - u64 bytenr; + u64 bytenr = 0; struct list_head *elem_ref_to; struct list_head *tmp_ref_to; @@ -2777,9 +2777,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) printk(KERN_INFO "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu)," " size=%lu, data=%p, bdev=%p)\n", - rw, bh->b_blocknr, - (unsigned long long)dev_bytenr, bh->b_size, - bh->b_data, bh->b_bdev); + rw, (unsigned long)bh->b_blocknr, + (unsigned long long)dev_bytenr, + (unsigned long)bh->b_size, bh->b_data, + bh->b_bdev); btrfsic_process_written_block(dev_state, dev_bytenr, bh->b_data, bh->b_size, NULL, NULL, bh, rw); @@ -2844,7 +2845,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio) printk(KERN_INFO "submit_bio(rw=0x%x, bi_vcnt=%u," " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n", - rw, bio->bi_vcnt, bio->bi_sector, + rw, bio->bi_vcnt, (unsigned long)bio->bi_sector, (unsigned long long)dev_bytenr, bio->bi_bdev); -- cgit v1.2.3-18-g5258 From 7ec31b548a17f773ab6289e795ed3a6820e8b56e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: Btrfs: do not defrag a file partially xfstests 218 complains that btrfs defrags a file partially: After: 1 Write backwards sync, but contiguous - should defrag to 1 extent Before: 10 -After: 1 +After: 2 To fix this, we need to set max_to_defrag count properly. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6834be4c870..0b06a5ca8af 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1066,7 +1066,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index; + max_to_defrag = last_index + 1; /* * make writeback starts from i, so the defrag range can be -- cgit v1.2.3-18-g5258 From 9e622d6bea0202e9fe267955362c01918562c09b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: Btrfs: fix enospc error caused by wrong checks of the chunk When we did sysbench test for inline files, enospc error happened easily though there was lots of free disk space which could be allocated for new chunks. Reproduce steps: # mkfs.btrfs -b $((2 * 1024 * 1024 * 1024)) # mount /mnt # ulimit -n 102400 # cd /mnt # sysbench --num-threads=1 --test=fileio --file-num=81920 \ > --file-total-size=80M --file-block-size=1K --file-io-mode=sync \ > --file-test-mode=seqwr prepare # sysbench --num-threads=1 --test=fileio --file-num=81920 \ > --file-total-size=80M --file-block-size=1K --file-io-mode=sync \ > --file-test-mode=seqwr run The reason of this bug is: Now, we can reserve space which is larger than the free space in the chunks if we have enough free disk space which can be used for new chunks. By this way, the space allocator should allocate a new chunk by force if there is no free space in the free space cache. But there are two wrong checks which break this operation. One is if (ret == -ENOSPC && num_bytes > min_alloc_size) in btrfs_reserve_extent(), it is wrong, we should try to allocate a new chunk even we fail to allocate free space by minimum allocable size. The other is if (space_info->force_alloc) force = space_info->force_alloc; in do_chunk_alloc(). It makes the allocator ignore CHUNK_ALLOC_FORCE If someone sets ->force_alloc to CHUNK_ALLOC_LIMITED, and makes the enospc error happen. Fix these two wrong checks. Especially the second one, we fix it by changing the value of CHUNK_ALLOC_LIMITED and CHUNK_ALLOC_FORCE, and make CHUNK_ALLOC_FORCE greater than CHUNK_ALLOC_LIMITED since CHUNK_ALLOC_FORCE has higher priority. And if the value which is passed in by the caller is greater than ->force_alloc, use the passed value. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 49 +++++++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 700879ed64c..283af7a676a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,23 +34,24 @@ #include "locking.h" #include "free-space-cache.h" -/* control flags for do_chunk_alloc's force field +/* + * control flags for do_chunk_alloc's force field * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk * if we really need one. * - * CHUNK_ALLOC_FORCE means it must try to allocate one - * * CHUNK_ALLOC_LIMITED means to only try and allocate one * if we have very few chunks already allocated. This is * used as part of the clustering code to help make sure * we have a good pool of storage to cluster in, without * filling the FS with empty chunks * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * */ enum { CHUNK_ALLOC_NO_FORCE = 0, - CHUNK_ALLOC_FORCE = 1, - CHUNK_ALLOC_LIMITED = 2, + CHUNK_ALLOC_LIMITED = 1, + CHUNK_ALLOC_FORCE = 2, }; /* @@ -3414,7 +3415,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, again: spin_lock(&space_info->lock); - if (space_info->force_alloc) + if (force < space_info->force_alloc) force = space_info->force_alloc; if (space_info->full) { spin_unlock(&space_info->lock); @@ -5794,6 +5795,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data) { + bool final_tried = false; int ret; u64 search_start = 0; @@ -5813,22 +5815,25 @@ again: search_start, search_end, hint_byte, ins, data); - if (ret == -ENOSPC && num_bytes > min_alloc_size) { - num_bytes = num_bytes >> 1; - num_bytes = num_bytes & ~(root->sectorsize - 1); - num_bytes = max(num_bytes, min_alloc_size); - do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, CHUNK_ALLOC_FORCE); - goto again; - } - if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { - struct btrfs_space_info *sinfo; - - sinfo = __find_space_info(root->fs_info, data); - printk(KERN_ERR "btrfs allocation failed flags %llu, " - "wanted %llu\n", (unsigned long long)data, - (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes, 1); + if (ret == -ENOSPC) { + if (!final_tried) { + num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = max(num_bytes, min_alloc_size); + do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data, CHUNK_ALLOC_FORCE); + if (num_bytes == min_alloc_size) + final_tried = true; + goto again; + } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); + dump_space_info(sinfo, num_bytes, 1); + } } trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); -- cgit v1.2.3-18-g5258 From 0c4e538bccc106872d31b1514570b4dac95fb7f2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: btrfs: mask out gfp flags in releasepage btree_releasepage is a callback and can be passed unknown gfp flags and then they may end up in kmem_cache_alloc called from alloc_extent_state, slab allocator will BUG_ON when there is HIGHMEM or DMA32 flag set. This may happen when btrfs is mounted from a loop device, which masks out __GFP_IO flag. The check in try_release_extent_state 3399 if ((mask & GFP_NOFS) == GFP_NOFS) 3400 mask = GFP_NOFS; will not work and passes unfiltered flags further resulting in crash at mm/slab.c:2963 [<000000000024ae4c>] cache_alloc_refill+0x3b4/0x5c8 [<000000000024c810>] kmem_cache_alloc+0x204/0x294 [<00000000001fd3c2>] mempool_alloc+0x52/0x170 [<000003c000ced0b0>] alloc_extent_state+0x40/0xd4 [btrfs] [<000003c000cee5ae>] __clear_extent_bit+0x38a/0x4cc [btrfs] [<000003c000cee78c>] try_release_extent_state+0x9c/0xd4 [btrfs] [<000003c000cc4c66>] btree_releasepage+0x7e/0xd0 [btrfs] [<0000000000210d84>] shrink_page_list+0x6a0/0x724 [<0000000000211394>] shrink_inactive_list+0x230/0x578 [<0000000000211bb8>] shrink_list+0x6c/0x120 [<0000000000211e4e>] shrink_zone+0x1e2/0x228 [<0000000000211f24>] shrink_zones+0x90/0x254 [<0000000000213410>] do_try_to_free_pages+0xac/0x420 [<0000000000213ae0>] try_to_free_pages+0x13c/0x1b0 [<0000000000204e6c>] __alloc_pages_nodemask+0x5b4/0x9a8 [<00000000001fb04a>] grab_cache_page_write_begin+0x7e/0xe8 Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index da4457f84d7..4c867112b4c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -961,6 +961,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) tree = &BTRFS_I(page->mapping->host)->io_tree; map = &BTRFS_I(page->mapping->host)->extent_tree; + /* + * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing + * slab allocation from alloc_extent_state down the callchain where + * it'd hit a BUG_ON as those flags are not allowed. + */ + gfp_flags &= ~GFP_SLAB_BUG_MASK; + ret = try_release_extent_state(map, tree, page, gfp_flags); if (!ret) return 0; -- cgit v1.2.3-18-g5258 From 9b23062840e7c685ef0a0b561285d6e3a3b6811b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: Btrfs: advance window_start if we're using a bitmap If we span a long area in a bitmap we could end up taking a lot of time searching to the next free area if we're searching from the original window_start, so advance window_start in order to make sure we don't do any superficial searching. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 61447a51f64..5802b1473c3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2251,6 +2251,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, offset_index); continue; } + cluster->window_start += bytes; } else { ret = entry->offset; -- cgit v1.2.3-18-g5258 From 9998eb703490589c3e8f1bf09b15203156776edb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 25 Jan 2012 13:47:40 -0500 Subject: Btrfs: fix reservations in btrfs_page_mkwrite Josef fixed btrfs_page_mkwrite to properly release reserved extents if there was an error. But if we fail to get a reservation and we fail to dirty the inode (for ENOSPC reasons), we'll end up trying to release a reservation we never had. This makes sure we only release if we were able to reserve. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5977987abdb..7405753ec5d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6401,18 +6401,23 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long zero_start; loff_t size; int ret; + int reserved = 0; u64 page_start; u64 page_end; ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); - if (!ret) + if (!ret) { ret = btrfs_update_time(vma->vm_file); + reserved = 1; + } if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; else /* -ENOSPC, -EIO, etc */ ret = VM_FAULT_SIGBUS; - goto out; + if (reserved) + goto out; + goto out_noreserve; } ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ @@ -6495,6 +6500,7 @@ out_unlock: unlock_page(page); out: btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +out_noreserve: return ret; } -- cgit v1.2.3-18-g5258 From 96150606e2fb82d242c9e4a414e4e922849f7bf7 Mon Sep 17 00:00:00 2001 From: Prasad Joshi Date: Sat, 26 Nov 2011 11:00:47 +0530 Subject: logfs: update page reference count for pined pages LogFS sets PG_private flag to indicate a pined page. We assumed that marking a page as private is enough to ensure its existence. But instead it is necessary to hold a reference count to the page. The change resolves the following BUG BUG: Bad page state in process flush-253:16 pfn:6a6d0 page flags: 0x100000000000808(uptodate|private) Suggested-and-Acked-by: Joern Engel Signed-off-by: Prasad Joshi --- fs/logfs/readwrite.c | 29 ++++++++++++++++++++++------- fs/logfs/segment.c | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 51 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 2ac4217b790..6d663e8ea6d 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -560,8 +560,13 @@ static void inode_free_block(struct super_block *sb, struct logfs_block *block) static void indirect_free_block(struct super_block *sb, struct logfs_block *block) { - ClearPagePrivate(block->page); - block->page->private = 0; + struct page *page = block->page; + + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + set_page_private(page, 0); + } __free_block(sb, block); } @@ -650,8 +655,11 @@ static void alloc_data_block(struct inode *inode, struct page *page) logfs_unpack_index(page->index, &bix, &level); block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); block->page = page; + SetPagePrivate(page); - page->private = (unsigned long)block; + page_cache_get(page); + set_page_private(page, (unsigned long) block); + block->ops = &indirect_block_ops; } @@ -1901,8 +1909,11 @@ static void move_page_to_inode(struct inode *inode, struct page *page) li->li_block = block; block->page = NULL; - page->private = 0; - ClearPagePrivate(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + set_page_private(page, 0); + } } static void move_inode_to_page(struct page *page, struct inode *inode) @@ -1918,8 +1929,12 @@ static void move_inode_to_page(struct page *page, struct inode *inode) BUG_ON(PagePrivate(page)); block->ops = &indirect_block_ops; block->page = page; - page->private = (unsigned long)block; - SetPagePrivate(page); + + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, (unsigned long) block); + } block->inode = NULL; li->li_block = NULL; diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 9d518735325..6aee6092860 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -86,7 +86,11 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, BUG_ON(!page); /* FIXME: reserve a pool */ SetPageUptodate(page); memcpy(page_address(page) + offset, buf, copylen); - SetPagePrivate(page); + + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + } page_cache_release(page); buf += copylen; @@ -110,7 +114,10 @@ static void pad_partial_page(struct logfs_area *area) page = get_mapping_page(sb, index, 0); BUG_ON(!page); /* FIXME: reserve a pool */ memset(page_address(page) + offset, 0xff, len); - SetPagePrivate(page); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + } page_cache_release(page); } } @@ -130,7 +137,10 @@ static void pad_full_pages(struct logfs_area *area) BUG_ON(!page); /* FIXME: reserve a pool */ SetPageUptodate(page); memset(page_address(page), 0xff, PAGE_CACHE_SIZE); - SetPagePrivate(page); + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + } page_cache_release(page); index++; no_indizes--; @@ -485,8 +495,12 @@ static void move_btree_to_page(struct inode *inode, struct page *page, mempool_free(item, super->s_alias_pool); } block->page = page; - SetPagePrivate(page); - page->private = (unsigned long)block; + + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, (unsigned long) block); + } block->ops = &indirect_block_ops; initialize_block_counters(page, block, data, 0); } @@ -536,8 +550,12 @@ void move_page_to_btree(struct page *page) list_add(&item->list, &block->item_list); } block->page = NULL; - ClearPagePrivate(page); - page->private = 0; + + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + set_page_private(page, 0); + } block->ops = &btree_block_ops; err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, block); @@ -702,7 +720,10 @@ void freeseg(struct super_block *sb, u32 segno) page = find_get_page(mapping, ofs >> PAGE_SHIFT); if (!page) continue; - ClearPagePrivate(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + page_cache_release(page); + } page_cache_release(page); } } -- cgit v1.2.3-18-g5258 From 934eed395d201bf0901ca0c0cc3703b18729d0ce Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Sun, 20 Nov 2011 22:29:01 +0530 Subject: logfs: Prevent memory corruption This is a bad one. I wonder whether we were so far protected by no_free_segments(sb) usually being smaller than LOGFS_NO_AREAS. Found by Dan Carpenter using smatch. Signed-off-by: Joern Engel Signed-off-by: Prasad Joshi --- fs/logfs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c index caa4419285d..d4efb061bdc 100644 --- a/fs/logfs/gc.c +++ b/fs/logfs/gc.c @@ -367,7 +367,7 @@ static struct gc_candidate *get_candidate(struct super_block *sb) int i, max_dist; struct gc_candidate *cand = NULL, *this; - max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS); + max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1); for (i = max_dist; i >= 0; i--) { this = first_in_list(&super->s_low_list[i]); -- cgit v1.2.3-18-g5258 From 13ced29cb28996a9bc4f68e43ff0c57eafdb1e21 Mon Sep 17 00:00:00 2001 From: Prasad Joshi Date: Sat, 28 Jan 2012 11:36:06 +0530 Subject: logfs: take write mutex lock during fsync and sync LogFS uses super->s_write_mutex while writing data to disk. Taking the same mutex lock in sync and fsync code path solves the following BUG: ------------[ cut here ]------------ kernel BUG at /home/prasad/logfs/dev_bdev.c:134! Pid: 2387, comm: flush-253:16 Not tainted 3.0.0+ #4 Bochs Bochs RIP: 0010:[] [] bdev_writeseg+0x25d/0x270 [logfs] Call Trace: [] logfs_open_area+0x91/0x150 [logfs] [] ? find_level.clone.9+0x62/0x100 [] __logfs_segment_write.clone.20+0x5c/0x190 [logfs] [] ? mempool_kmalloc+0x15/0x20 [] ? mempool_alloc+0x53/0x130 [] logfs_segment_write+0x1d4/0x230 [logfs] [] logfs_write_i0+0x12e/0x190 [logfs] [] __logfs_write_rec+0x140/0x220 [logfs] [] logfs_write_rec+0x64/0xd0 [logfs] [] __logfs_write_buf+0x106/0x110 [logfs] [] logfs_write_buf+0x4e/0x80 [logfs] [] __logfs_writepage+0x23/0x80 [logfs] [] logfs_writepage+0xdc/0x110 [logfs] [] __writepage+0x17/0x40 [] write_cache_pages+0x208/0x4f0 [] ? set_page_dirty+0x70/0x70 [] generic_writepages+0x4a/0x70 [] do_writepages+0x21/0x40 [] writeback_single_inode+0x101/0x250 [] writeback_sb_inodes+0xed/0x1c0 [] writeback_inodes_wb+0x7b/0x1e0 [] wb_writeback+0x4c3/0x530 [] ? sub_preempt_count+0x9d/0xd0 [] wb_do_writeback+0xdb/0x290 [] ? sub_preempt_count+0x9d/0xd0 [] ? _raw_spin_unlock_irqrestore+0x18/0x40 [] ? del_timer+0x8a/0x120 [] bdi_writeback_thread+0x8c/0x2e0 [] ? wb_do_writeback+0x290/0x290 [] kthread+0x96/0xa0 [] kernel_thread_helper+0x4/0x10 [] ? kthread_worker_fn+0x190/0x190 [] ? gs_change+0xb/0xb RIP [] bdev_writeseg+0x25d/0x270 [logfs] ---[ end trace 0211ad60a57657c4 ]--- Reviewed-by: Joern Engel Signed-off-by: Prasad Joshi --- fs/logfs/file.c | 2 ++ fs/logfs/inode.c | 2 ++ fs/logfs/logfs.h | 2 ++ fs/logfs/readwrite.c | 6 ++---- 4 files changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/logfs/file.c b/fs/logfs/file.c index b548c87a86f..3886cded283 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -230,7 +230,9 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; mutex_lock(&inode->i_mutex); + logfs_get_wblocks(sb, NULL, WF_LOCK); logfs_write_anchor(sb); + logfs_put_wblocks(sb, NULL, WF_LOCK); mutex_unlock(&inode->i_mutex); return 0; diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 7e441ad5f79..388d7c5a7be 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -364,7 +364,9 @@ static void logfs_init_once(void *_li) static int logfs_sync_fs(struct super_block *sb, int wait) { + logfs_get_wblocks(sb, NULL, WF_LOCK); logfs_write_anchor(sb); + logfs_put_wblocks(sb, NULL, WF_LOCK); return 0; } diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 398ecff6e54..bb4340850c1 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -577,6 +577,8 @@ void initialize_block_counters(struct page *page, struct logfs_block *block, __be64 *array, int page_is_empty); int logfs_exist_block(struct inode *inode, u64 bix); int get_page_reserve(struct inode *inode, struct page *page); +void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock); +void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock); extern struct logfs_block_ops indirect_block_ops; /* segment.c */ diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 6d663e8ea6d..7b10e8aecce 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -244,8 +244,7 @@ static void preunlock_page(struct super_block *sb, struct page *page, int lock) * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked * in addition to PG_locked. */ -static void logfs_get_wblocks(struct super_block *sb, struct page *page, - int lock) +void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock) { struct logfs_super *super = logfs_super(sb); @@ -260,8 +259,7 @@ static void logfs_get_wblocks(struct super_block *sb, struct page *page, } } -static void logfs_put_wblocks(struct super_block *sb, struct page *page, - int lock) +void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock) { struct logfs_super *super = logfs_super(sb); -- cgit v1.2.3-18-g5258 From ecfd890991a26e70547e025673580923a004c5e4 Mon Sep 17 00:00:00 2001 From: Prasad Joshi Date: Sun, 30 Oct 2011 22:15:32 +0530 Subject: logfs: set superblock shutdown flag after generic sb shutdown While unmounting the file system LogFS calls generic_shutdown_super. The function does file system independent superblock shutdown. However, it might result in call file system specific inode eviction. LogFS marks FS shutting down by setting bit LOGFS_SB_FLAG_SHUTDOWN in super->s_flags. Since, inode eviction might call truncate on inode, following BUG is observed when file system is unmounted: ------------[ cut here ]------------ kernel BUG at /home/prasad/logfs/segment.c:362! invalid opcode: 0000 [#1] PREEMPT SMP CPU 3 Modules linked in: logfs binfmt_misc ppdev virtio_blk parport_pc lp parport psmouse floppy virtio_pci serio_raw virtio_ring virtio Pid: 1933, comm: umount Not tainted 3.0.0+ #4 Bochs Bochs RIP: 0010:[] [] logfs_segment_write+0x211/0x230 [logfs] RSP: 0018:ffff880062d7b9e8 EFLAGS: 00010202 RAX: 000000000000000e RBX: ffff88006eca9000 RCX: 0000000000000000 RDX: ffff88006fd87c40 RSI: ffffea00014ff468 RDI: ffff88007b68e000 RBP: ffff880062d7ba48 R08: 8000000020451430 R09: 0000000000000000 R10: dead000000100100 R11: 0000000000000000 R12: ffff88006fd87c40 R13: ffffea00014ff468 R14: ffff88005ad0a460 R15: 0000000000000000 FS: 00007f25d50ea760(0000) GS:ffff88007fd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000d05e48 CR3: 0000000062c72000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process umount (pid: 1933, threadinfo ffff880062d7a000, task ffff880070b44500) Stack: ffff880062d7ba38 ffff88005ad0a508 0000000000001000 0000000000000000 8000000020451430 ffffea00014ff468 ffff880062d7ba48 ffff88005ad0a460 ffff880062d7bad8 ffffea00014ff468 ffff88006fd87c40 0000000000000000 Call Trace: [] logfs_write_i0+0x12e/0x190 [logfs] [] __logfs_write_rec+0x140/0x220 [logfs] [] __logfs_write_rec+0xf2/0x220 [logfs] [] logfs_write_rec+0x64/0xd0 [logfs] [] __logfs_write_buf+0x106/0x110 [logfs] [] logfs_write_buf+0x4e/0x80 [logfs] [] __logfs_write_inode+0x98/0x110 [logfs] [] logfs_truncate+0x54/0x290 [logfs] [] logfs_evict_inode+0xdc/0x190 [logfs] [] evict+0x85/0x170 [] iput+0xe6/0x1b0 [] shrink_dcache_for_umount_subtree+0x218/0x280 [] shrink_dcache_for_umount+0x51/0x90 [] generic_shutdown_super+0x2c/0x100 [] logfs_kill_sb+0x57/0xf0 [logfs] [] deactivate_locked_super+0x45/0x70 [] deactivate_super+0x4a/0x70 [] mntput_no_expire+0xa4/0xf0 [] sys_umount+0x6f/0x380 [] system_call_fastpath+0x16/0x1b Code: 55 c8 49 8d b6 a8 00 00 00 45 89 f9 45 89 e8 4c 89 e1 4c 89 55 b8 c7 04 24 00 00 00 00 e8 68 fc ff ff 4c 8b 55 b8 e9 3c ff ff ff <0f> 0b 0f 0b c7 45 c0 00 00 00 00 e9 44 fe ff ff 66 66 66 66 66 RIP [] logfs_segment_write+0x211/0x230 [logfs] RSP ---[ end trace fe6b040cea952290 ]--- Therefore, move super->s_flags setting after the fs-indenpendent work has been finished. Reviewed-by: Joern Engel Signed-off-by: Prasad Joshi --- fs/logfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/super.c b/fs/logfs/super.c index e795c234ea3..f9b7a30b00a 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -491,9 +491,9 @@ static void logfs_kill_sb(struct super_block *sb) * From this point on alias entries are simply dropped - and any * writes to the object store are considered bugs. */ - super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; log_super("LogFS: Now in shutdown\n"); generic_shutdown_super(sb); + super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); -- cgit v1.2.3-18-g5258 From 0bd90387ed5a8abbcf43391b480efdc211721cfe Mon Sep 17 00:00:00 2001 From: Prasad Joshi Date: Sun, 2 Oct 2011 23:46:51 +0530 Subject: logfs: Propagate page parameter to __logfs_write_inode During GC LogFS has to rewrite each valid block to a separate segment. Rewrite operation reads data from an old segment and writes it to a newly allocated segment. Since every write operation changes data block pointers maintained in inode, inode should also be rewritten. In GC path to avoid AB-BA deadlock LogFS marks a page with PG_pre_locked in addition to locking the page (PG_locked). The page lock is ignored iff the page is pre-locked. LogFS uses a special file called segment file. The segment file maintains an 8 bytes entry for every segment. It keeps track of erase count, level etc. for every segment. Bad things happen with a segment belonging to the segment file is GCed ------------[ cut here ]------------ kernel BUG at /home/prasad/logfs/readwrite.c:297! invalid opcode: 0000 [#1] SMP Modules linked in: logfs joydev usbhid hid psmouse e1000 i2c_piix4 serio_raw [last unloaded: logfs] Pid: 20161, comm: mount Not tainted 3.1.0-rc3+ #3 innotek GmbH VirtualBox EIP: 0060:[] EFLAGS: 00010292 CPU: 0 EIP is at logfs_lock_write_page+0x6a/0x70 [logfs] EAX: 00000027 EBX: f73f5b20 ECX: c16007c8 EDX: 00000094 ESI: 00000000 EDI: e59be6e4 EBP: c7337b28 ESP: c7337b18 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Process mount (pid: 20161, ti=c7336000 task=eb323f70 task.ti=c7336000) Stack: f8099a3d c7337b24 f73f5b20 00001002 c7337b50 f8091f6d f8099a4d f80994e4 00000003 00000000 c7337b68 00000000 c67e4400 00001000 c7337b80 f80935e5 00000000 00000000 00000000 00000000 e1fcf000 0000000f e59be618 c70bf900 Call Trace: [] logfs_get_write_page.clone.16+0xdd/0x100 [logfs] [] logfs_mod_segment_entry+0x55/0x110 [logfs] [] logfs_get_segment_entry+0x1d/0x20 [logfs] [] ? logfs_cleanup_journal+0x50/0x50 [logfs] [] ostore_get_erase_count+0x1b/0x40 [logfs] [] logfs_open_area+0xc8/0x150 [logfs] [] ? kmemleak_alloc+0x2c/0x60 [] __logfs_segment_write.clone.16+0x4e/0x1b0 [logfs] [] ? mempool_kmalloc+0x13/0x20 [] ? mempool_kmalloc+0x13/0x20 [] logfs_segment_write+0x17f/0x1d0 [logfs] [] logfs_write_i0+0x11c/0x180 [logfs] [] logfs_write_direct+0x45/0x90 [logfs] [] __logfs_write_buf+0xbd/0xf0 [logfs] [] ? kmap_atomic_prot+0x4e/0xe0 [] logfs_write_buf+0x3b/0x60 [logfs] [] __logfs_write_inode+0xa9/0x110 [logfs] [] logfs_rewrite_block+0xc0/0x110 [logfs] [] ? get_mapping_page+0x10/0x60 [logfs] [] ? logfs_load_object_aliases+0x2e0/0x2f0 [logfs] [] logfs_gc_segment+0x2ad/0x310 [logfs] [] __logfs_gc_once+0x4a/0x80 [logfs] [] logfs_gc_pass+0x683/0x6a0 [logfs] [] logfs_mount+0x5a9/0x680 [logfs] [] mount_fs+0x21/0xd0 [] ? __alloc_percpu+0xf/0x20 [] ? alloc_vfsmnt+0xb1/0x130 [] vfs_kern_mount+0x4b/0xa0 [] do_kern_mount+0x3e/0xe0 [] do_mount+0x34d/0x670 [] ? strndup_user+0x49/0x70 [] sys_mount+0x6b/0xa0 [] syscall_call+0x7/0xb Code: f8 e8 8b 93 39 c9 8b 45 f8 3e 0f ba 28 00 19 d2 85 d2 74 ca eb d0 0f 0b 8d 45 fc 89 44 24 04 c7 04 24 3d 9a 09 f8 e8 09 92 39 c9 <0f> 0b 8d 74 26 00 55 89 e5 3e 8d 74 26 00 8b 10 80 e6 01 74 09 EIP: [] logfs_lock_write_page+0x6a/0x70 [logfs] SS:ESP 0068:c7337b18 ---[ end trace 96e67d5b3aa3d6ca ]--- The patch passes locked page to __logfs_write_inode. It calls function logfs_get_wblocks() to pre-lock the page. This ensures any further attempts to lock the page are ignored (esp from get_erase_count). Acked-by: Joern Engel Signed-off-by: Prasad Joshi --- fs/logfs/dir.c | 2 +- fs/logfs/inode.c | 2 +- fs/logfs/logfs.h | 2 +- fs/logfs/readwrite.c | 12 ++++++------ 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index b7d7f67cee5..b6404898da8 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -71,7 +71,7 @@ static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, static int write_inode(struct inode *inode) { - return __logfs_write_inode(inode, WF_LOCK); + return __logfs_write_inode(inode, NULL, WF_LOCK); } static s64 dir_seek_data(struct inode *inode, s64 pos) diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 388d7c5a7be..7c42c132c17 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -287,7 +287,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) return 0; - ret = __logfs_write_inode(inode, flags); + ret = __logfs_write_inode(inode, NULL, flags); LOGFS_BUG_ON(ret, inode->i_sb); return ret; } diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index bb4340850c1..0dec29887a8 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -528,7 +528,7 @@ void logfs_destroy_inode_cache(void); void logfs_set_blocks(struct inode *inode, u64 no); /* these logically belong into inode.c but actually reside in readwrite.c */ int logfs_read_inode(struct inode *inode); -int __logfs_write_inode(struct inode *inode, long flags); +int __logfs_write_inode(struct inode *inode, struct page *, long flags); void logfs_evict_inode(struct inode *inode); /* journal.c */ diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 7b10e8aecce..88284c67ba9 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -422,7 +422,7 @@ static void inode_write_block(struct logfs_block *block) if (inode->i_ino == LOGFS_INO_MASTER) logfs_write_anchor(inode->i_sb); else { - ret = __logfs_write_inode(inode, 0); + ret = __logfs_write_inode(inode, NULL, 0); /* see indirect_write_block comment */ BUG_ON(ret); } @@ -1629,7 +1629,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, if (inode->i_ino == LOGFS_INO_MASTER) logfs_write_anchor(inode->i_sb); else { - err = __logfs_write_inode(inode, flags); + err = __logfs_write_inode(inode, page, flags); } } } @@ -1879,7 +1879,7 @@ int logfs_truncate(struct inode *inode, u64 target) logfs_get_wblocks(sb, NULL, 1); err = __logfs_truncate(inode, size); if (!err) - err = __logfs_write_inode(inode, 0); + err = __logfs_write_inode(inode, NULL, 0); logfs_put_wblocks(sb, NULL, 1); } @@ -2119,14 +2119,14 @@ void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) ec_level); } -int __logfs_write_inode(struct inode *inode, long flags) +int __logfs_write_inode(struct inode *inode, struct page *page, long flags) { struct super_block *sb = inode->i_sb; int ret; - logfs_get_wblocks(sb, NULL, flags & WF_LOCK); + logfs_get_wblocks(sb, page, flags & WF_LOCK); ret = do_write_inode(inode); - logfs_put_wblocks(sb, NULL, flags & WF_LOCK); + logfs_put_wblocks(sb, page, flags & WF_LOCK); return ret; } -- cgit v1.2.3-18-g5258 From 6c69494f6b442834f26377e02d43fc8e1272221d Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Mon, 12 Sep 2011 21:09:16 +0530 Subject: logfs: remove useless BUG_ON It prevents write sizes >4k. Signed-off-by: Joern Engel --- fs/logfs/journal.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 9da29706f91..1e1c369df22 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -612,7 +612,6 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, if (len == 0) return logfs_write_header(super, header, 0, type); - BUG_ON(len > sb->s_blocksize); compr_len = logfs_compress(buf, data, len, sb->s_blocksize); if (compr_len < 0 || type == JE_ANCHOR) { memcpy(data, buf, len); -- cgit v1.2.3-18-g5258 From 1bcceaff8cbe5e5698ccf1015c9a938aa72718c4 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 5 Aug 2011 11:18:19 +0200 Subject: logfs: Free areas before calling generic_shutdown_super() Or hit an assertion in map_invalidatepage() instead. Signed-off-by: Joern Engel --- fs/logfs/logfs.h | 1 + fs/logfs/segment.c | 14 ++++++++++---- fs/logfs/super.c | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 0dec29887a8..59ed32cd62d 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -596,6 +596,7 @@ int logfs_init_mapping(struct super_block *sb); void logfs_sync_area(struct logfs_area *area); void logfs_sync_segments(struct super_block *sb); void freeseg(struct super_block *sb, u32 segno); +void free_areas(struct super_block *sb); /* area handling */ int logfs_init_areas(struct super_block *sb); diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 6aee6092860..ab798ed1cc8 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -862,6 +862,16 @@ static void free_area(struct logfs_area *area) kfree(area); } +void free_areas(struct super_block *sb) +{ + struct logfs_super *super = logfs_super(sb); + int i; + + for_each_area(i) + free_area(super->s_area[i]); + free_area(super->s_journal_area); +} + static struct logfs_area *alloc_area(struct super_block *sb) { struct logfs_area *area; @@ -944,10 +954,6 @@ err: void logfs_cleanup_areas(struct super_block *sb) { struct logfs_super *super = logfs_super(sb); - int i; btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); - for_each_area(i) - free_area(super->s_area[i]); - free_area(super->s_journal_area); } diff --git a/fs/logfs/super.c b/fs/logfs/super.c index f9b7a30b00a..c9ee7f5d1ca 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -486,6 +486,7 @@ static void logfs_kill_sb(struct super_block *sb) /* Alias entries slow down mount, so evict as many as possible */ sync_filesystem(sb); logfs_write_anchor(sb); + free_areas(sb); /* * From this point on alias entries are simply dropped - and any -- cgit v1.2.3-18-g5258 From bbe01387129f76fa4bec17904eb14c4bdc3c179f Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 5 Aug 2011 11:13:30 +0200 Subject: logfs: Grow inode in delete path Can be necessary if an inode gets deleted (through -ENOSPC) before being written. Might be better to move this into logfs_write_rec(), but for now go with the stupid&safe patch. Signed-off-by: Joern Engel --- fs/logfs/readwrite.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 88284c67ba9..4153e65b014 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1576,11 +1576,15 @@ int logfs_write_buf(struct inode *inode, struct page *page, long flags) static int __logfs_delete(struct inode *inode, struct page *page) { long flags = WF_DELETE; + int err; inode->i_ctime = inode->i_mtime = CURRENT_TIME; if (page->index < I0_BLOCKS) return logfs_write_direct(inode, page, flags); + err = grow_inode(inode, page->index, 0); + if (err) + return err; return logfs_write_rec(inode, page, page->index, 0, flags); } -- cgit v1.2.3-18-g5258 From f2933e86ad93a8d1287079d59e67afd6f4166a9d Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 5 Aug 2011 11:09:55 +0200 Subject: Logfs: Allow NULL block_isbad() methods Not all mtd drivers define block_isbad(). Let's assume no bad blocks instead of refusing to mount. Signed-off-by: Joern Engel --- fs/logfs/dev_mtd.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 339e17e9133..d054d7e975c 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -150,14 +150,13 @@ static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) filler_t *filler = mtd_readpage; struct mtd_info *mtd = super->s_mtd; - if (!mtd->block_isbad) - return NULL; - *ofs = 0; - while (mtd->block_isbad(mtd, *ofs)) { - *ofs += mtd->erasesize; - if (*ofs >= mtd->size) - return NULL; + if (mtd->block_isbad) { + while (mtd->block_isbad(mtd, *ofs)) { + *ofs += mtd->erasesize; + if (*ofs >= mtd->size) + return NULL; + } } BUG_ON(*ofs & ~PAGE_MASK); return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); @@ -170,14 +169,13 @@ static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) filler_t *filler = mtd_readpage; struct mtd_info *mtd = super->s_mtd; - if (!mtd->block_isbad) - return NULL; - *ofs = mtd->size - mtd->erasesize; - while (mtd->block_isbad(mtd, *ofs)) { - *ofs -= mtd->erasesize; - if (*ofs <= 0) - return NULL; + if (mtd->block_isbad) { + while (mtd->block_isbad(mtd, *ofs)) { + *ofs -= mtd->erasesize; + if (*ofs <= 0) + return NULL; + } } *ofs = *ofs + mtd->erasesize - 0x1000; BUG_ON(*ofs & ~PAGE_MASK); -- cgit v1.2.3-18-g5258 From 4991a5faab7368daac463181e786608b4eb63675 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 31 Jan 2012 11:52:01 +0300 Subject: cifs: check offset in decode_ntlmssp_challenge() We should check that we're not copying memory from beyond the end of the blob. Signed-off-by: Dan Carpenter Reviewed-by: Jeff Layton --- fs/cifs/sess.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d85efad5765..eb767412177 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -395,6 +395,10 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset); tilen = le16_to_cpu(pblob->TargetInfoArray.Length); + if (tioffset > blob_len || tioffset + tilen > blob_len) { + cERROR(1, "tioffset + tilen too high %u + %u", tioffset, tilen); + return -EINVAL; + } if (tilen) { ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); if (!ses->auth_key.response) { -- cgit v1.2.3-18-g5258 From 000f9bb83968ebd6959ff76870f16fc8f766ebd3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 30 Jan 2012 19:50:01 -0800 Subject: cifs: fix printk format warnings Fix printk format warnings for ssize_t variables: fs/cifs/connect.c:2145:3: warning: format '%ld' expects type 'long int', but argument 3 has type 'ssize_t' fs/cifs/connect.c:2152:3: warning: format '%ld' expects type 'long int', but argument 3 has type 'ssize_t' fs/cifs/connect.c:2160:3: warning: format '%ld' expects type 'long int', but argument 3 has type 'ssize_t' fs/cifs/connect.c:2170:3: warning: format '%ld' expects type 'long int', but argument 3 has type 'ssize_t' Signed-off-by: Randy Dunlap Acked-by: Jeff Layton Cc: linux-cifs@vger.kernel.org --- fs/cifs/connect.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 026d6464335..9c288653e6d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2142,14 +2142,14 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) len = delim - payload; if (len > MAX_USERNAME_SIZE || len <= 0) { - cFYI(1, "Bad value from username search (len=%ld)", len); + cFYI(1, "Bad value from username search (len=%zd)", len); rc = -EINVAL; goto out_key_put; } vol->username = kstrndup(payload, len, GFP_KERNEL); if (!vol->username) { - cFYI(1, "Unable to allocate %ld bytes for username", len); + cFYI(1, "Unable to allocate %zd bytes for username", len); rc = -ENOMEM; goto out_key_put; } @@ -2157,7 +2157,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) len = key->datalen - (len + 1); if (len > MAX_PASSWORD_SIZE || len <= 0) { - cFYI(1, "Bad len for password search (len=%ld)", len); + cFYI(1, "Bad len for password search (len=%zd)", len); rc = -EINVAL; kfree(vol->username); vol->username = NULL; @@ -2167,7 +2167,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) ++delim; vol->password = kstrndup(delim, len, GFP_KERNEL); if (!vol->password) { - cFYI(1, "Unable to allocate %ld bytes for password", len); + cFYI(1, "Unable to allocate %zd bytes for password", len); rc = -ENOMEM; kfree(vol->username); vol->username = NULL; -- cgit v1.2.3-18-g5258 From 4505360376637832f79f84f352588b0a045ad113 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Fri, 27 Jan 2012 06:37:26 +0000 Subject: xfs: pass KM_SLEEP flag to kmem_realloc() in xlog_recover_add_to_cnt_trans() The kmem_realloc() in xfs is given KM_* memory allocation flags. And it allocates memory using kmalloc() after they are converted to gfp_mask flags. In xlog_recover_add_to_cont_trans(), 0u is passed to kmem_realloc(), instead of them. I guess it is preferred to use them, and here memory must be allocated but don't have to be done with GFP_ATOMIC. So, this patch changes it to KM_SLEEP. Signed-off-by: Mitsuo Hayasaka Cc: Ben Myers Cc: Alex Elder Cc: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_log_recover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 541a508adea..15ff5392fb6 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1489,7 +1489,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); + ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; -- cgit v1.2.3-18-g5258 From 2a73ca8208197d03f78d680b3c7953b897e91eb6 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 31 Jan 2012 12:51:24 -0600 Subject: [CIFS] Update cifs Kconfig title to match removal of experimental dependency Removed the dependency on CONFIG_EXPERIMENTAL but forgot to update the text description to be consistent. Signed-off-by: Steve French --- fs/cifs/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 0554b00a7b3..2b243af70aa 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -139,7 +139,7 @@ config CIFS_DFS_UPCALL points. If unsure, say N. config CIFS_FSCACHE - bool "Provide CIFS client caching support (EXPERIMENTAL)" + bool "Provide CIFS client caching support" depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y help Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data @@ -147,7 +147,7 @@ config CIFS_FSCACHE manager. If unsure, say N. config CIFS_ACL - bool "Provide CIFS ACL support (EXPERIMENTAL)" + bool "Provide CIFS ACL support" depends on CIFS_XATTR && KEYS help Allows to fetch CIFS/NTFS ACL from the server. The DACL blob -- cgit v1.2.3-18-g5258 From d98456fcafa6f3fd1985f9b7429aaa3531c6bfa0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 31 Jan 2012 20:27:41 -0500 Subject: Btrfs: don't reserve data with extents locked in btrfs_fallocate btrfs_fallocate tries to allocate space only if ranges in the file don't already exist. But the enospc checks it does are not allowed with extents locked. Signed-off-by: Chris Mason --- fs/btrfs/file.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0f61e11a299..0621a3a7d5d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1603,6 +1603,14 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & ~FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; + /* + * Make sure we have enough space before we do the + * allocation. + */ + ret = btrfs_check_data_free_space(inode, len); + if (ret) + return ret; + /* * wait for ordered IO before we have any locks. We'll loop again * below with the locks held. @@ -1666,27 +1674,12 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - - /* - * Make sure we have enough space before we do the - * allocation. - */ - ret = btrfs_check_data_free_space(inode, last_byte - - cur_offset); - if (ret) { - free_extent_map(em); - break; - } - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, last_byte - cur_offset, 1 << inode->i_blkbits, offset + len, &alloc_hint); - /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, last_byte - - cur_offset); if (ret < 0) { free_extent_map(em); break; @@ -1714,6 +1707,8 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + /* Let go of our reservation. */ + btrfs_free_reserved_data_space(inode, len); return ret; } -- cgit v1.2.3-18-g5258 From 15eb77a07c714ac80201abd0a9568888bcee6276 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 17 Jan 2012 11:18:56 -0600 Subject: writeback: fix NULL bdi->dev in trace writeback_single_inode bdi_prune_sb() resets sb->s_bdi to default_backing_dev_info when the tearing down the original bdi. Fix trace_writeback_single_inode to use sb->s_bdi=default_backing_dev_info rather than bdi->dev=NULL for a teared down bdi. Cc: Reported-by: Rabin Vincent Tested-by: Rabin Vincent Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f855916657b..5b4a9362d5a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -52,14 +52,6 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; -/* - * Include the creation of the trace points after defining the - * wb_writeback_work structure so that the definition remains local to this - * file. - */ -#define CREATE_TRACE_POINTS -#include - /* * We don't actually have pdflush, but this one is exported though /proc... */ @@ -92,6 +84,14 @@ static inline struct inode *wb_inode(struct list_head *head) return list_entry(head, struct inode, i_wb_list); } +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure and inline functions so that the definition + * remains local to this file. + */ +#define CREATE_TRACE_POINTS +#include + /* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ static void bdi_wakeup_flusher(struct backing_dev_info *bdi) { -- cgit v1.2.3-18-g5258 From 7d731019218e49a9811f6d0adec4b1cfcb752bed Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 1 Feb 2012 11:10:24 -0800 Subject: mtd: fix merge conflict resolution breakage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes merge conflict resolution breakage introduced by merge d3712b9dfcf4 ("Merge tag 'for-linus' of git://github.com/prasad-joshi/logfs_upstream"). The commit changed 'mtd_can_have_bb()' function and made it always return zero, which is incorrect. Instead, we need it to return whether the underlying flash device can have bad eraseblocks or not. UBI needs this information because it affects how it handles the underlying flash. E.g., if the underlying flash is NOR, it cannot have bad blocks and any write or erase error is fatal, and all we can do is to switch to R/O mode. We do not need to reserve a pool of good eraseblocks for bad eraseblocks handling, and so on. This patch also removes 'mtd_can_have_bb()' invocations from Logfs to ensure correct Logfs behavior. I've tested that with this patch UBI works on top of NOR and NAND flashes emulated by mtdram and nandsim correspondingly. This patch is based on patch from Linus Torvalds. Signed-off-by: Artem Bityutskiy Acked-by: Jörn Engel Acked-by: Prasad Joshi Acked-by: Brian Norris Signed-off-by: Linus Torvalds --- fs/logfs/dev_mtd.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index e97404d611e..9c501449450 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -152,9 +152,6 @@ static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs) filler_t *filler = logfs_mtd_readpage; struct mtd_info *mtd = super->s_mtd; - if (!mtd_can_have_bb(mtd)) - return NULL; - *ofs = 0; while (mtd_block_isbad(mtd, *ofs)) { *ofs += mtd->erasesize; @@ -172,9 +169,6 @@ static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs) filler_t *filler = logfs_mtd_readpage; struct mtd_info *mtd = super->s_mtd; - if (!mtd_can_have_bb(mtd)) - return NULL; - *ofs = mtd->size - mtd->erasesize; while (mtd_block_isbad(mtd, *ofs)) { *ofs -= mtd->erasesize; -- cgit v1.2.3-18-g5258 From 71879d3cb3dd8f2dfdefb252775c1b3ea04a3dd4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Jan 2012 17:14:38 +0100 Subject: proc: mem_release() should check mm != NULL mem_release() can hit mm == NULL, add the necessary check. Cc: stable@kernel.org Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 9cde9edf9c4..c3617ea7830 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -822,8 +822,8 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig) static int mem_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; - - mmput(mm); + if (mm) + mmput(mm); return 0; } -- cgit v1.2.3-18-g5258 From 572d34b946bae070debd42db1143034d9687e13f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Jan 2012 17:14:54 +0100 Subject: proc: unify mem_read() and mem_write() No functional changes, cleanup and preparation. mem_read() and mem_write() are very similar. Move this code into the new common helper, mem_rw(), which takes the additional "int write" argument. Cc: stable@kernel.org Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 90 +++++++++++++++++++++------------------------------------- 1 file changed, 32 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index c3617ea7830..be190904168 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -718,57 +718,13 @@ static int mem_open(struct inode* inode, struct file* file) return 0; } -static ssize_t mem_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) +static ssize_t mem_rw(struct file *file, char __user *buf, + size_t count, loff_t *ppos, int write) { - int ret; - char *page; - unsigned long src = *ppos; struct mm_struct *mm = file->private_data; - - if (!mm) - return 0; - - page = (char *)__get_free_page(GFP_TEMPORARY); - if (!page) - return -ENOMEM; - - ret = 0; - - while (count > 0) { - int this_len, retval; - - this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - retval = access_remote_vm(mm, src, page, this_len, 0); - if (!retval) { - if (!ret) - ret = -EIO; - break; - } - - if (copy_to_user(buf, page, retval)) { - ret = -EFAULT; - break; - } - - ret += retval; - src += retval; - buf += retval; - count -= retval; - } - *ppos = src; - - free_page((unsigned long) page); - return ret; -} - -static ssize_t mem_write(struct file * file, const char __user *buf, - size_t count, loff_t *ppos) -{ - int copied; + unsigned long addr = *ppos; + ssize_t copied; char *page; - unsigned long dst = *ppos; - struct mm_struct *mm = file->private_data; if (!mm) return 0; @@ -779,30 +735,48 @@ static ssize_t mem_write(struct file * file, const char __user *buf, copied = 0; while (count > 0) { - int this_len, retval; + int this_len = min_t(int, count, PAGE_SIZE); - this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; - if (copy_from_user(page, buf, this_len)) { + if (write && copy_from_user(page, buf, this_len)) { copied = -EFAULT; break; } - retval = access_remote_vm(mm, dst, page, this_len, 1); - if (!retval) { + + this_len = access_remote_vm(mm, addr, page, this_len, write); + if (!this_len) { if (!copied) copied = -EIO; break; } - copied += retval; - buf += retval; - dst += retval; - count -= retval; + + if (!write && copy_to_user(buf, page, this_len)) { + copied = -EFAULT; + break; + } + + buf += this_len; + addr += this_len; + copied += this_len; + count -= this_len; } - *ppos = dst; + *ppos = addr; free_page((unsigned long) page); return copied; } +static ssize_t mem_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, buf, count, ppos, 0); +} + +static ssize_t mem_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return mem_rw(file, (char __user*)buf, count, ppos, 1); +} + loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { -- cgit v1.2.3-18-g5258 From 6d08f2c7139790c268820a2e590795cb8333181a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 31 Jan 2012 17:15:11 +0100 Subject: proc: make sure mem_open() doesn't pin the target's memory Once /proc/pid/mem is opened, the memory can't be released until mem_release() even if its owner exits. Change mem_open() to do atomic_inc(mm_count) + mmput(), this only pins mm_struct. Change mem_rw() to do atomic_inc_not_zero(mm_count) before access_remote_vm(), this verifies that this mm is still alive. I am not sure what should mem_rw() return if atomic_inc_not_zero() fails. With this patch it returns zero to match the "mm == NULL" case, may be it should return -EINVAL like it did before e268337d. Perhaps it makes sense to add the additional fatal_signal_pending() check into the main loop, to ensure we do not hold this memory if the target task was oom-killed. Cc: stable@kernel.org Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/proc/base.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index be190904168..d9512bd03e6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -711,6 +711,13 @@ static int mem_open(struct inode* inode, struct file* file) if (IS_ERR(mm)) return PTR_ERR(mm); + if (mm) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); + } + /* OK to pass negative loff_t, we can catch out-of-range */ file->f_mode |= FMODE_UNSIGNED_OFFSET; file->private_data = mm; @@ -734,6 +741,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf, return -ENOMEM; copied = 0; + if (!atomic_inc_not_zero(&mm->mm_users)) + goto free; + while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); @@ -761,6 +771,8 @@ static ssize_t mem_rw(struct file *file, char __user *buf, } *ppos = addr; + mmput(mm); +free: free_page((unsigned long) page); return copied; } @@ -797,7 +809,7 @@ static int mem_release(struct inode *inode, struct file *file) { struct mm_struct *mm = file->private_data; if (mm) - mmput(mm); + mmdrop(mm); return 0; } -- cgit v1.2.3-18-g5258 From 114fc47492e23d93653e4a16664833e98d62a563 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 11 Jan 2012 17:41:01 -0800 Subject: ceph: change "ceph.layout" xattr to be "ceph.file.layout" The virtual extended attribute named "ceph.layout" is meaningful only for regular files. Change its name to be "ceph.file.layout" to more directly reflect that in the ceph xattr namespace. Preserve the old "ceph.layout" name for the time being (until we decide it's safe to get rid of it entirely). Add a missing initializer for "readonly" in the terminating entry. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- fs/ceph/xattr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index a5e36e4488a..9e6734e38c1 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -111,8 +111,10 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, } static struct ceph_vxattr_cb ceph_file_vxattrs[] = { + { true, "ceph.file.layout", ceph_vxattrcb_layout}, + /* The following extended attribute name is deprecated */ { true, "ceph.layout", ceph_vxattrcb_layout}, - { NULL, NULL } + { true, NULL, NULL } }; static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode) -- cgit v1.2.3-18-g5258 From 32852a81bccd9e3d1953b894966393d1b546576d Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Sat, 14 Jan 2012 22:20:59 -0500 Subject: ceph: fix length validation in parse_reply_info() "len" is read from network and thus needs validation. Otherwise, given a bogus "len" value, p+len could be an out-of-bounds pointer, which is used in further parsing. Signed-off-by: Xi Wang Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 6203d805eb4..be1415fcaac 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* trace */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { + ceph_decode_need(&p, end, len, bad); err = parse_reply_info_trace(&p, p+len, info, features); if (err < 0) goto out_bad; @@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { + ceph_decode_need(&p, end, len, bad); err = parse_reply_info_extra(&p, p+len, info, features); if (err < 0) goto out_bad; -- cgit v1.2.3-18-g5258 From d8fb02abdc39f92a1066313e2b17047876afa8f9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 12 Jan 2012 17:48:10 -0800 Subject: ceph: create a new session lock to avoid lock inversion Lockdep was reporting a possible circular lock dependency in dentry_lease_is_valid(). That function needs to sample the session's s_cap_gen and and s_cap_ttl fields coherently, but needs to do so while holding a dentry lock. The s_cap_lock field was being used to protect the two fields, but that can't be taken while holding a lock on a dentry within the session. In most cases, the s_cap_gen and s_cap_ttl fields only get operated on separately. But in three cases they need to be updated together. Implement a new lock to protect the spots updating both fields atomically is required. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- fs/ceph/caps.c | 4 ++-- fs/ceph/dir.c | 4 ++-- fs/ceph/mds_client.c | 8 +++++--- fs/ceph/mds_client.h | 7 +++++-- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8b53193e4f7..90d789df9ce 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap) unsigned long ttl; u32 gen; - spin_lock(&cap->session->s_cap_lock); + spin_lock(&cap->session->s_gen_ttl_lock); gen = cap->session->s_cap_gen; ttl = cap->session->s_cap_ttl; - spin_unlock(&cap->session->s_cap_lock); + spin_unlock(&cap->session->s_gen_ttl_lock); if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { dout("__cap_is_valid %p cap %p issued %s " diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 98954003a8d..63c52f33361 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -975,10 +975,10 @@ static int dentry_lease_is_valid(struct dentry *dentry) di = ceph_dentry(dentry); if (di && di->lease_session) { s = di->lease_session; - spin_lock(&s->s_cap_lock); + spin_lock(&s->s_gen_ttl_lock); gen = s->s_cap_gen; ttl = s->s_cap_ttl; - spin_unlock(&s->s_cap_lock); + spin_unlock(&s->s_gen_ttl_lock); if (di->lease_gen == gen && time_before(jiffies, dentry->d_time) && diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index be1415fcaac..a4fdf9397a9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -400,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; s->s_con.peer_name.num = cpu_to_le64(mds); - spin_lock_init(&s->s_cap_lock); + spin_lock_init(&s->s_gen_ttl_lock); s->s_cap_gen = 0; s->s_cap_ttl = 0; + + spin_lock_init(&s->s_cap_lock); s->s_renew_requested = 0; s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); @@ -2328,10 +2330,10 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_STALE: pr_info("mds%d caps went stale, renewing\n", session->s_mds); - spin_lock(&session->s_cap_lock); + spin_lock(&session->s_gen_ttl_lock); session->s_cap_gen++; session->s_cap_ttl = 0; - spin_unlock(&session->s_cap_lock); + spin_unlock(&session->s_gen_ttl_lock); send_renew_caps(mdsc, session); break; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index a50ca0e3947..8c7c04ebb59 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -117,10 +117,13 @@ struct ceph_mds_session { void *s_authorizer_buf, *s_authorizer_reply_buf; size_t s_authorizer_buf_len, s_authorizer_reply_buf_len; - /* protected by s_cap_lock */ - spinlock_t s_cap_lock; + /* protected by s_gen_ttl_lock */ + spinlock_t s_gen_ttl_lock; u32 s_cap_gen; /* inc each time we get mds stale msg */ unsigned long s_cap_ttl; /* when session caps expire */ + + /* protected by s_cap_lock */ + spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ int s_nr_caps, s_trim_caps; int s_num_cap_releases; -- cgit v1.2.3-18-g5258 From 8cdb878dcb359fd1137e9abdee9322f5e9bcfdf8 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Thu, 2 Feb 2012 11:34:09 +1030 Subject: Fix race in process_vm_rw_core This fixes the race in process_vm_core found by Oleg (see http://article.gmane.org/gmane.linux.kernel/1235667/ for details). This has been updated since I last sent it as the creation of the new mm_access() function did almost exactly the same thing as parts of the previous version of this patch did. In order to use mm_access() even when /proc isn't enabled, we move it to kernel/fork.c where other related process mm access functions already are. Signed-off-by: Chris Yeoh Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index d9512bd03e6..d4548dd49b0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -198,26 +198,6 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) -{ - struct mm_struct *mm; - int err; - - err = mutex_lock_killable(&task->signal->cred_guard_mutex); - if (err) - return ERR_PTR(err); - - mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { - mmput(mm); - mm = ERR_PTR(-EACCES); - } - mutex_unlock(&task->signal->cred_guard_mutex); - - return mm; -} - struct mm_struct *mm_for_maps(struct task_struct *task) { return mm_access(task, PTRACE_MODE_READ); -- cgit v1.2.3-18-g5258 From de47a4176c532ef5961b8a46a2d541a3517412d3 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 2 Feb 2012 15:28:28 -0600 Subject: cifs: Fix oops in session setup code for null user mounts For null user mounts, do not invoke string length function during session setup. Cc: Acked-by: Jeff Layton Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/sess.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index eb767412177..551d0c2b973 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -246,16 +246,15 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, /* copy user */ /* BB what about null user mounts - check that we do this BB */ /* copy user */ - if (ses->user_name != NULL) + if (ses->user_name != NULL) { strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE); + bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); + } /* else null user mount */ - - bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE); *bcc_ptr = 0; bcc_ptr++; /* account for null termination */ /* copy domain */ - if (ses->domainName != NULL) { strncpy(bcc_ptr, ses->domainName, 256); bcc_ptr += strnlen(ses->domainName, 256); -- cgit v1.2.3-18-g5258 From 331818f1c468a24e581aedcbe52af799366a9dfe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 3 Feb 2012 18:30:53 -0500 Subject: NFSv4: Fix an Oops in the NFSv4 getacl code Commit bf118a342f10dafe44b14451a1392c3254629a1f (NFSv4: include bitmap in nfsv4 get acl data) introduces the 'acl_scratch' page for the case where we may need to decode multi-page data. However it fails to take into account the fact that the variable may be NULL (for the case where we're not doing multi-page decode), and it also attaches it to the encoding xdr_stream rather than the decoding one. The immediate result is an Oops in nfs4_xdr_enc_getacl due to the call to page_address() with a NULL page pointer. Signed-off-by: Trond Myklebust Cc: Andy Adamson Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 8 ++++---- fs/nfs/nfs4xdr.c | 5 ++++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f0c849c98fe..d202e04aca9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3575,8 +3575,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu } if (npages > 1) { /* for decoding across pages */ - args.acl_scratch = alloc_page(GFP_KERNEL); - if (!args.acl_scratch) + res.acl_scratch = alloc_page(GFP_KERNEL); + if (!res.acl_scratch) goto out_free; } args.acl_len = npages * PAGE_SIZE; @@ -3612,8 +3612,8 @@ out_free: for (i = 0; i < npages; i++) if (pages[i]) __free_page(pages[i]); - if (args.acl_scratch) - __free_page(args.acl_scratch); + if (res.acl_scratch) + __free_page(res.acl_scratch); return ret; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 95e92e43840..33bd8d0f745 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2522,7 +2522,6 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, xdr_inline_pages(&req->rq_rcv_buf, replen << 2, args->acl_pages, args->acl_pgbase, args->acl_len); - xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE); encode_nops(&hdr); } @@ -6032,6 +6031,10 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct compound_hdr hdr; int status; + if (res->acl_scratch != NULL) { + void *p = page_address(res->acl_scratch); + xdr_set_scratch_buffer(xdr, p, PAGE_SIZE); + } status = decode_compound_hdr(xdr, &hdr); if (status) goto out; -- cgit v1.2.3-18-g5258 From 96e02d1586782eadf051fa3d6bc4132d2447ac2c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 4 Feb 2012 10:47:10 +0100 Subject: exec: fix use-after-free bug in setup_new_exec() Setting the task name is done within setup_new_exec() by accessing bprm->filename. However this happens after flush_old_exec(). This may result in a use after free bug, flush_old_exec() may "complete" vfork_done, which will wake up the parent which in turn may free the passed in filename. To fix this add a new tcomm field in struct linux_binprm which contains the now early generated task name until it is used. Fixes this bug on s390: Unable to handle kernel pointer dereference at virtual kernel address 0000000039768000 Process kworker/u:3 (pid: 245, task: 000000003a3dc840, ksp: 0000000039453818) Krnl PSW : 0704000180000000 0000000000282e94 (setup_new_exec+0xa0/0x374) Call Trace: ([<0000000000282e2c>] setup_new_exec+0x38/0x374) [<00000000002dd12e>] load_elf_binary+0x402/0x1bf4 [<0000000000280a42>] search_binary_handler+0x38e/0x5bc [<0000000000282b6c>] do_execve_common+0x410/0x514 [<0000000000282cb6>] do_execve+0x46/0x58 [<00000000005bce58>] kernel_execve+0x28/0x70 [<000000000014ba2e>] ____call_usermodehelper+0x102/0x140 [<00000000005bc8da>] kernel_thread_starter+0x6/0xc [<00000000005bc8d4>] kernel_thread_starter+0x0/0xc Last Breaking-Event-Address: [<00000000002830f0>] setup_new_exec+0x2fc/0x374 Kernel panic - not syncing: Fatal exception: panic_on_oops Reported-by: Sebastian Ott Signed-off-by: Heiko Carstens Signed-off-by: Linus Torvalds --- fs/exec.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index aeb135c7ff5..92ce83a11e9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1071,6 +1071,21 @@ void set_task_comm(struct task_struct *tsk, char *buf) perf_event_comm(tsk); } +static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len) +{ + int i, ch; + + /* Copies the binary name from after last slash */ + for (i = 0; (ch = *(fn++)) != '\0';) { + if (ch == '/') + i = 0; /* overwrite what we wrote */ + else + if (i < len - 1) + tcomm[i++] = ch; + } + tcomm[i] = '\0'; +} + int flush_old_exec(struct linux_binprm * bprm) { int retval; @@ -1085,6 +1100,7 @@ int flush_old_exec(struct linux_binprm * bprm) set_mm_exe_file(bprm->mm, bprm->file); + filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm)); /* * Release all of the old mmap stuff */ @@ -1116,10 +1132,6 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { - int i, ch; - const char *name; - char tcomm[sizeof(current->comm)]; - arch_pick_mmap_layout(current->mm); /* This is the point of no return */ @@ -1130,18 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); - name = bprm->filename; - - /* Copies the binary name from after last slash */ - for (i=0; (ch = *(name++)) != '\0';) { - if (ch == '/') - i = 0; /* overwrite what we wrote */ - else - if (i < (sizeof(tcomm) - 1)) - tcomm[i++] = ch; - } - tcomm[i] = '\0'; - set_task_comm(current, tcomm); + set_task_comm(current, bprm->tcomm); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on -- cgit v1.2.3-18-g5258 From 11a3122f6cf2d988a77eb8883d0fc49cd013a6d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Feb 2012 07:51:30 +0100 Subject: block: strip out locking optimization in put_io_context() put_io_context() performed a complex trylock dancing to avoid deferring ioc release to workqueue. It was also broken on UP because trylock was always assumed to succeed which resulted in unbalanced preemption count. While there are ways to fix the UP breakage, even the most pathological microbench (forced ioc allocation and tight fork/exit loop) fails to show any appreciable performance benefit of the optimization. Strip it out. If there turns out to be workloads which are affected by this change, simpler optimization from the discussion thread can be applied later. Signed-off-by: Tejun Heo LKML-Reference: <1328514611.21268.66.camel@sli10-conroe> Signed-off-by: Jens Axboe --- fs/ioprio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ioprio.c b/fs/ioprio.c index f84b380d65e..0f1b9515213 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_ioprio_changed(ioc, ioprio); - put_io_context(ioc, NULL); + put_io_context(ioc); } return err; -- cgit v1.2.3-18-g5258 From 4edc53c1f8cdd99d349165d6c61c45aa4e8e2564 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 Feb 2012 06:30:51 -0500 Subject: cifs: fix error handling when cifscreds key payload is an error Reported-by: Dan Carpenter Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9c288653e6d..940189bd649 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2125,7 +2125,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses) down_read(&key->sem); upayload = key->payload.data; if (IS_ERR_OR_NULL(upayload)) { - rc = PTR_ERR(key); + rc = upayload ? PTR_ERR(upayload) : -EINVAL; goto out_key_put; } -- cgit v1.2.3-18-g5258 From 8b0192a5f478da1c1ae906bf3ffff53f26204f56 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 Feb 2012 06:30:52 -0500 Subject: cifs: request oplock when doing open on lookup Currently, it's always set to 0 (no oplock requested). Cc: Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index df8fecb5b99..63a196b97d5 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -492,7 +492,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, { int xid; int rc = 0; /* to get around spurious gcc warning, set to zero here */ - __u32 oplock = 0; + __u32 oplock = enable_oplocks ? REQ_OPLOCK : 0; __u16 fileHandle = 0; bool posix_open = false; struct cifs_sb_info *cifs_sb; -- cgit v1.2.3-18-g5258 From ff4fa4a25a33f92b5653bb43add0c63bea98d464 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 Feb 2012 06:31:05 -0500 Subject: cifs: don't return error from standard_receive3 after marking response malformed standard_receive3 will check the validity of the response from the server (via checkSMB). It'll pass the result of that check to handle_mid which will dequeue it and mark it with a status of MID_RESPONSE_MALFORMED if checkSMB returned an error. At that point, standard_receive3 will also return an error, which will make the demultiplex thread skip doing the callback for the mid. This is wrong -- if we were able to identify the request and the response is marked malformed, then we want the demultiplex thread to do the callback. Fix this by making standard_receive3 return 0 in this situation. Cc: stable@vger.kernel.org Reported-and-Tested-by: Mark Moseley Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 940189bd649..602f77c304c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -773,10 +773,11 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); - if (mid) - handle_mid(mid, server, smb_buffer, length); + if (!mid) + return length; - return length; + handle_mid(mid, server, smb_buffer, length); + return 0; } static int -- cgit v1.2.3-18-g5258 From 5abebfdd02450fa1349daacf242e70b3736581e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 8 Feb 2012 22:07:18 +0100 Subject: bio: don't overflow in bio_get_nr_vecs() There were two places bio_get_nr_vecs() could overflow: First, it did a left shift to convert from sectors to bytes immediately before dividing by PAGE_SIZE. If PAGE_SIZE ever was less than 512 a great many things would break, so dividing by PAGE_SIZE >> 9 is safe and will generate smaller code too. The nastier overflow was in the DIV_ROUND_UP() (that's what the code was effectively doing, anyways). If n + d overflowed, the whole thing would return 0 which breaks things rather effectively. bio_get_nr_vecs() doesn't claim to give an exact value anyways, so the DIV_ROUND_UP() is silly; we could do a straight divide except if a device's queue_max_sectors was less than PAGE_SIZE we'd return 0. So we just add 1; this should always be safe - things will break badly if bio_get_nr_vecs() returns > BIO_MAX_PAGES (bio_alloc() will suddenly start failing) but it's queue_max_segments that must guard against this, if queue_max_sectors is preventing this from happen things are going to explode on architectures with different PAGE_SIZE. Signed-off-by: Kent Overstreet Cc: Tejun Heo Acked-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index b1fe82cf88c..b980ecde026 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -505,13 +505,9 @@ EXPORT_SYMBOL(bio_clone); int bio_get_nr_vecs(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - int nr_pages; - - nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > queue_max_segments(q)) - nr_pages = queue_max_segments(q); - - return nr_pages; + return min_t(unsigned, + queue_max_segments(q), + queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); } EXPORT_SYMBOL(bio_get_nr_vecs); -- cgit v1.2.3-18-g5258 From 1ecd3c7ea76488c63b4b0a2561fd7eaf96cc8028 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Wed, 8 Feb 2012 17:13:37 -0800 Subject: nilfs2: avoid overflowing segment numbers in nilfs_ioctl_clean_segments() nsegs is read from userspace. Limit its value and avoid overflowing nsegs * sizeof(__u64) in the subsequent call to memdup_user(). This patch complements 481fe17e973fb9 ("nilfs2: potential integer overflow in nilfs_ioctl_clean_segments()"). Signed-off-by: Xi Wang Cc: Haogang Chen Acked-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 886649627c3..2a70fce70c6 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -603,6 +603,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, nsegs = argv[4].v_nmembs; if (argv[4].v_size != argsz[4]) goto out; + if (nsegs > UINT_MAX / sizeof(__u64)) + goto out; /* * argv[4] points to segment numbers this ioctl cleans. We -- cgit v1.2.3-18-g5258 From b9f9a03150969e4bd9967c20bce67c4de769058f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 9 Feb 2012 15:31:36 -0500 Subject: NFSv4: Ensure we throw out bad delegation stateids on NFS4ERR_BAD_STATEID To ensure that we don't just reuse the bad delegation when we attempt to recover the nfs4_state that received the bad stateid error. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4state.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a53f33b4ac3..45392032e7b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1132,6 +1132,8 @@ void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4 { struct nfs_client *clp = server->nfs_client; + if (test_and_clear_bit(NFS_DELEGATED_STATE, &state->flags)) + nfs_async_inode_return_delegation(state->inode, &state->stateid); nfs4_state_mark_reclaim_nograce(clp, state); nfs4_schedule_state_manager(clp); } -- cgit v1.2.3-18-g5258 From 04da0c8196ac0b12fb6b84f4b7a51ad2fa56d869 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Feb 2012 13:57:20 +0000 Subject: xfs: use a normal shrinker for the dquot freelist Stop reusing dquots from the freelist when allocating new ones directly, and implement a shrinker that actually follows the specifications for the interface. The shrinker implementation is still highly suboptimal at this point, but we can gradually work on it. This also fixes an bug in the previous lock ordering, where we would take the hash and dqlist locks inside of the freelist lock against the normal lock ordering. This is only solvable by introducing the dispose list, and thus not when using direct reclaim of unused dquots for new allocations. As a side-effect the quota upper bound and used to free ratio values in /proc/fs/xfs/xqm are set to 0 as these values don't make any sense in the new world order. Signed-off-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/kmem.h | 6 -- fs/xfs/xfs_dquot.c | 103 +++++------------- fs/xfs/xfs_qm.c | 291 +++++++++++++++++++------------------------------- fs/xfs/xfs_qm.h | 14 --- fs/xfs/xfs_qm_stats.c | 4 +- fs/xfs/xfs_trace.h | 5 +- 6 files changed, 141 insertions(+), 282 deletions(-) (limited to 'fs') diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 292eff19803..ab7c53fe346 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -110,10 +110,4 @@ kmem_zone_destroy(kmem_zone_t *zone) extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); -static inline int -kmem_shake_allow(gfp_t gfp_mask) -{ - return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)); -} - #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index b4ff40b5f91..cbcb7bea38e 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -62,82 +62,6 @@ int xfs_dqerror_mod = 33; static struct lock_class_key xfs_dquot_other_class; -/* - * Allocate and initialize a dquot. We don't always allocate fresh memory; - * we try to reclaim a free dquot if the number of incore dquots are above - * a threshold. - * The only field inside the core that gets initialized at this point - * is the d_id field. The idea is to fill in the entire q_core - * when we read in the on disk dquot. - */ -STATIC xfs_dquot_t * -xfs_qm_dqinit( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type) -{ - xfs_dquot_t *dqp; - boolean_t brandnewdquot; - - brandnewdquot = xfs_qm_dqalloc_incore(&dqp); - dqp->dq_flags = type; - dqp->q_core.d_id = cpu_to_be32(id); - dqp->q_mount = mp; - - /* - * No need to re-initialize these if this is a reclaimed dquot. - */ - if (brandnewdquot) { - INIT_LIST_HEAD(&dqp->q_freelist); - mutex_init(&dqp->q_qlock); - init_waitqueue_head(&dqp->q_pinwait); - - /* - * Because we want to use a counting completion, complete - * the flush completion once to allow a single access to - * the flush completion without blocking. - */ - init_completion(&dqp->q_flush); - complete(&dqp->q_flush); - - trace_xfs_dqinit(dqp); - } else { - /* - * Only the q_core portion was zeroed in dqreclaim_one(). - * So, we need to reset others. - */ - dqp->q_nrefs = 0; - dqp->q_blkno = 0; - INIT_LIST_HEAD(&dqp->q_mplist); - INIT_LIST_HEAD(&dqp->q_hashlist); - dqp->q_bufoffset = 0; - dqp->q_fileoffset = 0; - dqp->q_transp = NULL; - dqp->q_gdquot = NULL; - dqp->q_res_bcount = 0; - dqp->q_res_icount = 0; - dqp->q_res_rtbcount = 0; - atomic_set(&dqp->q_pincount, 0); - dqp->q_hash = NULL; - ASSERT(list_empty(&dqp->q_freelist)); - - trace_xfs_dqreuse(dqp); - } - - /* - * In either case we need to make sure group quotas have a different - * lock class than user quotas, to make sure lockdep knows we can - * locks of one of each at the same time. - */ - if (!(type & XFS_DQ_USER)) - lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); - - /* - * log item gets initialized later - */ - return (dqp); -} - /* * This is called to free all the memory associated with a dquot */ @@ -567,7 +491,32 @@ xfs_qm_dqread( int error; int cancelflags = 0; - dqp = xfs_qm_dqinit(mp, id, type); + + dqp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); + + dqp->dq_flags = type; + dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_mount = mp; + INIT_LIST_HEAD(&dqp->q_freelist); + mutex_init(&dqp->q_qlock); + init_waitqueue_head(&dqp->q_pinwait); + + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + + /* + * Make sure group quotas have a different lock class than user + * quotas. + */ + if (!(type & XFS_DQ_USER)) + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); + + atomic_inc(&xfs_Gqm->qm_totaldquots); trace_xfs_dqread(dqp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 671f37eae1c..c436def733b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -50,7 +50,6 @@ */ struct mutex xfs_Gqm_lock; struct xfs_qm *xfs_Gqm; -uint ndquot; kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; @@ -93,7 +92,6 @@ xfs_Gqm_init(void) goto out_free_udqhash; hsize /= sizeof(xfs_dqhash_t); - ndquot = hsize << 8; xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); xqm->qm_dqhashmask = hsize - 1; @@ -137,7 +135,6 @@ xfs_Gqm_init(void) xqm->qm_dqtrxzone = qm_dqtrxzone; atomic_set(&xqm->qm_totaldquots, 0); - xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; xqm->qm_nrefs = 0; return xqm; @@ -1600,216 +1597,150 @@ xfs_qm_init_quotainos( return 0; } +STATIC void +xfs_qm_dqfree_one( + struct xfs_dquot *dqp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + mutex_lock(&dqp->q_hash->qh_lock); + list_del_init(&dqp->q_hashlist); + dqp->q_hash->qh_version++; + mutex_unlock(&dqp->q_hash->qh_lock); -/* - * Pop the least recently used dquot off the freelist and recycle it. - */ -STATIC struct xfs_dquot * -xfs_qm_dqreclaim_one(void) + mutex_lock(&qi->qi_dqlist_lock); + list_del_init(&dqp->q_mplist); + qi->qi_dquots--; + qi->qi_dqreclaims++; + mutex_unlock(&qi->qi_dqlist_lock); + + xfs_qm_dqdestroy(dqp); +} + +STATIC void +xfs_qm_dqreclaim_one( + struct xfs_dquot *dqp, + struct list_head *dispose_list) { - struct xfs_dquot *dqp; - int restarts = 0; + struct xfs_mount *mp = dqp->q_mount; + int error; - mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); -restart: - list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) { - struct xfs_mount *mp = dqp->q_mount; + if (!xfs_dqlock_nowait(dqp)) + goto out_busy; - if (!xfs_dqlock_nowait(dqp)) - continue; + /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { + xfs_dqunlock(dqp); - /* - * This dquot has already been grabbed by dqlookup. - * Remove it from the freelist and try again. - */ - if (dqp->q_nrefs) { - trace_xfs_dqreclaim_want(dqp); - XQM_STATS_INC(xqmstats.xs_qm_dqwants); - - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; - restarts++; - goto dqunlock; - } + trace_xfs_dqreclaim_want(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqwants); - ASSERT(dqp->q_hash); - ASSERT(!list_empty(&dqp->q_mplist)); + list_del_init(&dqp->q_freelist); + xfs_Gqm->qm_dqfrlist_cnt--; + return; + } - /* - * Try to grab the flush lock. If this dquot is in the process - * of getting flushed to disk, we don't want to reclaim it. - */ - if (!xfs_dqflock_nowait(dqp)) - goto dqunlock; + ASSERT(dqp->q_hash); + ASSERT(!list_empty(&dqp->q_mplist)); - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - int error; + /* + * Try to grab the flush lock. If this dquot is in the process of + * getting flushed to disk, we don't want to reclaim it. + */ + if (!xfs_dqflock_nowait(dqp)) + goto out_busy; - trace_xfs_dqreclaim_dirty(dqp); + /* + * We have the flush lock so we know that this is not in the + * process of being flushed. So, if this is dirty, flush it + * DELWRI so that we don't get a freelist infested with + * dirty dquots. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + trace_xfs_dqreclaim_dirty(dqp); - /* - * We flush it delayed write, so don't bother - * releasing the freelist lock. - */ - error = xfs_qm_dqflush(dqp, SYNC_TRYLOCK); - if (error) { - xfs_warn(mp, "%s: dquot %p flush failed", - __func__, dqp); - } - goto dqunlock; + /* + * We flush it delayed write, so don't bother releasing the + * freelist lock. + */ + error = xfs_qm_dqflush(dqp, 0); + if (error) { + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); } - xfs_dqfunlock(dqp); /* - * Prevent lookup now that we are going to reclaim the dquot. - * Once XFS_DQ_FREEING is set lookup won't touch the dquot, - * thus we can drop the lock now. + * Give the dquot another try on the freelist, as the + * flushing will take some time. */ - dqp->dq_flags |= XFS_DQ_FREEING; - xfs_dqunlock(dqp); - - mutex_lock(&dqp->q_hash->qh_lock); - list_del_init(&dqp->q_hashlist); - dqp->q_hash->qh_version++; - mutex_unlock(&dqp->q_hash->qh_lock); - - mutex_lock(&mp->m_quotainfo->qi_dqlist_lock); - list_del_init(&dqp->q_mplist); - mp->m_quotainfo->qi_dquots--; - mp->m_quotainfo->qi_dqreclaims++; - mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock); + goto out_busy; + } + xfs_dqfunlock(dqp); - ASSERT(dqp->q_nrefs == 0); - list_del_init(&dqp->q_freelist); - xfs_Gqm->qm_dqfrlist_cnt--; + /* + * Prevent lookups now that we are past the point of no return. + */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - return dqp; -dqunlock: - xfs_dqunlock(dqp); - if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - break; - goto restart; - } + ASSERT(dqp->q_nrefs == 0); + list_move_tail(&dqp->q_freelist, dispose_list); + xfs_Gqm->qm_dqfrlist_cnt--; - mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - return NULL; -} + trace_xfs_dqreclaim_done(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); + return; -/* - * Traverse the freelist of dquots and attempt to reclaim a maximum of - * 'howmany' dquots. This operation races with dqlookup(), and attempts to - * favor the lookup function ... - */ -STATIC int -xfs_qm_shake_freelist( - int howmany) -{ - int nreclaimed = 0; - xfs_dquot_t *dqp; +out_busy: + xfs_dqunlock(dqp); - if (howmany <= 0) - return 0; + /* + * Move the dquot to the tail of the list so that we don't spin on it. + */ + list_move_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist); - while (nreclaimed < howmany) { - dqp = xfs_qm_dqreclaim_one(); - if (!dqp) - return nreclaimed; - xfs_qm_dqdestroy(dqp); - nreclaimed++; - } - return nreclaimed; + trace_xfs_dqreclaim_busy(dqp); + XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); } -/* - * The kmem_shake interface is invoked when memory is running low. - */ -/* ARGSUSED */ STATIC int xfs_qm_shake( - struct shrinker *shrink, - struct shrink_control *sc) + struct shrinker *shrink, + struct shrink_control *sc) { - int ndqused, nfree, n; - gfp_t gfp_mask = sc->gfp_mask; - - if (!kmem_shake_allow(gfp_mask)) - return 0; - if (!xfs_Gqm) - return 0; - - nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */ - /* incore dquots in all f/s's */ - ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; - - ASSERT(ndqused >= 0); + int nr_to_scan = sc->nr_to_scan; + LIST_HEAD (dispose_list); + struct xfs_dquot *dqp; - if (nfree <= ndqused && nfree < ndquot) + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; + if (!nr_to_scan) + goto out; - ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */ - n = nfree - ndqused - ndquot; /* # over target */ - - return xfs_qm_shake_freelist(MAX(nfree, n)); -} - - -/*------------------------------------------------------------------*/ - -/* - * Return a new incore dquot. Depending on the number of - * dquots in the system, we either allocate a new one on the kernel heap, - * or reclaim a free one. - * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed - * to reclaim an existing one from the freelist. - */ -boolean_t -xfs_qm_dqalloc_incore( - xfs_dquot_t **O_dqpp) -{ - xfs_dquot_t *dqp; - - /* - * Check against high water mark to see if we want to pop - * a nincompoop dquot off the freelist. - */ - if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) { - /* - * Try to recycle a dquot from the freelist. - */ - if ((dqp = xfs_qm_dqreclaim_one())) { - XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); - /* - * Just zero the core here. The rest will get - * reinitialized by caller. XXX we shouldn't even - * do this zero ... - */ - memset(&dqp->q_core, 0, sizeof(dqp->q_core)); - *O_dqpp = dqp; - return B_FALSE; - } - XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); + mutex_lock(&xfs_Gqm->qm_dqfrlist_lock); + while (!list_empty(&xfs_Gqm->qm_dqfrlist)) { + if (nr_to_scan-- <= 0) + break; + dqp = list_first_entry(&xfs_Gqm->qm_dqfrlist, struct xfs_dquot, + q_freelist); + xfs_qm_dqreclaim_one(dqp, &dispose_list); } + mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock); - /* - * Allocate a brand new dquot on the kernel heap and return it - * to the caller to initialize. - */ - ASSERT(xfs_Gqm->qm_dqzone != NULL); - *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); - atomic_inc(&xfs_Gqm->qm_totaldquots); - - return B_TRUE; + while (!list_empty(&dispose_list)) { + dqp = list_first_entry(&dispose_list, struct xfs_dquot, + q_freelist); + list_del_init(&dqp->q_freelist); + xfs_qm_dqfree_one(dqp); + } +out: + return (xfs_Gqm->qm_dqfrlist_cnt / 100) * sysctl_vfs_cache_pressure; } - /* * Start a transaction and write the incore superblock changes to * disk. flags parameter indicates which fields have changed. diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9b4f3adefbc..9a9b997e1a0 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -26,23 +26,11 @@ struct xfs_qm; struct xfs_inode; -extern uint ndquot; extern struct mutex xfs_Gqm_lock; extern struct xfs_qm *xfs_Gqm; extern kmem_zone_t *qm_dqzone; extern kmem_zone_t *qm_dqtrxzone; -/* - * Ditto, for xfs_qm_dqreclaim_one. - */ -#define XFS_QM_RECLAIM_MAX_RESTARTS 4 - -/* - * Ideal ratio of free to in use dquots. Quota manager makes an attempt - * to keep this balance. - */ -#define XFS_QM_DQFREE_RATIO 2 - /* * Dquot hashtable constants/threshold values. */ @@ -74,7 +62,6 @@ typedef struct xfs_qm { int qm_dqfrlist_cnt; atomic_t qm_totaldquots; /* total incore dquots */ uint qm_nrefs; /* file systems with quota on */ - int qm_dqfree_ratio;/* ratio of free to inuse dquots */ kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */ kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */ } xfs_qm_t; @@ -143,7 +130,6 @@ extern int xfs_qm_quotacheck(xfs_mount_t *); extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); /* dquot stuff */ -extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c index 8671a0b3264..5729ba57087 100644 --- a/fs/xfs/xfs_qm_stats.c +++ b/fs/xfs/xfs_qm_stats.c @@ -42,9 +42,9 @@ static int xqm_proc_show(struct seq_file *m, void *v) { /* maximum; incore; ratio free to inuse; freelist */ seq_printf(m, "%d\t%d\t%d\t%u\n", - ndquot, + 0, xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, - xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, + 0, xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0); return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6b6df5802e9..bb134a81993 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -733,11 +733,10 @@ DEFINE_EVENT(xfs_dquot_class, name, \ DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); -DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); DEFINE_DQUOT_EVENT(xfs_dqattach_found); DEFINE_DQUOT_EVENT(xfs_dqattach_get); -DEFINE_DQUOT_EVENT(xfs_dqinit); -DEFINE_DQUOT_EVENT(xfs_dqreuse); DEFINE_DQUOT_EVENT(xfs_dqalloc); DEFINE_DQUOT_EVENT(xfs_dqtobp_read); DEFINE_DQUOT_EVENT(xfs_dqread); -- cgit v1.2.3-18-g5258 From 05293485a0b6b1f803e8a3c0ff188c38f6969985 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 13 Feb 2012 20:51:05 +0000 Subject: XFS: xfs_trans_add_item() - don't assign in ASSERT() when compare is intended It looks to me like the two ASSERT()s in xfs_trans_add_item() really want to do a compare (==) rather than assignment (=). This patch changes it from the latter to the former. Signed-off-by: Jesper Juhl Signed-off-by: Ben Myers --- fs/xfs/xfs_trans.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 329b06aba1c..7adcdf15ae0 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1151,8 +1151,8 @@ xfs_trans_add_item( { struct xfs_log_item_desc *lidp; - ASSERT(lip->li_mountp = tp->t_mountp); - ASSERT(lip->li_ailp = tp->t_mountp->m_ail); + ASSERT(lip->li_mountp == tp->t_mountp); + ASSERT(lip->li_ailp == tp->t_mountp->m_ail); lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); -- cgit v1.2.3-18-g5258 From e188dc02d3a9c911be56eca5aa114fe7e9822d53 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 3 Feb 2012 14:25:18 +0100 Subject: vfs: fix d_inode_lookup() dentry ref leak d_inode_lookup() leaks a dentry reference on IS_DEADDIR(). Signed-off-by: Miklos Szeredi CC: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 208c6aa4a98..a780ea515c4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1095,8 +1095,10 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr struct dentry *old; /* Don't create child dentry for a dead directory. */ - if (unlikely(IS_DEADDIR(inode))) + if (unlikely(IS_DEADDIR(inode))) { + dput(dentry); return ERR_PTR(-ENOENT); + } old = inode->i_op->lookup(inode, dentry, nd); if (unlikely(old)) { -- cgit v1.2.3-18-g5258 From 1d6f2097865e64963e90cce04980dce2f9fc023f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 22 Aug 2011 11:52:28 +0800 Subject: autofs4 - fix lockdep splat in autofs When recursing down the locks when traversing a tree/list in get_next_positive_dentry() or get_next_positive_subdir() a lock can change from being nested to being a parent which breaks lockdep. This patch tells lockdep about what we did. Signed-off-by: Steven Rostedt Acked-by: Ian Kent Signed-off-by: Al Viro --- fs/autofs4/expire.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 450f529a4ea..1feb68ecef9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -124,6 +124,7 @@ start: /* Negative dentry - try next */ if (!simple_positive(q)) { spin_unlock(&p->d_lock); + lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_); p = q; goto again; } @@ -186,6 +187,7 @@ again: /* Negative dentry - try next */ if (!simple_positive(ret)) { spin_unlock(&p->d_lock); + lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_); p = ret; goto again; } -- cgit v1.2.3-18-g5258 From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 8 Feb 2012 12:39:07 -0800 Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts When the number of dentry cache hash table entries gets too high (2147483648 entries), as happens by default on a 16TB system, use of a signed integer in the dcache_init() initialization loop prevents the dentry_hashtable from getting initialized, causing a panic in __d_lookup(). Fix this in dcache_init() and similar areas. Signed-off-by: Dimitri Sivanich Acked-by: David S. Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- fs/inode.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 16a53cc2cc0..fe19ac13f75 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - int loop; + unsigned int loop; /* * A constructor could be added for stable state like the lists, @@ -3016,7 +3016,7 @@ static void __init dcache_init(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } diff --git a/fs/inode.c b/fs/inode.c index fb10d86ffad..d3ebdbe723d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1651,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -1669,13 +1669,13 @@ void __init inode_init_early(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - int loop; + unsigned int loop; /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", @@ -1699,7 +1699,7 @@ void __init inode_init(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } -- cgit v1.2.3-18-g5258 From 6b6dc836a195e077e76977b6c020a73de411b46d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 10 Feb 2012 11:03:00 +0100 Subject: vfs: Provide function to get superblock and wait for it to thaw In quota code we need to find a superblock corresponding to a device and wait for superblock to be unfrozen. However this waiting has to happen without s_umount semaphore because that is required for superblock to thaw. So provide a function in VFS for this to keep dances with s_umount where they belong. [AV: implementation switched to saner variant] Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/super.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 6015c02296b..6277ec6cb60 100644 --- a/fs/super.c +++ b/fs/super.c @@ -633,6 +633,28 @@ rescan: EXPORT_SYMBOL(get_super); +/** + * get_super_thawed - get thawed superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device. The superblock is returned once it is thawed + * (or immediately if it was not frozen). %NULL is returned if no match + * is found. + */ +struct super_block *get_super_thawed(struct block_device *bdev) +{ + while (1) { + struct super_block *s = get_super(bdev); + if (!s || s->s_frozen == SB_UNFROZEN) + return s; + up_read(&s->s_umount); + vfs_check_frozen(s, SB_FREEZE_WRITE); + put_super(s); + } +} +EXPORT_SYMBOL(get_super_thawed); + /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for -- cgit v1.2.3-18-g5258 From dcdbed853d9fbb0547b781ba676049b87f54129a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 10 Feb 2012 11:03:01 +0100 Subject: quota: Fix deadlock with suspend and quotas This script causes a kernel deadlock: set -e DEVICE=/dev/vg1/linear lvchange -ay $DEVICE mkfs.ext3 $DEVICE mount -t ext3 -o usrquota,grpquota $DEVICE /mnt/test quotacheck -gu /mnt/test umount /mnt/test mount -t ext3 -o usrquota,grpquota $DEVICE /mnt/test quotaon /mnt/test dmsetup suspend $DEVICE setquota -u root 1 2 3 4 /mnt/test & sleep 1 dmsetup resume $DEVICE setquota acquired semaphore s_umount for read and then tried to perform a transaction (and waits because the device is suspended). dmsetup resume tries to acquire s_umount for write before resuming the device (and waits for setquota). Fix the deadlock by grabbing a thawed superblock for quota commands which need it. Reported-by: Mikulas Patocka Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/quota/quota.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 7898cd688a0..fc2c4388d12 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -292,11 +292,26 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, } } +/* Return 1 if 'cmd' will block on frozen filesystem */ +static int quotactl_cmd_write(int cmd) +{ + switch (cmd) { + case Q_GETFMT: + case Q_GETINFO: + case Q_SYNC: + case Q_XGETQSTAT: + case Q_XGETQUOTA: + case Q_XQUOTASYNC: + return 0; + } + return 1; +} + /* * look up a superblock on which quota ops will be performed * - use the name of a block device to find the superblock thereon */ -static struct super_block *quotactl_block(const char __user *special) +static struct super_block *quotactl_block(const char __user *special, int cmd) { #ifdef CONFIG_BLOCK struct block_device *bdev; @@ -309,7 +324,10 @@ static struct super_block *quotactl_block(const char __user *special) putname(tmp); if (IS_ERR(bdev)) return ERR_CAST(bdev); - sb = get_super(bdev); + if (quotactl_cmd_write(cmd)) + sb = get_super_thawed(bdev); + else + sb = get_super(bdev); bdput(bdev); if (!sb) return ERR_PTR(-ENODEV); @@ -361,7 +379,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, pathp = &path; } - sb = quotactl_block(special); + sb = quotactl_block(special, cmds); if (IS_ERR(sb)) { ret = PTR_ERR(sb); goto out; -- cgit v1.2.3-18-g5258 From fcf83067bf6eb101a35620d752bd559d473cfbaa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Feb 2012 20:56:29 -0500 Subject: vfs: fix compat_sys_stat() handling of overflows in st_nlink Massaged cp_compat_stat() into form closer to cp_new_stat(); the only real issue had been in handling of st_nlink overflows - native 32bit stat(2) returns -EOVERFLOW in such situations, compat one silently loses upper bits. Signed-off-by: Al Viro --- fs/compat.c | 56 +++++++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index fa9d721ecfe..07880bae28a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -131,41 +131,35 @@ asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_tim static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { - compat_ino_t ino = stat->ino; - typeof(ubuf->st_uid) uid = 0; - typeof(ubuf->st_gid) gid = 0; - int err; + struct compat_stat tmp; - SET_UID(uid, stat->uid); - SET_GID(gid, stat->gid); + if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + return -EOVERFLOW; - if ((u64) stat->size > MAX_NON_LFS || - !old_valid_dev(stat->dev) || - !old_valid_dev(stat->rdev)) + memset(&tmp, 0, sizeof(tmp)); + tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) return -EOVERFLOW; - if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) return -EOVERFLOW; - - if (clear_user(ubuf, sizeof(*ubuf))) - return -EFAULT; - - err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); - err |= __put_user(ino, &ubuf->st_ino); - err |= __put_user(stat->mode, &ubuf->st_mode); - err |= __put_user(stat->nlink, &ubuf->st_nlink); - err |= __put_user(uid, &ubuf->st_uid); - err |= __put_user(gid, &ubuf->st_gid); - err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); - err |= __put_user(stat->size, &ubuf->st_size); - err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); - err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); - err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); - err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); - err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); - err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); - err |= __put_user(stat->blksize, &ubuf->st_blksize); - err |= __put_user(stat->blocks, &ubuf->st_blocks); - return err; + SET_UID(tmp.st_uid, stat->uid); + SET_GID(tmp.st_gid, stat->gid); + tmp.st_rdev = old_encode_dev(stat->rdev); + if ((u64) stat->size > MAX_NON_LFS) + return -EOVERFLOW; + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime = stat->ctime.tv_sec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; } asmlinkage long compat_sys_newstat(const char __user * filename, -- cgit v1.2.3-18-g5258 From 847c9db5cb50841589b8ebd3da0769b1b02fb3b2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Feb 2012 21:00:05 -0500 Subject: ocfs2: deal with wraparounds of i_nlink in ocfs2_rename() unfortunately, nlink_t may be smaller than 32 bits and ->i_nlink on ocfs2 can grow up to 0xffffffff; storing it in nlink_t variable will lose upper bits on such architectures. Needs to be made u32, until we get kernel-side nlink_t uniformly 32bit... Signed-off-by: Al Viro --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index be244692550..a9856e3eaaf 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1053,7 +1053,7 @@ static int ocfs2_rename(struct inode *old_dir, handle_t *handle = NULL; struct buffer_head *old_dir_bh = NULL; struct buffer_head *new_dir_bh = NULL; - nlink_t old_dir_nlink = old_dir->i_nlink; + u32 old_dir_nlink = old_dir->i_nlink; struct ocfs2_dinode *old_di; struct ocfs2_dir_lookup_result old_inode_dot_dot_res = { NULL, }; struct ocfs2_dir_lookup_result target_lookup_res = { NULL, }; -- cgit v1.2.3-18-g5258 From 941b2ddf71987ef369389517a7e215dd505fe01e Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Tue, 29 Nov 2011 17:44:12 -0800 Subject: btrfs: Sector Size check during Mount Gracefully fail when trying to mount a BTRFS file system that has a sectorsize smaller than PAGE_SIZE. On PPC it is possible to build a FS while using a 4k PAGE_SIZE kernel then boot into a 64K PAGE_SIZE kernel. Presently open_ctree fails in an endless loop and hangs the machine in this situation. My debugging has show this Sector size < Page size to be a non trivial situation and a graceful exit from the situation would be nice for the time being. Signed-off-by: Keith Mannthey --- fs/btrfs/disk-io.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4c867112b4c..58d0678dfcb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2258,6 +2258,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_sb_buffer; } + if (sectorsize < PAGE_SIZE) { + printk(KERN_WARNING "btrfs: Incompatible sector size " + "found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); -- cgit v1.2.3-18-g5258 From 8f24b49688281a77e8331894ed407f0cfe732303 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 8 Feb 2012 16:01:01 +0100 Subject: Btrfs: avoid positive number with ERR_PTR inode_ref_info() returns 1 when the element wasn't found and < 0 on error, just like btrfs_search_slot(). In iref_to_path() it's an error when the inode ref can't be found, thus we return ERR_PTR(ret) in that case. In order to avoid ERR_PTR(1), we now set ret to -ENOENT in that case. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 633c701a287..98f6bf10bbd 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -892,6 +892,8 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, if (eb != eb_in) free_extent_buffer(eb); ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + if (ret > 0) + ret = -ENOENT; if (ret) break; next_inum = found_key.offset; -- cgit v1.2.3-18-g5258 From 6af021d8fc3bcce790e7fbb391e39c5920fa3f71 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Thu, 9 Feb 2012 14:25:50 +0800 Subject: Btrfs: return the internal error unchanged if btrfs_get_extent_fiemap() call failed for SEEK_DATA/SEEK_HOLE inquiry Given that ENXIO only means "offset beyond EOF" for either SEEK_DATA or SEEK_HOLE inquiry in a desired file range, so we should return the internal error unchanged if btrfs_get_extent_fiemap() call failed, rather than ENXIO. Cc: Dave Chinner Signed-off-by: Jie Liu --- fs/btrfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0621a3a7d5d..3a520899b02 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1755,7 +1755,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) start - root->sectorsize, root->sectorsize, 0); if (IS_ERR(em)) { - ret = -ENXIO; + ret = PTR_ERR(em); goto out; } last_end = em->start + em->len; @@ -1767,7 +1767,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) while (1) { em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0); if (IS_ERR(em)) { - ret = -ENXIO; + ret = PTR_ERR(em); break; } -- cgit v1.2.3-18-g5258 From 2cac13e41bf5b99ffc426bd28dfd2248df1dfa67 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 9 Feb 2012 18:17:41 +0800 Subject: Btrfs: fix trim 0 bytes after a device delete A user reported a bug of btrfs's trim, that is we will trim 0 bytes after a device delete. The reproducer: $ mkfs.btrfs disk1 $ mkfs.btrfs disk2 $ mount disk1 /mnt $ fstrim -v /mnt $ btrfs device add disk2 /mnt $ btrfs device del disk1 /mnt $ fstrim -v /mnt This is because after we delete the device, the block group may start from a non-zero place, which will confuse trim to discard nothing. Reported-by: Lutz Euler Signed-off-by: Liu Bo --- fs/btrfs/extent-tree.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 283af7a676a..60bfe2d6854 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7886,9 +7886,16 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) u64 start; u64 end; u64 trimmed = 0; + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); int ret = 0; - cache = btrfs_lookup_block_group(fs_info, range->start); + /* + * try to trim all FS space, our block group may start from non-zero. + */ + if (range->len == total_bytes) + cache = btrfs_lookup_first_block_group(fs_info, range->start); + else + cache = btrfs_lookup_block_group(fs_info, range->start); while (cache) { if (cache->key.objectid >= (range->start + range->len)) { -- cgit v1.2.3-18-g5258 From 859acaf1a29bbacf6256f1159210c8d6df992b33 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 9 Feb 2012 15:09:02 +0100 Subject: btrfs: don't check DUP chunks twice Because scrub enumerates the dev extent tree to find the chunks to scrub, it currently finds each DUP chunk twice and also scrubs it twice. This patch makes sure that scrub_chunk only checks that part of the chunk the dev extent has been found for. This only changes the behaviour for DUP chunks. Reported-and-tested-by: Stefan Behrens Signed-off-by: Arne Jansen --- fs/btrfs/scrub.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9770cc5bfb7..abc0fbffa51 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1367,7 +1367,8 @@ out: } static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length) + u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length, + u64 dev_offset) { struct btrfs_mapping_tree *map_tree = &sdev->dev->dev_root->fs_info->mapping_tree; @@ -1391,7 +1392,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev, goto out; for (i = 0; i < map->num_stripes; ++i) { - if (map->stripes[i].dev == sdev->dev) { + if (map->stripes[i].dev == sdev->dev && + map->stripes[i].physical == dev_offset) { ret = scrub_stripe(sdev, map, i, chunk_offset, length); if (ret) goto out; @@ -1487,7 +1489,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) break; } ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, - chunk_offset, length); + chunk_offset, length, found_key.offset); btrfs_put_block_group(cache); if (ret) break; -- cgit v1.2.3-18-g5258 From a7e221e9002306a753d6f78b4060edabce402033 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Tue, 14 Feb 2012 17:12:23 +0900 Subject: Btrfs: fix memory leak in load_free_space_cache() load_free_space_cache() has forgotten to free path. Signed-off-by: Tsutomu Itoh --- fs/btrfs/free-space-cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5802b1473c3..b30242f4435 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -777,6 +777,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, spin_lock(&block_group->lock); if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { spin_unlock(&block_group->lock); + btrfs_free_path(path); goto out; } spin_unlock(&block_group->lock); -- cgit v1.2.3-18-g5258 From 87826df0ec36fc28884b4ddbb3f3af41c4c2008f Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 15 Feb 2012 16:23:57 +0100 Subject: btrfs: delalloc for page dirtied out-of-band in fixup worker We encountered an issue that was easily observable on s/390 systems but could really happen anywhere. The timing just seemed to hit reliably on s/390 with limited memory. The gist is that when an unexpected set_page_dirty() happened, we'd run into the BUG() in btrfs_writepage_fixup_worker since it wasn't properly set up for delalloc. This patch does the following: - Performs the missing delalloc in the fixup worker - Allow the start hook to return -EBUSY which informs __extent_writepage that it should mark the page skipped and not to redirty it. This is required since the fixup worker can fail with -ENOSPC and the page will have already been redirtied. That causes an Oops in drop_outstanding_extents later. Retrying the fixup worker could lead to an infinite loop. Deferring the page redirty also saves us some cycles since the page would be stuck in a resubmit-redirty loop until the fixup worker completes. It's not harmful, just wasteful. - If the fixup worker fails, we mark the page and mapping as errored, and end the writeback, similar to what we would do had the page actually been submitted to writeback. Signed-off-by: Jeff Mahoney --- fs/btrfs/extent_io.c | 65 ++++++++++++++++++++++++++++++++-------------------- fs/btrfs/extent_io.h | 1 + fs/btrfs/inode.c | 14 +++++++++-- 3 files changed, 53 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fcf77e1ded4..89ba79fb945 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2161,6 +2161,38 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, /* lots and lots of room for performance fixes in the end_bio funcs */ +int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +{ + int uptodate = (err == 0); + struct extent_io_tree *tree; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(NULL, page, + start, end, NULL); + /* Writeback already completed */ + if (ret == 0) + return 1; + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); + ClearPageUptodate(page); + SetPageError(page); + } + return 0; +} + /* * after a writepage IO is done, we need to: * clear the uptodate bits on error @@ -2172,13 +2204,11 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, */ static void end_bio_extent_writepage(struct bio *bio, int err) { - int uptodate = err == 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct extent_io_tree *tree; u64 start; u64 end; int whole_page; - int ret; do { struct page *page = bvec->bv_page; @@ -2195,28 +2225,9 @@ static void end_bio_extent_writepage(struct bio *bio, int err) if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - if (tree->ops && tree->ops->writepage_end_io_hook) { - ret = tree->ops->writepage_end_io_hook(page, start, - end, NULL, uptodate); - if (ret) - uptodate = 0; - } - - if (!uptodate && tree->ops && - tree->ops->writepage_io_failed_hook) { - ret = tree->ops->writepage_io_failed_hook(bio, page, - start, end, NULL); - if (ret == 0) { - uptodate = (err == 0); - continue; - } - } - if (!uptodate) { - clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS); - ClearPageUptodate(page); - SetPageError(page); - } + if (end_extent_writepage(page, err, start, end)) + continue; if (whole_page) end_page_writeback(page); @@ -2818,8 +2829,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); - if (ret == -EAGAIN) { - redirty_page_for_writepage(wbc, page); + if (ret) { + /* Fixup worker will requeue */ + if (ret == -EBUSY) + wbc->pages_skipped++; + else + redirty_page_for_writepage(wbc, page); update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bc6a042cb6f..cecc3518c12 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -319,4 +319,5 @@ struct btrfs_mapping_tree; int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, u64 length, u64 logical, struct page *page, int mirror_num); +int end_extent_writepage(struct page *page, int err, u64 start, u64 end); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7405753ec5d..bf392e53261 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1555,6 +1555,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct inode *inode; u64 page_start; u64 page_end; + int ret; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; @@ -1582,12 +1583,21 @@ again: page_end, &cached_state, GFP_NOFS); unlock_page(page); btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); goto again; } - BUG(); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) { + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + ClearPageChecked(page); + goto out; + } + btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); + set_page_dirty(page); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -1630,7 +1640,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) fixup->work.func = btrfs_writepage_fixup_worker; fixup->page = page; btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); - return -EAGAIN; + return -EBUSY; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, -- cgit v1.2.3-18-g5258 From c08782dacd7a098f2b8bca7f4a57a5b402e9e1e5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 26 Jan 2012 15:01:12 -0500 Subject: btrfs: fix structs where bitfields and spinlock/atomic share 8B word On ia64, powerpc64 and sparc64 the bitfield is modified through a RMW cycle and current gcc rewrites the adjacent 4B word, which in case of a spinlock or atomic has disaterous effect. https://lkml.org/lkml/2012/2/1/220 Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent_map.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3c2cbf7b666..8e4457e0478 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -886,7 +886,7 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned int full:1; + unsigned int full; }; /* diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 33a7890b1f4..1195f09761f 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,8 +26,8 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - unsigned int in_tree:1; - unsigned int compress_type:4; + unsigned int in_tree; + unsigned int compress_type; }; struct extent_map_tree { -- cgit v1.2.3-18-g5258 From 8a3344269465b26761b74493cfbeeaa75d821614 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 7 Oct 2011 18:06:13 +0200 Subject: btrfs: silence warning in raid array setup Raid array setup code creates an extent buffer in an usual way. When the PAGE_CACHE_SIZE is > super block size, the extent pages are not marked up-to-date, which triggers a WARN_ON in the following write_extent_buffer call. Add an explicit up-to-date call to silence the warning. Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7ffdb154dae..d8f282b5baa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4357,6 +4357,20 @@ int btrfs_read_sys_array(struct btrfs_root *root) return -ENOMEM; btrfs_set_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); + /* + * The sb extent buffer is artifical and just used to read the system array. + * btrfs_set_buffer_uptodate() call does not properly mark all it's + * pages up-to-date when the page is larger: extent does not cover the + * whole page and consequently check_page_uptodate does not find all + * the page's extents up-to-date (the hole beyond sb), + * write_extent_buffer then triggers a WARN_ON. + * + * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, + * but sb spans only this function. Add an explicit SetPageUptodate call + * to silence the warning eg. on PowerPC 64. + */ + if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) + SetPageUptodate(sb->first_page); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); -- cgit v1.2.3-18-g5258 From 12fc9d0923ca70ae8960bccebac09d5c12f8c4d4 Mon Sep 17 00:00:00 2001 From: Florian Albrechtskirchinger Date: Fri, 10 Feb 2012 22:15:54 +0100 Subject: btrfs: honor umask when creating subvol root Set the subvol root inode permissions based on the current umask. --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf392e53261..6e0ee9b0d74 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6706,8 +6706,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, int err; u64 index = 0; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, - new_dirid, S_IFDIR | 0700, &index); + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, + new_dirid, new_dirid, + S_IFDIR | (~current_umask() & S_IRWXUGO), + &index); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &btrfs_dir_inode_operations; -- cgit v1.2.3-18-g5258 From 013bd4c336ad0d30e9e41f9cff0dbc1858934e75 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 16 Feb 2012 10:11:40 +0900 Subject: Btrfs: fix return value check of extent_io_ops This patch adds the check on the return value of extent_io_ops. Signed-off-by: Tsutomu Itoh --- fs/btrfs/extent_io.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 89ba79fb945..b05d35a7c0f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2154,9 +2154,10 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, failrec->this_mirror, num_copies, failrec->in_validation); - tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, - failrec->bio_flags, 0); - return 0; + ret = tree->ops->submit_bio_hook(inode, read_mode, bio, + failrec->this_mirror, + failrec->bio_flags, 0); + return ret; } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -2790,9 +2791,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_start = delalloc_end + 1; continue; } - tree->ops->fill_delalloc(inode, page, delalloc_start, - delalloc_end, &page_started, - &nr_written); + ret = tree->ops->fill_delalloc(inode, page, + delalloc_start, + delalloc_end, + &page_started, + &nr_written); + BUG_ON(ret); /* * delalloc_end is already one less than the total * length, so we don't subtract one from -- cgit v1.2.3-18-g5258 From 600a45e1d5e376f679ff9ecc4ce9452710a6d27c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 16 Feb 2012 15:01:24 +0800 Subject: Btrfs: fix deadlock on page lock when doing auto-defragment When I ran xfstests circularly on a auto-defragment btrfs, the deadlock happened. Steps to reproduce: [tty0] # export MOUNT_OPTIONS="-o autodefrag" # export TEST_DEV= # export TEST_DIR= # export SCRATCH_DEV= # export SCRATCH_MNT= # while [ 1 ] > do > ./check 091 127 263 > sleep 1 > done [tty1] # while [ 1 ] > do > echo 3 > /proc/sys/vm/drop_caches > done Several hours later, the test processes will hang on, and the deadlock will happen on page lock. The reason is that: Auto defrag task Flush thread Test task btrfs_writepages() add ordered extent (including page 1, 2) set page 1 writeback set page 2 writeback endio_fn() end page 2 writeback release page 2 lock page 1 alloc and lock page 2 page 2 is not uptodate btrfs_readpage() start ordered extent() btrfs_writepages() try to lock page 1 so deadlock happens. Fix this bug by unlocking the page which is in writeback, and re-locking it after the writeback end. Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0b06a5ca8af..e9bdb8b783e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -862,6 +862,7 @@ static int cluster_pages_for_defrag(struct inode *inode, int i_done; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; + struct extent_io_tree *tree; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); if (isize == 0) @@ -872,18 +873,34 @@ static int cluster_pages_for_defrag(struct inode *inode, num_pages << PAGE_CACHE_SHIFT); if (ret) return ret; -again: - ret = 0; i_done = 0; + tree = &BTRFS_I(inode)->io_tree; /* step one, lock all the pages */ for (i = 0; i < num_pages; i++) { struct page *page; +again: page = find_or_create_page(inode->i_mapping, - start_index + i, mask); + start_index + i, mask); if (!page) break; + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + while (1) { + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, + page_start); + unlock_extent(tree, page_start, page_end, GFP_NOFS); + if (!ordered) + break; + + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + } + if (!PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); @@ -894,15 +911,22 @@ again: break; } } + isize = i_size_read(inode); file_end = (isize - 1) >> PAGE_CACHE_SHIFT; - if (!isize || page->index > file_end || - page->mapping != inode->i_mapping) { + if (!isize || page->index > file_end) { /* whoops, we blew past eof, skip this page */ unlock_page(page); page_cache_release(page); break; } + + if (page->mapping != inode->i_mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + pages[i] = page; i_done++; } @@ -925,25 +949,6 @@ again: lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1); - if (ordered && - ordered->file_offset + ordered->len > page_start && - ordered->file_offset < page_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, - &cached_state, GFP_NOFS); - for (i = 0; i < i_done; i++) { - unlock_page(pages[i]); - page_cache_release(pages[i]); - } - btrfs_wait_ordered_range(inode, page_start, - page_end - page_start); - goto again; - } - if (ordered) - btrfs_put_ordered_extent(ordered); - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, -- cgit v1.2.3-18-g5258 From 285190d99fef696ec8b0041787387f013cb71d67 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 16 Feb 2012 16:23:58 +0900 Subject: Btrfs: check return value of lookup_extent_mapping() correctly This patch corrects error checking of lookup_extent_mapping(). Signed-off-by: Tsutomu Itoh --- fs/btrfs/compression.c | 2 ++ fs/btrfs/extent_io.c | 2 +- fs/btrfs/volumes.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 14f1c5a0b2d..d02c27cd14c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -588,6 +588,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page_offset(bio->bi_io_vec->bv_page), PAGE_CACHE_SIZE); read_unlock(&em_tree->lock); + if (!em) + return -EIO; compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b05d35a7c0f..8d6f55fbd28 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3308,7 +3308,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, len = end - start + 1; write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); - if (IS_ERR_OR_NULL(em)) { + if (!em) { write_unlock(&map->lock); break; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d8f282b5baa..cd040bf3fd8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1954,7 +1954,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, em = lookup_extent_mapping(em_tree, chunk_offset, 1); read_unlock(&em_tree->lock); - BUG_ON(em->start > chunk_offset || + BUG_ON(!em || em->start > chunk_offset || em->start + em->len < chunk_offset); map = (struct map_lookup *)em->bdev; -- cgit v1.2.3-18-g5258 From 0449314a9cc5a7fb0bd42e2175a3c46f68f3a2b0 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:37 +0800 Subject: Btrfs: skip states when they does not contain bits to clear Clearing a range's bits is different with setting them, since we don't need to touch them when states do not contain bits we want. Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8d6f55fbd28..fe14285b53f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -513,6 +513,15 @@ hit_next: WARN_ON(state->end < start); last_end = state->end; + if (state->end < end && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + + /* the state doesn't have the wanted bits, go ahead */ + if (!(state->state & bits)) + goto next; + /* * | ---- desired range ---- | * | state | or @@ -565,12 +574,8 @@ hit_next: goto out; } - if (state->end < end && prealloc && !need_resched()) - next_node = rb_next(&state->rb_node); - else - next_node = NULL; - set |= clear_state_bit(tree, state, &bits, wake); +next: if (last_end == (u64)-1) goto out; start = last_end + 1; -- cgit v1.2.3-18-g5258 From 9d47c7671dc555e198c7347a173ed37316e0c4c1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:38 +0800 Subject: Btrfs: kick out redundant stuff in convert_extent_bit clear_state_bit will do merge_state for us, so kick out the redundant one. Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fe14285b53f..37259ff5cd7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -966,8 +966,6 @@ hit_next: set_state_bits(tree, state, &bits); clear_state_bit(tree, state, &clear_bits, 0); - - merge_state(tree, state); if (last_end == (u64)-1) goto out; @@ -1012,7 +1010,6 @@ hit_next: if (state->end <= end) { set_state_bits(tree, state, &bits); clear_state_bit(tree, state, &clear_bits, 0); - merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1073,8 +1070,6 @@ hit_next: set_state_bits(tree, prealloc, &bits); clear_state_bit(tree, prealloc, &clear_bits, 0); - - merge_state(tree, prealloc); prealloc = NULL; goto out; } -- cgit v1.2.3-18-g5258 From d9b0218f6cb682aa6a4ada2bfc5a25fdf3018563 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:39 +0800 Subject: Btrfs: fix a bug on overcommit stuff When overcommitting, we should check the sum of pinned space and bytes for delayed item. Signed-off-by: Liu Bo --- fs/btrfs/extent-tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 60bfe2d6854..dc083f55bcf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3611,12 +3611,15 @@ static int may_commit_transaction(struct btrfs_root *root, if (space_info != delayed_rsv->space_info) return -ENOSPC; + spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); - if (delayed_rsv->size < bytes) { + if (space_info->bytes_pinned + delayed_rsv->size < bytes) { spin_unlock(&delayed_rsv->lock); + spin_unlock(&space_info->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); + spin_unlock(&space_info->lock); commit: trans = btrfs_join_transaction(root); -- cgit v1.2.3-18-g5258 From 4a26620df451ad46151ad21d711ed43e963c004e Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Sat, 5 Nov 2011 13:45:08 -0400 Subject: eCryptfs: Improve statfs reporting statfs() calls on eCryptfs files returned the wrong filesystem type and, when using filename encryption, the wrong maximum filename length. If mount-wide filename encryption is enabled, the cipher block size and the lower filesystem's max filename length will determine the max eCryptfs filename length. Pre-tested, known good lengths are used when the lower filesystem's namelen is 255 and a cipher with 8 or 16 byte block sizes is used. In other, less common cases, we fall back to a safe rounded-down estimate when determining the eCryptfs namelen. https://launchpad.net/bugs/885744 Signed-off-by: Tyler Hicks Reported-by: Kees Cook Reviewed-by: Kees Cook Reviewed-by: John Johansen --- fs/ecryptfs/crypto.c | 68 ++++++++++++++++++++++++++++++++++++++----- fs/ecryptfs/ecryptfs_kernel.h | 6 ++++ fs/ecryptfs/keystore.c | 9 ++---- fs/ecryptfs/super.c | 14 ++++++++- 4 files changed, 83 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 63ab2451064..ea993128155 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1990,6 +1990,17 @@ out: return; } +static size_t ecryptfs_max_decoded_size(size_t encoded_size) +{ + /* Not exact; conservatively long. Every block of 4 + * encoded characters decodes into a block of 3 + * decoded characters. This segment of code provides + * the caller with the maximum amount of allocated + * space that @dst will need to point to in a + * subsequent call. */ + return ((encoded_size + 1) * 3) / 4; +} + /** * ecryptfs_decode_from_filename * @dst: If NULL, this function only sets @dst_size and returns. If @@ -2008,13 +2019,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, size_t dst_byte_offset = 0; if (dst == NULL) { - /* Not exact; conservatively long. Every block of 4 - * encoded characters decodes into a block of 3 - * decoded characters. This segment of code provides - * the caller with the maximum amount of allocated - * space that @dst will need to point to in a - * subsequent call. */ - (*dst_size) = (((src_size + 1) * 3) / 4); + (*dst_size) = ecryptfs_max_decoded_size(src_size); goto out; } while (src_byte_offset < src_size) { @@ -2239,3 +2244,52 @@ out_free: out: return rc; } + +#define ENC_NAME_MAX_BLOCKLEN_8_OR_16 143 + +int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + struct blkcipher_desc desc; + struct mutex *tfm_mutex; + size_t cipher_blocksize; + int rc; + + if (!(mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)) { + (*namelen) = lower_namelen; + return 0; + } + + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&desc.tfm, &tfm_mutex, + mount_crypt_stat->global_default_fn_cipher_name); + if (unlikely(rc)) { + (*namelen) = 0; + return rc; + } + + mutex_lock(tfm_mutex); + cipher_blocksize = crypto_blkcipher_blocksize(desc.tfm); + mutex_unlock(tfm_mutex); + + /* Return an exact amount for the common cases */ + if (lower_namelen == NAME_MAX + && (cipher_blocksize == 8 || cipher_blocksize == 16)) { + (*namelen) = ENC_NAME_MAX_BLOCKLEN_8_OR_16; + return 0; + } + + /* Return a safe estimate for the uncommon cases */ + (*namelen) = lower_namelen; + (*namelen) -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + /* Since this is the max decoded size, subtract 1 "decoded block" len */ + (*namelen) = ecryptfs_max_decoded_size(*namelen) - 3; + (*namelen) -= ECRYPTFS_TAG_70_MAX_METADATA_SIZE; + (*namelen) -= ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES; + /* Worst case is that the filename is padded nearly a full block size */ + (*namelen) -= cipher_blocksize - 1; + + if ((*namelen) < 0) + (*namelen) = 0; + + return 0; +} diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index a2362df58ae..867b64c5d84 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -162,6 +162,10 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ #define MD5_DIGEST_SIZE 16 #define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE +#define ECRYPTFS_TAG_70_MIN_METADATA_SIZE (1 + ECRYPTFS_MIN_PKT_LEN_SIZE \ + + ECRYPTFS_SIG_SIZE + 1 + 1) +#define ECRYPTFS_TAG_70_MAX_METADATA_SIZE (1 + ECRYPTFS_MAX_PKT_LEN_SIZE \ + + ECRYPTFS_SIG_SIZE + 1 + 1) #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED." #define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 #define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." @@ -701,6 +705,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, size_t *packet_size, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *data, size_t max_packet_size); +int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat); int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, loff_t offset); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 8e3b943e330..2333203a120 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -679,10 +679,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, * Octets N3-N4: Block-aligned encrypted filename * - Consists of a minimum number of random characters, a \0 * separator, and then the filename */ - s->max_packet_size = (1 /* Tag 70 identifier */ - + 3 /* Max Tag 70 packet size */ - + ECRYPTFS_SIG_SIZE /* FNEK sig */ - + 1 /* Cipher identifier */ + s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE + s->block_aligned_filename_size); if (dest == NULL) { (*packet_size) = s->max_packet_size; @@ -934,10 +931,10 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, goto out; } s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; - if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) { + if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) { printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " "at least [%d]\n", __func__, max_packet_size, - (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)); + ECRYPTFS_TAG_70_MIN_METADATA_SIZE); rc = -EINVAL; goto out; } diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 9df7fd6e0c3..cf152823bbf 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include "ecryptfs_kernel.h" struct kmem_cache *ecryptfs_inode_info_cache; @@ -102,10 +104,20 @@ static void ecryptfs_destroy_inode(struct inode *inode) static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + int rc; if (!lower_dentry->d_sb->s_op->statfs) return -ENOSYS; - return lower_dentry->d_sb->s_op->statfs(lower_dentry, buf); + + rc = lower_dentry->d_sb->s_op->statfs(lower_dentry, buf); + if (rc) + return rc; + + buf->f_type = ECRYPTFS_SUPER_MAGIC; + rc = ecryptfs_set_f_namelen(&buf->f_namelen, buf->f_namelen, + &ecryptfs_superblock_to_private(dentry->d_sb)->mount_crypt_stat); + + return rc; } /** -- cgit v1.2.3-18-g5258 From 545d680938be1e86a6c5250701ce9abaf360c495 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 7 Feb 2012 17:55:40 -0600 Subject: eCryptfs: Copy up lower inode attrs after setting lower xattr After passing through a ->setxattr() call, eCryptfs needs to copy the inode attributes from the lower inode to the eCryptfs inode, as they may have changed in the lower filesystem's ->setxattr() path. One example is if an extended attribute containing a POSIX Access Control List is being set. The new ACL may cause the lower filesystem to modify the mode of the lower inode and the eCryptfs inode would need to be updated to reflect the new mode. https://launchpad.net/bugs/926292 Signed-off-by: Tyler Hicks Reported-by: Sebastien Bacher Cc: John Johansen Cc: --- fs/ecryptfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 19892d7d2ed..ab35b113003 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1085,6 +1085,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, } rc = vfs_setxattr(lower_dentry, name, value, size, flags); + if (!rc) + fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); out: return rc; } -- cgit v1.2.3-18-g5258 From 465c9343c5b746ec2325a220fa3e50cc647d2db7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 10 Feb 2012 13:39:50 +0800 Subject: ecryptfs: remove the second argument of k[un]map_atomic() Signed-off-by: Cong Wang Signed-off-by: Tyler Hicks --- fs/ecryptfs/mmap.c | 4 ++-- fs/ecryptfs/read_write.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 10ec695ccd6..a46b3a8fee1 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -150,7 +150,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, /* This is a header extent */ char *page_virt; - page_virt = kmap_atomic(page, KM_USER0); + page_virt = kmap_atomic(page); memset(page_virt, 0, PAGE_CACHE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { @@ -163,7 +163,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, crypt_stat, &written); } - kunmap_atomic(page_virt, KM_USER0); + kunmap_atomic(page_virt); flush_dcache_page(page); if (rc) { printk(KERN_ERR "%s: Error reading xattr " diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 5c0106f7577..b2a34a192f4 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -156,7 +156,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, ecryptfs_page_idx, rc); goto out; } - ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0); + ecryptfs_page_virt = kmap_atomic(ecryptfs_page); /* * pos: where we're now writing, offset: where the request was @@ -179,7 +179,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, (data + data_offset), num_bytes); data_offset += num_bytes; } - kunmap_atomic(ecryptfs_page_virt, KM_USER0); + kunmap_atomic(ecryptfs_page_virt); flush_dcache_page(ecryptfs_page); SetPageUptodate(ecryptfs_page); unlock_page(ecryptfs_page); -- cgit v1.2.3-18-g5258 From f86f36a6ae625eda87a13e1ea102a908e08f491b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 14 Feb 2012 20:33:19 -0500 Subject: NFSv4.1: Fix a NFSv4.1 session initialisation regression Commit aacd553 (NFSv4.1: cleanup init and reset of session slot tables) introduces a regression in the session initialisation code. New tables now find their sequence ids initialised to 0, rather than the mandated value of 1 (see RFC5661). Fix the problem by merging nfs4_reset_slot_table() and nfs4_init_slot_table(). Since the tbl->max_slots is initialised to 0, the test in nfs4_reset_slot_table for max_reqs != tbl->max_slots will automatically pass for an empty table. Reported-by: Vitaliy Gusev Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 107 +++++++++++++++++++++--------------------------------- 1 file changed, 42 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d202e04aca9..b4d67feab90 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5008,37 +5008,53 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) return status; } +static struct nfs4_slot *nfs4_alloc_slots(u32 max_slots, gfp_t gfp_flags) +{ + return kcalloc(max_slots, sizeof(struct nfs4_slot), gfp_flags); +} + +static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl, + struct nfs4_slot *new, + u32 max_slots, + u32 ivalue) +{ + struct nfs4_slot *old = NULL; + u32 i; + + spin_lock(&tbl->slot_tbl_lock); + if (new) { + old = tbl->slots; + tbl->slots = new; + tbl->max_slots = max_slots; + } + tbl->highest_used_slotid = -1; /* no slot is currently used */ + for (i = 0; i < tbl->max_slots; i++) + tbl->slots[i].seq_nr = ivalue; + spin_unlock(&tbl->slot_tbl_lock); + kfree(old); +} + /* - * Reset a slot table + * (re)Initialise a slot table */ -static int nfs4_reset_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, - int ivalue) +static int nfs4_realloc_slot_table(struct nfs4_slot_table *tbl, u32 max_reqs, + u32 ivalue) { struct nfs4_slot *new = NULL; - int i; - int ret = 0; + int ret = -ENOMEM; dprintk("--> %s: max_reqs=%u, tbl->max_slots %d\n", __func__, max_reqs, tbl->max_slots); /* Does the newly negotiated max_reqs match the existing slot table? */ if (max_reqs != tbl->max_slots) { - ret = -ENOMEM; - new = kmalloc(max_reqs * sizeof(struct nfs4_slot), - GFP_NOFS); + new = nfs4_alloc_slots(max_reqs, GFP_NOFS); if (!new) goto out; - ret = 0; - kfree(tbl->slots); - } - spin_lock(&tbl->slot_tbl_lock); - if (new) { - tbl->slots = new; - tbl->max_slots = max_reqs; } - for (i = 0; i < tbl->max_slots; ++i) - tbl->slots[i].seq_nr = ivalue; - spin_unlock(&tbl->slot_tbl_lock); + ret = 0; + + nfs4_add_and_init_slots(tbl, new, max_reqs, ivalue); dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, tbl, tbl->slots, tbl->max_slots); out: @@ -5060,36 +5076,6 @@ static void nfs4_destroy_slot_tables(struct nfs4_session *session) return; } -/* - * Initialize slot table - */ -static int nfs4_init_slot_table(struct nfs4_slot_table *tbl, - int max_slots, int ivalue) -{ - struct nfs4_slot *slot; - int ret = -ENOMEM; - - BUG_ON(max_slots > NFS4_MAX_SLOT_TABLE); - - dprintk("--> %s: max_reqs=%u\n", __func__, max_slots); - - slot = kcalloc(max_slots, sizeof(struct nfs4_slot), GFP_NOFS); - if (!slot) - goto out; - ret = 0; - - spin_lock(&tbl->slot_tbl_lock); - tbl->max_slots = max_slots; - tbl->slots = slot; - tbl->highest_used_slotid = -1; /* no slot is currently used */ - spin_unlock(&tbl->slot_tbl_lock); - dprintk("%s: tbl=%p slots=%p max_slots=%d\n", __func__, - tbl, tbl->slots, tbl->max_slots); -out: - dprintk("<-- %s: return %d\n", __func__, ret); - return ret; -} - /* * Initialize or reset the forechannel and backchannel tables */ @@ -5101,25 +5087,16 @@ static int nfs4_setup_session_slot_tables(struct nfs4_session *ses) dprintk("--> %s\n", __func__); /* Fore channel */ tbl = &ses->fc_slot_table; - if (tbl->slots == NULL) { - status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1); - if (status) /* -ENOMEM */ - return status; - } else { - status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1); - if (status) - return status; - } + status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); + if (status) /* -ENOMEM */ + return status; /* Back channel */ tbl = &ses->bc_slot_table; - if (tbl->slots == NULL) { - status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0); - if (status) - /* Fore and back channel share a connection so get - * both slot tables or neither */ - nfs4_destroy_slot_tables(ses); - } else - status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0); + status = nfs4_realloc_slot_table(tbl, ses->bc_attrs.max_reqs, 0); + if (status && tbl->slots == NULL) + /* Fore and back channel share a connection so get + * both slot tables or neither */ + nfs4_destroy_slot_tables(ses); return status; } -- cgit v1.2.3-18-g5258 From abe9a6d57b4544ac208401f9c0a4262814db2be4 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 16 Feb 2012 11:17:05 -0500 Subject: NFSv4: fix server_scope memory leak server_scope would never be freed if nfs4_check_cl_exchange_flags() returned non-zero Signed-off-by: Weston Andros Adamson Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b4d67feab90..ec9f6ef6c5d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4883,8 +4883,10 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) clp->cl_rpcclient->cl_auth->au_flavor); res.server_scope = kzalloc(sizeof(struct server_scope), GFP_KERNEL); - if (unlikely(!res.server_scope)) - return -ENOMEM; + if (unlikely(!res.server_scope)) { + status = -ENOMEM; + goto out; + } status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); if (!status) @@ -4901,12 +4903,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) clp->server_scope = NULL; } - if (!clp->server_scope) + if (!clp->server_scope) { clp->server_scope = res.server_scope; - else - kfree(res.server_scope); + goto out; + } } - + kfree(res.server_scope); +out: dprintk("<-- %s status= %d\n", __func__, status); return status; } -- cgit v1.2.3-18-g5258 From 692e5759a43b916f0b66bcb39b2957499992381e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 16 Feb 2012 18:34:36 +0800 Subject: Btrfs: be less strict on finding next node in clear_extent_bit In clear_extent_bit, it is enough that next node is adjacent in tree level. Signed-off-by: Liu Bo --- fs/btrfs/extent_io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 37259ff5cd7..45ca8f9b0d6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -582,8 +582,7 @@ next: if (start <= end && next_node) { state = rb_entry(next_node, struct extent_state, rb_node); - if (state->start == start) - goto hit_next; + goto hit_next; } goto search_again; -- cgit v1.2.3-18-g5258 From 20f12d8ac01917d96860f352f67eddd912df0afb Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Mon, 6 Feb 2012 12:50:07 +0000 Subject: xfs: change available ranges of softlimit and hardlimit in quota check In general, quota allows us to use disk blocks and inodes up to each limit, that is, they are available if they don't exceed their limitations. Current xfs sets their available ranges to lower than them except disk inode quota check. So, this patch changes the ranges to not beyond them. Signed-off-by: Mitsuo Hayasaka Cc: Ben Myers Cc: Alex Elder Cc: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_dquot.c | 24 ++++++++++++------------ fs/xfs/xfs_log_recover.c | 6 +++--- fs/xfs/xfs_qm_syscalls.c | 4 ++-- fs/xfs/xfs_trans_dquot.c | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index cbcb7bea38e..53db20ee3e7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -139,10 +139,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_btimer) { if ((d->d_blk_softlimit && - (be64_to_cpu(d->d_bcount) >= + (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_softlimit))) || (d->d_blk_hardlimit && - (be64_to_cpu(d->d_bcount) >= + (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_btimelimit); @@ -151,10 +151,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_blk_softlimit || - (be64_to_cpu(d->d_bcount) < + (be64_to_cpu(d->d_bcount) <= be64_to_cpu(d->d_blk_softlimit))) && (!d->d_blk_hardlimit || - (be64_to_cpu(d->d_bcount) < + (be64_to_cpu(d->d_bcount) <= be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = 0; } @@ -162,10 +162,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_itimer) { if ((d->d_ino_softlimit && - (be64_to_cpu(d->d_icount) >= + (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_softlimit))) || (d->d_ino_hardlimit && - (be64_to_cpu(d->d_icount) >= + (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_itimelimit); @@ -174,10 +174,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_ino_softlimit || - (be64_to_cpu(d->d_icount) < + (be64_to_cpu(d->d_icount) <= be64_to_cpu(d->d_ino_softlimit))) && (!d->d_ino_hardlimit || - (be64_to_cpu(d->d_icount) < + (be64_to_cpu(d->d_icount) <= be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = 0; } @@ -185,10 +185,10 @@ xfs_qm_adjust_dqtimers( if (!d->d_rtbtimer) { if ((d->d_rtb_softlimit && - (be64_to_cpu(d->d_rtbcount) >= + (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_softlimit))) || (d->d_rtb_hardlimit && - (be64_to_cpu(d->d_rtbcount) >= + (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = cpu_to_be32(get_seconds() + mp->m_quotainfo->qi_rtbtimelimit); @@ -197,10 +197,10 @@ xfs_qm_adjust_dqtimers( } } else { if ((!d->d_rtb_softlimit || - (be64_to_cpu(d->d_rtbcount) < + (be64_to_cpu(d->d_rtbcount) <= be64_to_cpu(d->d_rtb_softlimit))) && (!d->d_rtb_hardlimit || - (be64_to_cpu(d->d_rtbcount) < + (be64_to_cpu(d->d_rtbcount) <= be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = 0; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 15ff5392fb6..0ed9ee77937 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1981,7 +1981,7 @@ xfs_qm_dqcheck( if (!errs && ddq->d_id) { if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) >= + be64_to_cpu(ddq->d_bcount) > be64_to_cpu(ddq->d_blk_softlimit)) { if (!ddq->d_btimer) { if (flags & XFS_QMOPT_DOWARN) @@ -1992,7 +1992,7 @@ xfs_qm_dqcheck( } } if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) >= + be64_to_cpu(ddq->d_icount) > be64_to_cpu(ddq->d_ino_softlimit)) { if (!ddq->d_itimer) { if (flags & XFS_QMOPT_DOWARN) @@ -2003,7 +2003,7 @@ xfs_qm_dqcheck( } } if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) >= + be64_to_cpu(ddq->d_rtbcount) > be64_to_cpu(ddq->d_rtb_softlimit)) { if (!ddq->d_rtbtimer) { if (flags & XFS_QMOPT_DOWARN) diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index eafbcff81f3..711a86e39ff 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -813,11 +813,11 @@ xfs_qm_export_dquot( (XFS_IS_OQUOTA_ENFORCED(mp) && (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { - if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && + if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { ASSERT(dst->d_btimer != 0); } - if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) && + if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { ASSERT(dst->d_itimer != 0); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 4d00ee67792..85255536b4b 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -649,12 +649,12 @@ xfs_trans_dqresv( * nblks. */ if (hardlimit > 0ULL && - hardlimit <= nblks + *resbcountp) { + hardlimit < nblks + *resbcountp) { xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); goto error_return; } if (softlimit > 0ULL && - softlimit <= nblks + *resbcountp) { + softlimit < nblks + *resbcountp) { if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, -- cgit v1.2.3-18-g5258 From c922bbc819324558e61402a7a76c10c550ca61bc Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Mon, 6 Feb 2012 12:50:30 +0000 Subject: xfs: make inode quota check more general The xfs checks quota when reserving disk blocks and inodes. In the block reservation, it checks if the total number of blocks including current usage and new reservation exceed quota. In the inode reservation, it checks using the total number of inodes including only current usage without new reservation. However, this inode quota check works well since the caller of xfs_trans_dquot() always sets the argument of the number of new inode reservation to 1 or 0 and inode is reserved one by one in current xfs. To make it more general, this patch changes it to the same way as the block quota check. Signed-off-by: Mitsuo Hayasaka Cc: Ben Myers Cc: Alex Elder Cc: Christoph Hellwig Reviewed-by: Mark Tinguely Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_trans_dquot.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 85255536b4b..c4ba366d24e 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -677,11 +677,13 @@ xfs_trans_dqresv( if (!softlimit) softlimit = q->qi_isoftlimit; - if (hardlimit > 0ULL && count >= hardlimit) { + if (hardlimit > 0ULL && + hardlimit < ninos + count) { xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); goto error_return; } - if (softlimit > 0ULL && count >= softlimit) { + if (softlimit > 0ULL && + softlimit < ninos + count) { if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { xfs_quota_warn(mp, dqp, -- cgit v1.2.3-18-g5258 From faf309009e2e18d30c032b7d9479f29b91677c37 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 21 Feb 2012 17:24:20 -0800 Subject: sys_poll: fix incorrect type for 'timeout' parameter The 'poll()' system call timeout parameter is supposed to be 'int', not 'long'. Now, the reason this matters is that right now 32-bit compat mode is broken on at least x86-64, because the 32-bit code just calls 'sys_poll()' directly on x86-64, and the 32-bit argument will have been zero-extended, turning a signed 'int' into a large unsigned 'long' value. We could just introduce a 'compat_sys_poll()' function for this, and that may eventually be what we have to do, but since the actual standard poll() semantics is *supposed* to be 'int', and since at least on x86-64 glibc sign-extends the argument before invocing the system call (so nobody can actually use a 64-bit timeout value in user space _anyway_, even in 64-bit binaries), the simpler solution would seem to be to just fix the definition of the system call to match what it should have been from the very start. If it turns out that somebody somehow circumvents the user-level libc 64-bit sign extension and actually uses a large unsigned 64-bit timeout despite that not being how poll() is supposed to work, we will need to do the compat_sys_poll() approach. Reported-by: Thomas Meyer Acked-by: Eric Dumazet Signed-off-by: Linus Torvalds --- fs/select.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index d33418fdc85..e782258d0de 100644 --- a/fs/select.c +++ b/fs/select.c @@ -912,7 +912,7 @@ static long do_restart_poll(struct restart_block *restart_block) } SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, - long, timeout_msecs) + int, timeout_msecs) { struct timespec end_time, *to = NULL; int ret; -- cgit v1.2.3-18-g5258 From 0e37579506b9e0c11a4a95c8df6b156b2d6ccf9b Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Wed, 22 Feb 2012 10:49:58 +0000 Subject: NTFS: Remove unused variable. Found by Coverity software (http://scan.coverity.com). Signed-off-by: Anton Altaparmakov --- fs/ntfs/super.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 5a4a8af5c40..f907611cca7 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,7 +1,7 @@ /* * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2001,2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -1239,7 +1239,6 @@ static int check_windows_hibernation_status(ntfs_volume *vol) { MFT_REF mref; struct inode *vi; - ntfs_inode *ni; struct page *page; u32 *kaddr, *kend; ntfs_name *name = NULL; @@ -1290,7 +1289,6 @@ static int check_windows_hibernation_status(ntfs_volume *vol) "is not the system volume.", i_size_read(vi)); goto iput_out; } - ni = NTFS_I(vi); page = ntfs_map_page(vi->i_mapping, 0); if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); -- cgit v1.2.3-18-g5258 From 45d95bcd7ac961eef26374a0ad6100cda55bcea1 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Wed, 22 Feb 2012 11:15:43 +0000 Subject: NTFS: Do not dereference pointer before checking for NULL. Found by Coverity software (http://scan.coverity.com). Signed-off-by: Anton Altaparmakov --- fs/ntfs/attrib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index f14fde2b03d..e0281992ddc 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1,7 +1,7 @@ /** * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -345,10 +345,10 @@ LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, unsigned long flags; bool is_retry = false; + BUG_ON(!ni); ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", ni->mft_no, (unsigned long long)vcn, write_locked ? "write" : "read"); - BUG_ON(!ni); BUG_ON(!NInoNonResident(ni)); BUG_ON(vcn < 0); if (!ni->runlist.rl) { @@ -469,9 +469,9 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, int err = 0; bool is_retry = false; + BUG_ON(!ni); ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); - BUG_ON(!ni); BUG_ON(!NInoNonResident(ni)); BUG_ON(vcn < 0); if (!ni->runlist.rl) { -- cgit v1.2.3-18-g5258 From fe66a05a06795bd3b788404d69ea7709f46a1609 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Feb 2012 08:40:56 -0500 Subject: Btrfs: improve error handling for btrfs_insert_dir_item callers This allows us to gracefully continue if we aren't able to insert directory items, both for normal files/dirs and snapshots. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 20 +++++++++++++++++++- fs/btrfs/transaction.c | 13 +++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6e0ee9b0d74..cbeb2e36ceb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4585,7 +4585,8 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_insert_dir_item(trans, root, name, name_len, parent_inode, &key, btrfs_inode_type(inode), index); - BUG_ON(ret); + if (ret) + goto fail_dir_item; btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); @@ -4593,6 +4594,23 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, parent_inode); } return ret; + +fail_dir_item: + if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { + u64 local_index; + int err; + err = btrfs_del_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_ino, &local_index, name, name_len); + + } else if (add_backref) { + u64 local_index; + int err; + + err = btrfs_del_inode_ref(trans, root, name, name_len, + ino, parent_ino, &local_index); + } + return ret; } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 287a6728b1a..016977beee5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -915,7 +915,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, BTRFS_FT_DIR, index); - BUG_ON(ret); + if (ret) { + pending->error = -EEXIST; + dput(parent); + goto fail; + } btrfs_i_size_write(parent_inode, parent_inode->i_size + dentry->d_name.len * 2); @@ -993,12 +997,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, { struct btrfs_pending_snapshot *pending; struct list_head *head = &trans->transaction->pending_snapshots; - int ret; - list_for_each_entry(pending, head, list) { - ret = create_pending_snapshot(trans, fs_info, pending); - BUG_ON(ret); - } + list_for_each_entry(pending, head, list) + create_pending_snapshot(trans, fs_info, pending); return 0; } -- cgit v1.2.3-18-g5258 From a6b0d5c8dbfd428717fc4db4c36757783f391c7b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Feb 2012 20:53:43 -0500 Subject: Btrfs: make sure we update latest_bdev When we are setting up the mount, we close all the devices that were not actually part of the metadata we found. But, we don't make sure that one of those devices wasn't fs_devices->latest_bdev, which means we can do a use after free on the one we closed. This updates latest_bdev as it goes. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 ++++++ fs/btrfs/volumes.c | 17 ++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 58d0678dfcb..b801d29f3f1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2305,6 +2305,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_devices); + if (!fs_devices->latest_bdev) { + printk(KERN_CRIT "btrfs: failed to read devices on %s\n", + sb->s_id); + goto fail_tree_roots; + } + retry_root_backup: blocksize = btrfs_level_size(tree_root, btrfs_super_root_level(disk_super)); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cd040bf3fd8..7eefdef8a14 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -459,12 +459,23 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *next; + struct block_device *latest_bdev = NULL; + u64 latest_devid = 0; + u64 latest_transid = 0; + mutex_lock(&uuid_mutex); again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->in_fs_metadata) + if (device->in_fs_metadata) { + if (!latest_transid || + device->generation > latest_transid) { + latest_devid = device->devid; + latest_transid = device->generation; + latest_bdev = device->bdev; + } continue; + } if (device->bdev) { blkdev_put(device->bdev, device->mode); @@ -487,6 +498,10 @@ again: goto again; } + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + mutex_unlock(&uuid_mutex); return 0; } -- cgit v1.2.3-18-g5258 From 16780cabb877dbd0c8c5e9ff9bdebd6c5bdd1a7b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Feb 2012 22:14:55 -0500 Subject: Btrfs: add extra sanity checks on the path names in btrfs_mksubvol Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e9bdb8b783e..05446f77f99 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1333,6 +1333,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, goto out; } + if (name[0] == '.' && + (namelen == 1 || (name[1] == '.' && namelen == 2))) { + ret = -EEXIST; + goto out; + } + if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, NULL, transid, readonly); -- cgit v1.2.3-18-g5258 From 506531905296d6aee84480c879b25ea98c3f9db6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 22 Feb 2012 12:36:24 -0500 Subject: Btrfs: clear the extent uptodate bits during parent transid failures If btrfs reads a block and finds a parent transid mismatch, it clears the uptodate flags on the extent buffer, and the pages inside it. But we only clear the uptodate bits in the state tree if the block straddles more than one page. This is from an old optimization from to reduce contention on the extent state tree. But it is buggy because the code that retries a read from a different copy of the block is going to find the uptodate state bits set and skip the IO. The end result of the bug is that we'll never actually read the good copy (if there is one). The fix here is to always clear the uptodate state bits, which is safe because this code is only called when the parent transid fails. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 45ca8f9b0d6..a55fbe6252d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3871,10 +3871,9 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - if (eb_straddles_pages(eb)) { - clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - cached_state, GFP_NOFS); - } + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + cached_state, GFP_NOFS); + for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if (page) -- cgit v1.2.3-18-g5258 From 5500cdbe14d7435e04f66ff3cfb8ecd8b8e44ebf Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 23 Feb 2012 10:49:04 -0500 Subject: Btrfs: increase the global block reserve estimates When doing IO with large amounts of data fragmentation, the global block reserve calulations are too low. This increases them to avoid ENOSPC crashes. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dc083f55bcf..079e5a1c343 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4108,7 +4108,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) num_bytes += div64_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3); + num_bytes = div64_u64(meta_used, 3) * 2; return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); } -- cgit v1.2.3-18-g5258 From 37fbf4bfb826372c3ca6c09d8a015d1fe9f5e186 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Thu, 23 Feb 2012 23:40:05 +0000 Subject: Restore direct_io / truncate locking API With kernel 3.1, Christoph removed i_alloc_sem and replaced it with calls (namely inode_dio_wait() and inode_dio_done()) which are EXPORT_SYMBOL_GPL() thus they cannot be used by non-GPL file systems and further inode_dio_wait() was pushed from notify_change() into the file system ->setattr() method but no non-GPL file system can make this call. That means non-GPL file systems cannot exist any more unless they do not use any VFS functionality related to reading/writing as far as I can tell or at least as long as they want to implement direct i/o. Both Linus and Al (and others) have said on LKML that this breakage of the VFS API should not have happened and that the change was simply missed as it was not documented in the change logs of the patches that did those changes. This patch changes the two function exports in question to be EXPORT_SYMBOL() thus restoring the VFS API as it used to be - accessible for all modules. Christoph, who introduced the two functions and exported them GPL-only is CC-ed on this patch to give him the opportunity to object to the symbols being changed in this manner if he did indeed intend them to be GPL-only and does not want them to become available to all modules. Signed-off-by: Anton Altaparmakov CC: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/direct-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 4a588dbd11b..f4aadd15b61 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -173,7 +173,7 @@ void inode_dio_wait(struct inode *inode) if (atomic_read(&inode->i_dio_count)) __inode_dio_wait(inode); } -EXPORT_SYMBOL_GPL(inode_dio_wait); +EXPORT_SYMBOL(inode_dio_wait); /* * inode_dio_done - signal finish of a direct I/O requests @@ -187,7 +187,7 @@ void inode_dio_done(struct inode *inode) if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } -EXPORT_SYMBOL_GPL(inode_dio_done); +EXPORT_SYMBOL(inode_dio_done); /* * How many pages are in the queue? -- cgit v1.2.3-18-g5258 From 9b556248ecb059095e000f77c4b84899feb50098 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Fri, 24 Feb 2012 09:17:09 +0000 Subject: NTFS: Correct two spelling errors "dealocate" to "deallocate" in mft.c. From: Masanari Iida Signed-off-by: Anton Altaparmakov --- fs/ntfs/mft.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 382857f9c7d..3014a36a255 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1,7 +1,7 @@ /** * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -1367,7 +1367,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) ntfs_error(vol->sb, "Failed to merge runlists for mft " "bitmap."); if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to dealocate " + ntfs_error(vol->sb, "Failed to deallocate " "allocated cluster.%s", es); NVolSetErrors(vol); } @@ -1805,7 +1805,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) ntfs_error(vol->sb, "Failed to merge runlists for mft data " "attribute."); if (ntfs_cluster_free_from_rl(vol, rl2)) { - ntfs_error(vol->sb, "Failed to dealocate clusters " + ntfs_error(vol->sb, "Failed to deallocate clusters " "from the mft data attribute.%s", es); NVolSetErrors(vol); } -- cgit v1.2.3-18-g5258 From e77266e4c4be6f9dc91bf688bce015a8babd5fe0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 24 Feb 2012 10:39:05 -0500 Subject: Btrfs: fix compiler warnings on 32 bit systems The enospc tracing code added some interesting uses of u64 pointer casts. Signed-off-by: Chris Mason --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/extent-tree.c | 35 +++++++++++++++++++---------------- fs/btrfs/inode-map.c | 6 ++++-- fs/btrfs/transaction.c | 3 ++- 4 files changed, 26 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b669a7d8e49..d986824bb2b 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -644,7 +644,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( static int btrfsic_process_superblock(struct btrfsic_state *state, struct btrfs_fs_devices *fs_devices) { - int ret; + int ret = 0; struct btrfs_super_block *selected_super; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 079e5a1c343..37e0a800d34 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3312,7 +3312,8 @@ commit_trans: } data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 1); + (u64)(unsigned long)data_sinfo, + bytes, 1); spin_unlock(&data_sinfo->lock); return 0; @@ -3333,7 +3334,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) spin_lock(&data_sinfo->lock); data_sinfo->bytes_may_use -= bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", - (u64)data_sinfo, bytes, 0); + (u64)(unsigned long)data_sinfo, + bytes, 0); spin_unlock(&data_sinfo->lock); } @@ -3698,9 +3700,9 @@ again: if (used + orig_bytes <= space_info->total_bytes) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); + "space_info", + (u64)(unsigned long)space_info, + orig_bytes, 1); ret = 0; } else { /* @@ -3769,9 +3771,9 @@ again: if (used + num_bytes < space_info->total_bytes + avail) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, - "space_info", - (u64)space_info, - orig_bytes, 1); + "space_info", + (u64)(unsigned long)space_info, + orig_bytes, 1); ret = 0; } else { wait_ordered = true; @@ -3916,8 +3918,8 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); space_info->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)space_info, - num_bytes, 0); + (u64)(unsigned long)space_info, + num_bytes, 0); space_info->reservation_progress++; spin_unlock(&space_info->lock); } @@ -4135,14 +4137,14 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->reserved += num_bytes; sinfo->bytes_may_use += num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 1); + (u64)(unsigned long)sinfo, num_bytes, 1); } if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_may_use -= num_bytes; trace_btrfs_space_reservation(fs_info, "space_info", - (u64)sinfo, num_bytes, 0); + (u64)(unsigned long)sinfo, num_bytes, 0); sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; @@ -4195,7 +4197,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, if (!trans->bytes_reserved) return; - trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans, + trace_btrfs_space_reservation(root->fs_info, "transaction", + (u64)(unsigned long)trans, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); trans->bytes_reserved = 0; @@ -4713,9 +4716,9 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_reserved += num_bytes; if (reserve == RESERVE_ALLOC) { trace_btrfs_space_reservation(cache->fs_info, - "space_info", - (u64)space_info, - num_bytes, 0); + "space_info", + (u64)(unsigned long)space_info, + num_bytes, 0); space_info->bytes_may_use -= num_bytes; } } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 213ffa86ce1..ee15d88b33d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -438,7 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, trans->bytes_reserved); if (ret) goto out; - trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + (u64)(unsigned long)trans, trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); @@ -500,7 +501,8 @@ again: out_put: iput(inode); out_release: - trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans, + trace_btrfs_space_reservation(root->fs_info, "ino_cache", + (u64)(unsigned long)trans, trans->bytes_reserved, 0); btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 016977beee5..04b77e3ceb7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -327,7 +327,8 @@ again: if (num_bytes) { trace_btrfs_space_reservation(root->fs_info, "transaction", - (u64)h, num_bytes, 1); + (u64)(unsigned long)h, + num_bytes, 1); h->block_rsv = &root->fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; } -- cgit v1.2.3-18-g5258 From d80e731ecab420ddcb79ee9d0ac427acbc187b4b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Feb 2012 20:07:11 +0100 Subject: epoll: introduce POLLFREE to flush ->signalfd_wqh before kfree() This patch is intentionally incomplete to simplify the review. It ignores ep_unregister_pollwait() which plays with the same wqh. See the next change. epoll assumes that the EPOLL_CTL_ADD'ed file controls everything f_op->poll() needs. In particular it assumes that the wait queue can't go away until eventpoll_release(). This is not true in case of signalfd, the task which does EPOLL_CTL_ADD uses its ->sighand which is not connected to the file. This patch adds the special event, POLLFREE, currently only for epoll. It expects that init_poll_funcptr()'ed hook should do the necessary cleanup. Perhaps it should be defined as EPOLLFREE in eventpoll. __cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if ->signalfd_wqh is not empty, we add the new signalfd_cleanup() helper. ep_poll_callback(POLLFREE) simply does list_del_init(task_list). This make this poll entry inconsistent, but we don't care. If you share epoll fd which contains our sigfd with another process you should blame yourself. signalfd is "really special". I simply do not know how we can define the "right" semantics if it used with epoll. The main problem is, epoll calls signalfd_poll() once to establish the connection with the wait queue, after that signalfd_poll(NULL) returns the different/inconsistent results depending on who does EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from sigmask, signalfd has nothing to do with the file, it works with the current thread. In short: this patch is the hack which tries to fix the symptoms. It also assumes that nobody can take tasklist_lock under epoll locks, this seems to be true. Note: - we do not have wake_up_all_poll() but wake_up_poll() is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE. - signalfd_cleanup() uses POLLHUP along with POLLFREE, we need a couple of simple changes in eventpoll.c to make sure it can't be "lost". Reported-by: Maxime Bizon Cc: Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++++ fs/signalfd.c | 11 +++++++++++ 2 files changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aabdfc38cf2..34bbfc6dd8d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -842,6 +842,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + /* the caller holds eppoll_entry->whead->lock */ + if ((unsigned long)key & POLLFREE) + list_del_init(&wait->task_list); + spin_lock_irqsave(&ep->lock, flags); /* diff --git a/fs/signalfd.c b/fs/signalfd.c index 492465b451d..79c1eea98a3 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,17 @@ #include #include +void signalfd_cleanup(struct sighand_struct *sighand) +{ + wait_queue_head_t *wqh = &sighand->signalfd_wqh; + + if (likely(!waitqueue_active(wqh))) + return; + + /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + wake_up_poll(wqh, POLLHUP | POLLFREE); +} + struct signalfd_ctx { sigset_t sigmask; }; -- cgit v1.2.3-18-g5258 From 971316f0503a5c50633d07b83b6db2f15a3a5b00 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Feb 2012 20:07:29 +0100 Subject: epoll: ep_unregister_pollwait() can use the freed pwq->whead signalfd_cleanup() ensures that ->signalfd_wqh is not used, but this is not enough. eppoll_entry->whead still points to the memory we are going to free, ep_unregister_pollwait()->remove_wait_queue() is obviously unsafe. Change ep_poll_callback(POLLFREE) to set eppoll_entry->whead = NULL, change ep_unregister_pollwait() to check pwq->whead != NULL under rcu_read_lock() before remove_wait_queue(). We add the new helper, ep_remove_wait_queue(), for this. This works because sighand_cachep is SLAB_DESTROY_BY_RCU and because ->signalfd_wqh is initialized in sighand_ctor(), not in copy_sighand. ep_unregister_pollwait()->remove_wait_queue() can play with already freed and potentially reused ->sighand, but this is fine. This memory must have the valid ->signalfd_wqh until rcu_read_unlock(). Reported-by: Maxime Bizon Cc: Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 30 +++++++++++++++++++++++++++--- fs/signalfd.c | 6 +++++- 2 files changed, 32 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 34bbfc6dd8d..ea54cdef04d 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -320,6 +320,11 @@ static inline int ep_is_linked(struct list_head *p) return !list_empty(p); } +static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait); +} + /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_t *p) { @@ -467,6 +472,18 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +static void ep_remove_wait_queue(struct eppoll_entry *pwq) +{ + wait_queue_head_t *whead; + + rcu_read_lock(); + /* If it is cleared by POLLFREE, it should be rcu-safe */ + whead = rcu_dereference(pwq->whead); + if (whead) + remove_wait_queue(whead, &pwq->wait); + rcu_read_unlock(); +} + /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held (or "epmutex" if called from @@ -481,7 +498,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) pwq = list_first_entry(lsthead, struct eppoll_entry, llink); list_del(&pwq->llink); - remove_wait_queue(pwq->whead, &pwq->wait); + ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } @@ -842,9 +859,16 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; - /* the caller holds eppoll_entry->whead->lock */ - if ((unsigned long)key & POLLFREE) + if ((unsigned long)key & POLLFREE) { + ep_pwq_from_wait(wait)->whead = NULL; + /* + * whead = NULL above can race with ep_remove_wait_queue() + * which can do another remove_wait_queue() after us, so we + * can't use __remove_wait_queue(). whead->lock is held by + * the caller. + */ list_del_init(&wait->task_list); + } spin_lock_irqsave(&ep->lock, flags); diff --git a/fs/signalfd.c b/fs/signalfd.c index 79c1eea98a3..7ae2a574cb2 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -33,7 +33,11 @@ void signalfd_cleanup(struct sighand_struct *sighand) { wait_queue_head_t *wqh = &sighand->signalfd_wqh; - + /* + * The lockless check can race with remove_wait_queue() in progress, + * but in this case its caller should run under rcu_read_lock() and + * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + */ if (likely(!waitqueue_active(wqh))) return; -- cgit v1.2.3-18-g5258 From a32744d4abae24572eff7269bc17895c41bd0085 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 22 Feb 2012 20:45:44 +0800 Subject: autofs: work around unhappy compat problem on x86-64 When the autofs protocol version 5 packet type was added in commit 5c0a32fc2cd0 ("autofs4: add new packet type for v5 communications"), it obvously tried quite hard to be word-size agnostic, and uses explicitly sized fields that are all correctly aligned. However, with the final "char name[NAME_MAX+1]" array at the end, the actual size of the structure ends up being not very well defined: because the struct isn't marked 'packed', doing a "sizeof()" on it will align the size of the struct up to the biggest alignment of the members it has. And despite all the members being the same, the alignment of them is different: a "__u64" has 4-byte alignment on x86-32, but native 8-byte alignment on x86-64. And while 'NAME_MAX+1' ends up being a nice round number (256), the name[] array starts out a 4-byte aligned. End result: the "packed" size of the structure is 300 bytes: 4-byte, but not 8-byte aligned. As a result, despite all the fields being in the same place on all architectures, sizeof() will round up that size to 304 bytes on architectures that have 8-byte alignment for u64. Note that this is *not* a problem for 32-bit compat mode on POWER, since there __u64 is 8-byte aligned even in 32-bit mode. But on x86, 32-bit and 64-bit alignment is different for 64-bit entities, and as a result the structure that has exactly the same layout has different sizes. So on x86-64, but no other architecture, we will just subtract 4 from the size of the structure when running in a compat task. That way we will write the properly sized packet that user mode expects. Not pretty. Sadly, this very subtle, and unnecessary, size difference has been encoded in user space that wants to read packets of *exactly* the right size, and will refuse to touch anything else. Reported-and-tested-by: Thomas Meyer Signed-off-by: Ian Kent Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 1 + fs/autofs4/dev-ioctl.c | 1 + fs/autofs4/inode.c | 2 ++ fs/autofs4/waitq.c | 22 +++++++++++++++++++--- 4 files changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index d8d8e7ba6a1..eb1cc92cd67 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -110,6 +110,7 @@ struct autofs_sb_info { int sub_version; int min_proto; int max_proto; + int compat_daemon; unsigned long exp_timeout; unsigned int type; int reghost_enabled; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 76741d8d778..85f1fcdb30e 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -385,6 +385,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; + sbi->compat_daemon = is_compat_task(); } out: mutex_unlock(&sbi->wq_mutex); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index e16980b00b8..06858d95512 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "autofs_i.h" #include @@ -224,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) set_autofs_type_indirect(&sbi->type); sbi->min_proto = 0; sbi->max_proto = 0; + sbi->compat_daemon = is_compat_task(); mutex_init(&sbi->wq_mutex); mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index da8876d38a7..9c098db4334 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -91,7 +91,24 @@ static int autofs4_write(struct autofs_sb_info *sbi, return (bytes > 0); } - + +/* + * The autofs_v5 packet was misdesigned. + * + * The packets are identical on x86-32 and x86-64, but have different + * alignment. Which means that 'sizeof()' will give different results. + * Fix it up for the case of running 32-bit user mode on a 64-bit kernel. + */ +static noinline size_t autofs_v5_packet_size(struct autofs_sb_info *sbi) +{ + size_t pktsz = sizeof(struct autofs_v5_packet); +#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT) + if (sbi->compat_daemon > 0) + pktsz -= 4; +#endif + return pktsz; +} + static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) @@ -155,8 +172,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; - pktsz = sizeof(*packet); - + pktsz = autofs_v5_packet_size(sbi); packet->wait_queue_token = wq->wait_queue_token; packet->len = wq->name.len; memcpy(packet->name, wq->name.name, wq->name.len); -- cgit v1.2.3-18-g5258 From 6de2ce423157d06f73d570ef7044f08c2f8697da Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 17 Feb 2012 16:13:30 +0300 Subject: CIFS: Fix mkdir/rmdir bug for the non-POSIX case Currently we do inc/drop_nlink for a parent directory for every mkdir/rmdir calls. That's wrong when Unix extensions are disabled because in this case a server doesn't follow the same semantic and returns the old value on the next QueryInfo request. As the result, we update our value with the server one and then decrement it on every rmdir call - go to negative nlink values. Fix this by removing inc/drop_nlink for the parent directory from mkdir/rmdir, setting it for a revalidation and ignoring NumberOfLinks for directories when Unix extensions are disabled. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a5f54b7d982..745da3d0653 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -534,6 +534,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; fattr->cf_dtype = DT_DIR; + /* + * Server can return wrong NumberOfLinks value for directories + * when Unix extensions are disabled - fake it. + */ + fattr->cf_nlink = 2; } else { fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode; fattr->cf_dtype = DT_REG; @@ -541,9 +546,9 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, /* clear write bits if ATTR_READONLY is set */ if (fattr->cf_cifsattrs & ATTR_READONLY) fattr->cf_mode &= ~(S_IWUGO); - } - fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); + } fattr->cf_uid = cifs_sb->mnt_uid; fattr->cf_gid = cifs_sb->mnt_gid; @@ -1322,7 +1327,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) } /*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need to set uid/gid */ - inc_nlink(inode); cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb); cifs_fill_uniqueid(inode->i_sb, &fattr); @@ -1355,7 +1359,6 @@ mkdir_retry_old: d_drop(direntry); } else { mkdir_get_info: - inc_nlink(inode); if (pTcon->unix_ext) rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, xid); @@ -1436,6 +1439,11 @@ mkdir_get_info: } } mkdir_out: + /* + * Force revalidate to get parent dir info when needed since cached + * attributes are invalid now. + */ + CIFS_I(inode)->time = 0; kfree(full_path); FreeXid(xid); cifs_put_tlink(tlink); @@ -1475,7 +1483,6 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifs_put_tlink(tlink); if (!rc) { - drop_nlink(inode); spin_lock(&direntry->d_inode->i_lock); i_size_write(direntry->d_inode, 0); clear_nlink(direntry->d_inode); @@ -1483,12 +1490,15 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) } cifsInode = CIFS_I(direntry->d_inode); - cifsInode->time = 0; /* force revalidate to go get info when - needed */ + /* force revalidate to go get info when needed */ + cifsInode->time = 0; cifsInode = CIFS_I(inode); - cifsInode->time = 0; /* force revalidate to get parent dir info - since cached search results now invalid */ + /* + * Force revalidate to get parent dir info when needed since cached + * attributes are invalid now. + */ + cifsInode->time = 0; direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); -- cgit v1.2.3-18-g5258 From 5bccda0ebc7c0331b81ac47d39e4b920b198b2cd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 23 Feb 2012 09:37:45 -0500 Subject: cifs: fix dentry refcount leak when opening a FIFO on lookup The cifs code will attempt to open files on lookup under certain circumstances. What happens though if we find that the file we opened was actually a FIFO or other special file? Currently, the open filehandle just ends up being leaked leading to a dentry refcount mismatch and oops on umount. Fix this by having the code close the filehandle on the server if it turns out not to be a regular file. While we're at it, change this spaghetti if statement into a switch too. Cc: stable@vger.kernel.org Reported-by: CAI Qian Tested-by: CAI Qian Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/dir.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 63a196b97d5..bc7e24420ac 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -584,10 +584,26 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, * If either that or op not supported returned, follow * the normal lookup. */ - if ((rc == 0) || (rc == -ENOENT)) + switch (rc) { + case 0: + /* + * The server may allow us to open things like + * FIFOs, but the client isn't set up to deal + * with that. If it's not a regular file, just + * close it and proceed as if it were a normal + * lookup. + */ + if (newInode && !S_ISREG(newInode->i_mode)) { + CIFSSMBClose(xid, pTcon, fileHandle); + break; + } + case -ENOENT: posix_open = true; - else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP)) + case -EOPNOTSUPP: + break; + default: pTcon->broken_posix_open = true; + } } if (!posix_open) rc = cifs_get_inode_info_unix(&newInode, full_path, -- cgit v1.2.3-18-g5258 From 4043b886b0740ded65f633fc4b7225d624c7e658 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 16 Jan 2012 15:46:21 +0000 Subject: GFS2: Fix race between lru_list and glock ref count This patch fixes a narrow race window between the glock ref count hitting zero and glocks being removed from the lru_list. Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 376816fcd04..351a3e79778 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -167,14 +167,19 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl) spin_unlock(&lru_lock); } -static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { - spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); } +} + +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + __gfs2_glock_remove_from_lru(gl); spin_unlock(&lru_lock); } @@ -217,11 +222,12 @@ void gfs2_glock_put(struct gfs2_glock *gl) struct gfs2_sbd *sdp = gl->gl_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); - if (atomic_dec_and_test(&gl->gl_ref)) { + if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { + __gfs2_glock_remove_from_lru(gl); + spin_unlock(&lru_lock); spin_lock_bucket(gl->gl_hash); hlist_bl_del_rcu(&gl->gl_list); spin_unlock_bucket(gl->gl_hash); - gfs2_glock_remove_from_lru(gl); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); GLOCK_BUG_ON(gl, mapping && mapping->nrpages); trace_gfs2_glock_put(gl); -- cgit v1.2.3-18-g5258 From 718b97bd6b03445be53098e3c8f896aeebc304aa Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 16 Feb 2012 11:31:04 -0500 Subject: GFS2: Read in rindex if necessary during unlink This patch fixes a problem whereby you were unable to delete files until other file system operations were done (such as statfs, touch, writes, etc.) that caused the rindex to be read in. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index a7d611b93f0..c87faf48f0a 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1035,14 +1035,19 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct buffer_head *bh; struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; - int error = -EROFS; + int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); - if (!rgd) + if (!rgd) { + error = -EROFS; goto out_inodes; + } gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); -- cgit v1.2.3-18-g5258 From 9e73f571ea3afffca78c1f54128d57796e27532f Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 17 Feb 2012 09:15:52 -0500 Subject: GFS2: Ensure rindex is uptodate for fallocate This patch fixes a problem whereby gfs2_grow was failing and causing GFS2 to assert. The problem was that when GFS2's fallocate operation tried to acquire an "allocation" it made sure the rindex was up to date, and if not, it called gfs2_rindex_update. However, if the file being fallocated was the rindex itself, it was already locked at that point. By calling gfs2_rindex_update at an earlier point in time, we bring rindex up to date and thereby avoid trying to lock it when the "allocation" is acquired. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c5fb3597f69..7f906c8b02a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -772,6 +772,11 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, if (bytes == 0) bytes = sdp->sd_sb.sb_bsize; + error = gfs2_rindex_update(sdp); + if (error) { + fs_warn(sdp, "rindex update returns %d\n", error); + return error; + } gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); error = gfs2_glock_nq(&ip->i_gh); if (unlikely(error)) -- cgit v1.2.3-18-g5258 From a365fbf354907430e6852f0c373b4b3eeff81ba3 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 24 Feb 2012 15:09:14 +0000 Subject: GFS2: Read resource groups on mount This makes mount take slightly longer, but at the same time, the first write to the filesystem will be faster too. It also means that if there is a problem in the resource index, then we can refuse to mount rather than having to try and report that when the first write occurs. In addition, to avoid recursive locking, we hvae to take account of instances when the rindex glock may already be held when we are trying to update the rbtree of resource groups. Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 5 ----- fs/gfs2/inode.c | 14 +++----------- fs/gfs2/ops_fstype.c | 5 +++++ fs/gfs2/rgrp.c | 13 +++++++++---- 4 files changed, 17 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7f906c8b02a..c5fb3597f69 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -772,11 +772,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, if (bytes == 0) bytes = sdp->sd_sb.sb_bsize; - error = gfs2_rindex_update(sdp); - if (error) { - fs_warn(sdp, "rindex update returns %d\n", error); - return error; - } gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); error = gfs2_glock_nq(&ip->i_gh); if (unlikely(error)) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c87faf48f0a..56987460cda 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -391,10 +391,6 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) int error; int dblocks = 1; - error = gfs2_rindex_update(sdp); - if (error) - fs_warn(sdp, "rindex update returns %d\n", error); - error = gfs2_inplace_reserve(dip, RES_DINODE); if (error) goto out; @@ -1035,19 +1031,15 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct buffer_head *bh; struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; - int error; + int error = -EROFS; - error = gfs2_rindex_update(sdp); - if (error) - return error; gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); - if (!rgd) { - error = -EROFS; + if (!rgd) goto out_inodes; - } + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 6aacf3f230a..24f609c9ef9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -800,6 +800,11 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) fs_err(sdp, "can't get quota file inode: %d\n", error); goto fail_rindex; } + + error = gfs2_rindex_update(sdp); + if (error) + goto fail_qinode; + return 0; fail_qinode: diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 981bfa32121..49ada95209d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -683,16 +683,21 @@ int gfs2_rindex_update(struct gfs2_sbd *sdp) struct gfs2_glock *gl = ip->i_gl; struct gfs2_holder ri_gh; int error = 0; + int unlock_required = 0; /* Read new copy from disk if we don't have the latest */ if (!sdp->sd_rindex_uptodate) { mutex_lock(&sdp->sd_rindex_mutex); - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); - if (error) - return error; + if (!gfs2_glock_is_locked_by_me(gl)) { + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); + if (error) + return error; + unlock_required = 1; + } if (!sdp->sd_rindex_uptodate) error = gfs2_ri_update(ip); - gfs2_glock_dq_uninit(&ri_gh); + if (unlock_required) + gfs2_glock_dq_uninit(&ri_gh); mutex_unlock(&sdp->sd_rindex_mutex); } -- cgit v1.2.3-18-g5258 From 164974a8f2a482f1abcb027c6d1a89dd79b14297 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 28 Feb 2012 16:31:12 -0800 Subject: ecryptfs: fix printk format warning for size_t Fix printk format warning (from Linus's suggestion): on i386: fs/ecryptfs/miscdev.c:433:38: warning: format '%lu' expects type 'long unsigned int', but argument 4 has type 'unsigned int' and on x86_64: fs/ecryptfs/miscdev.c:433:38: warning: format '%u' expects type 'unsigned int', but argument 4 has type 'long unsigned int' Signed-off-by: Randy Dunlap Cc: Geert Uytterhoeven Cc: Tyler Hicks Cc: Dustin Kirkland Cc: ecryptfs@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/ecryptfs/miscdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 349209dc6a9..3a06f4043df 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -429,7 +429,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, goto memdup; } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) { printk(KERN_WARNING "%s: Acceptable packet size range is " - "[%d-%lu], but amount of data written is [%zu].", + "[%d-%zu], but amount of data written is [%zu].", __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count); return -EINVAL; } -- cgit v1.2.3-18-g5258 From c8e252586f8d5de906385d8cf6385fee289a825e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 2 Mar 2012 10:43:48 -0800 Subject: regset: Prevent null pointer reference on readonly regsets The regset common infrastructure assumed that regsets would always have .get and .set methods, but not necessarily .active methods. Unfortunately people have since written regsets without .set methods. Rather than putting in stub functions everywhere, handle regsets with null .get or .set methods explicitly. Signed-off-by: H. Peter Anvin Reviewed-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bcb884e2d61..07d096c4992 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1421,7 +1421,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, for (i = 1; i < view->n; ++i) { const struct user_regset *regset = &view->regsets[i]; do_thread_regset_writeback(t->task, regset); - if (regset->core_note_type && + if (regset->core_note_type && regset->get && (!regset->active || regset->active(t->task, regset))) { int ret; size_t size = regset->n * regset->size; -- cgit v1.2.3-18-g5258 From 8966be90304b394fd6a2c5af7b6b3abe2df3889c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 2 Mar 2012 14:23:30 -0800 Subject: vfs: trivial __d_lookup_rcu() cleanups These don't change any semantics, but they clean up the code a bit and mark some arguments appropriately 'const'. They came up as I was doing the word-at-a-time dcache name accessor code, and cleaning this up now allows me to send out a smaller relevant interesting patch for the experimental stuff. Signed-off-by: Linus Torvalds --- fs/dcache.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index fe19ac13f75..138be96e25b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -104,7 +104,7 @@ static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; -static inline struct hlist_bl_head *d_hash(struct dentry *parent, +static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned long hash) { hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; @@ -1717,8 +1717,9 @@ EXPORT_SYMBOL(d_add_ci); * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. */ -struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, - unsigned *seq, struct inode **inode) +struct dentry *__d_lookup_rcu(const struct dentry *parent, + const struct qstr *name, + unsigned *seqp, struct inode **inode) { unsigned int len = name->len; unsigned int hash = name->hash; @@ -1748,6 +1749,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + unsigned seq; struct inode *i; const char *tname; int tlen; @@ -1756,7 +1758,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, continue; seqretry: - *seq = read_seqcount_begin(&dentry->d_seq); + seq = read_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) @@ -1771,7 +1773,7 @@ seqretry: * edge of memory when walking. If we could load this * atomically some other way, we could drop this check. */ - if (read_seqcount_retry(&dentry->d_seq, *seq)) + if (read_seqcount_retry(&dentry->d_seq, seq)) goto seqretry; if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { if (parent->d_op->d_compare(parent, *inode, @@ -1788,6 +1790,7 @@ seqretry: * order to do anything useful with the returned dentry * anyway. */ + *seqp = seq; *inode = i; return dentry; } -- cgit v1.2.3-18-g5258 From 0145acc202ca613b23b5383e55df3c32a92ad1bf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 2 Mar 2012 14:32:59 -0800 Subject: vfs: uninline full_name_hash() .. and also use it in lookup_one_len() rather than open-coding it. There aren't any performance-critical users, so inlining it is silly. But it wouldn't matter if it wasn't for the fact that the word-at-a-time dentry name patches want to conditionally replace the function, and uninlining it sets the stage for that. So again, this is a preparatory patch that doesn't change any semantics, and only prepares for a much cleaner and testable word-at-a-time dentry name accessor patch. Signed-off-by: Linus Torvalds --- fs/namei.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index a780ea515c4..ec72fa1acb1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1374,6 +1374,14 @@ static inline int can_lookup(struct inode *inode) return 1; } +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ + unsigned long hash = init_name_hash(); + while (len--) + hash = partial_name_hash(*name++, hash); + return end_name_hash(hash); +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1775,24 +1783,21 @@ static struct dentry *lookup_hash(struct nameidata *nd) struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; - unsigned long hash; unsigned int c; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); this.name = name; this.len = len; + this.hash = full_name_hash(name, len); if (!len) return ERR_PTR(-EACCES); - hash = init_name_hash(); while (len--) { c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return ERR_PTR(-EACCES); - hash = partial_name_hash(c, hash); } - this.hash = end_name_hash(hash); /* * See if the low-level filesystem might want * to use its own hash.. -- cgit v1.2.3-18-g5258 From 200e9ef7ab51f3dce4f35f90ea458cf43ea83bb8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 2 Mar 2012 14:49:24 -0800 Subject: vfs: split up name hashing in link_path_walk() into helper function The code in link_path_walk() that finds out the length and the hash of the next path component is some of the hottest code in the kernel. And I have a version of it that does things at the full width of the CPU wordsize at a time, but that means that we *really* want to split it up into a separate helper function. So this re-organizes the code a bit and splits the hashing part into a helper function called "hash_name()". It returns the length of the pathname component, while at the same time computing and writing the hash to the appropriate location. The code generation is slightly changed by this patch, but generally for the better - and the added abstraction actually makes the code easier to read too. And the new interface is well suited for replacing just the "hash_name()" function with alternative implementations. Signed-off-by: Linus Torvalds --- fs/namei.c | 52 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index ec72fa1acb1..71807dc7e40 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1382,6 +1382,25 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) return end_name_hash(hash); } +/* + * We know there's a real path component here of at least + * one character. + */ +static inline unsigned long hash_name(const char *name, unsigned int *hashp) +{ + unsigned long hash = init_name_hash(); + unsigned long len = 0, c; + + c = (unsigned char)*name; + do { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } while (c && c != '/'); + *hashp = end_name_hash(hash); + return len; +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1402,31 +1421,22 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { - unsigned long hash; struct qstr this; - unsigned int c; + long len; int type; err = may_lookup(nd); if (err) break; + len = hash_name(name, &this.hash); this.name = name; - c = *(const unsigned char *)name; - - hash = init_name_hash(); - do { - name++; - hash = partial_name_hash(c, hash); - c = *(const unsigned char *)name; - } while (c && (c != '/')); - this.len = name - (const char *) this.name; - this.hash = end_name_hash(hash); + this.len = len; type = LAST_NORM; - if (this.name[0] == '.') switch (this.len) { + if (name[0] == '.') switch (len) { case 2: - if (this.name[1] == '.') { + if (name[1] == '.') { type = LAST_DOTDOT; nd->flags |= LOOKUP_JUMPED; } @@ -1445,12 +1455,18 @@ static int link_path_walk(const char *name, struct nameidata *nd) } } - /* remove trailing slashes? */ - if (!c) + if (!name[len]) goto last_component; - while (*++name == '/'); - if (!*name) + /* + * If it wasn't NUL, we know it was '/'. Skip that + * slash, and continue until no more slashes. + */ + do { + len++; + } while (unlikely(name[len] == '/')); + if (!name[len]) goto last_component; + name += len; err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); if (err < 0) -- cgit v1.2.3-18-g5258 From ae942ae71934fddd0639160c24f6efa703d5784e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 2 Mar 2012 19:40:57 -0800 Subject: vfs: export full_name_hash() function to modules Commit 5707c87f "vfs: uninline full_name_hash()" broke the modular build, because it needs exporting now that it isn't inlined any more. Reported-by: Tetsuo Handa Signed-off-by: Linus Torvalds --- fs/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 71807dc7e40..e2ba62820a0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1381,6 +1381,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len) hash = partial_name_hash(*name++, hash); return end_name_hash(hash); } +EXPORT_SYMBOL(full_name_hash); /* * We know there's a real path component here of at least -- cgit v1.2.3-18-g5258 From d3b010640e5c59b98d3b11229ba4cc2838dc7cbf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 3 Mar 2012 07:41:15 -0500 Subject: btrfs: fix locking issues in find_parent_nodes() - We might unlock head->mutex while it was not locked - We might leave the function without unlocking delayed_refs->lock Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 98f6bf10bbd..0436c12da8c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -583,7 +583,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_key info_key = { 0 }; struct btrfs_delayed_ref_root *delayed_refs = NULL; - struct btrfs_delayed_ref_head *head = NULL; + struct btrfs_delayed_ref_head *head; int info_level = 0; int ret; struct list_head prefs_delayed; @@ -607,6 +607,8 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, * at a specified point in time */ again: + head = NULL; + ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out; @@ -635,8 +637,10 @@ again: goto again; } ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed); - if (ret) + if (ret) { + spin_unlock(&delayed_refs->lock); goto out; + } } spin_unlock(&delayed_refs->lock); -- cgit v1.2.3-18-g5258 From a175423c831ea582c06784d1e172d2ce1d79923a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 28 Feb 2012 12:42:44 -0500 Subject: Btrfs: fix casting error in scrub reada code The reada code from scrub was casting down a u64 to an unsigned long so it could insert it into a radix tree. What it really wanted to do was cast down the result of a shift, instead of casting down the u64. The bug resulted in trying to insert our reada struct into the wrong place, which caused soft lockups and other problems. Signed-off-by: Chris Mason --- fs/btrfs/reada.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 2373b39a132..22db04550f6 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -305,7 +305,7 @@ again: spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, - (unsigned long)zone->end >> PAGE_CACHE_SHIFT, + (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), zone); spin_unlock(&fs_info->reada_lock); -- cgit v1.2.3-18-g5258 From 5483f18e986ed5267b923bec12b407845181350b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 4 Mar 2012 15:51:42 -0800 Subject: vfs: move dentry_cmp from to fs/dcache.c It's only used inside fs/dcache.c, and we're going to play games with it for the word-at-a-time patches. This time we really don't even want to export it, because it really is an internal function to fs/dcache.c, and has been since it was introduced. Having it in that extremely hot header file (it's included in pretty much everything, thanks to ) is a disaster for testing different versions, and is utterly pointless. We really should have some kind of header file diet thing, where we figure out which parts of header files are really better off private and only result in more expensive compiles. Signed-off-by: Linus Torvalds --- fs/dcache.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 138be96e25b..bcbdb33fcc2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -137,6 +137,26 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, } #endif +/* + * Compare 2 name strings, return 0 if they match, otherwise non-zero. + * The strings are both count bytes long, and count is non-zero. + */ +static inline int dentry_cmp(const unsigned char *cs, size_t scount, + const unsigned char *ct, size_t tcount) +{ + if (scount != tcount) + return 1; + + do { + if (*cs != *ct) + return 1; + cs++; + ct++; + tcount--; + } while (tcount); + return 0; +} + static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); -- cgit v1.2.3-18-g5258 From 6414fa6a150111750011f477899d370244da4171 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Mar 2012 06:38:42 +0000 Subject: aout: move setup_arg_pages() prior to reading/mapping the binary Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/binfmt_aout.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index a6395bdb26a..1ff94054d35 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -259,6 +259,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; + retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); + if (retval < 0) { + /* Someone check-me: is this error path enough? */ + send_sig(SIGKILL, current, 0); + return retval; + } + install_exec_creds(bprm); current->flags &= ~PF_FORKNOEXEC; @@ -352,13 +359,6 @@ beyond_if: return retval; } - retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); - return retval; - } - current->mm->start_stack = (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); #ifdef __alpha__ -- cgit v1.2.3-18-g5258 From 880641bb9da2473e9ecf6c708d993b29928c1b3c Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Mon, 5 Mar 2012 14:59:12 -0800 Subject: aio: wake up waiters when freeing unused kiocbs Bart Van Assche reported a hung fio process when either hot-removing storage or when interrupting the fio process itself. The (pruned) call trace for the latter looks like so: fio D 0000000000000001 0 6849 6848 0x00000004 ffff880092541b88 0000000000000046 ffff880000000000 ffff88012fa11dc0 ffff88012404be70 ffff880092541fd8 ffff880092541fd8 ffff880092541fd8 ffff880128b894d0 ffff88012404be70 ffff880092541b88 000000018106f24d Call Trace: schedule+0x3f/0x60 io_schedule+0x8f/0xd0 wait_for_all_aios+0xc0/0x100 exit_aio+0x55/0xc0 mmput+0x2d/0x110 exit_mm+0x10d/0x130 do_exit+0x671/0x860 do_group_exit+0x44/0xb0 get_signal_to_deliver+0x218/0x5a0 do_signal+0x65/0x700 do_notify_resume+0x65/0x80 int_signal+0x12/0x17 The problem lies with the allocation batching code. It will opportunistically allocate kiocbs, and then trim back the list of iocbs when there is not enough room in the completion ring to hold all of the events. In the case above, what happens is that the pruning back of events ends up freeing up the last active request and the context is marked as dead, so it is thus responsible for waking up waiters. Unfortunately, the code does not check for this condition, so we end up with a hung task. Signed-off-by: Jeff Moyer Reported-by: Bart Van Assche Tested-by: Bart Van Assche Cc: [3.2.x only] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 969beb0e223..67e4b9047cc 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -490,6 +490,8 @@ static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch) kmem_cache_free(kiocb_cachep, req); ctx->reqs_active--; } + if (unlikely(!ctx->reqs_active && ctx->dead)) + wake_up_all(&ctx->wait); spin_unlock_irq(&ctx->ctx_lock); } -- cgit v1.2.3-18-g5258 From c415c3b47ea2754659d915cca387a20999044163 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: vfork: introduce complete_vfork_done() No functional changes. Move the clear-and-complete-vfork_done code into the new trivial helper, complete_vfork_done(). Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 92ce83a11e9..dccdcec913e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1915,7 +1915,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct completion *vfork_done; int core_waiters = -EBUSY; init_completion(&core_state->startup); @@ -1934,11 +1933,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) * Make sure nobody is waiting for us to release the VM, * otherwise we can deadlock when we wait on each other */ - vfork_done = tsk->vfork_done; - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + if (tsk->vfork_done) + complete_vfork_done(tsk); if (core_waiters) wait_for_completion(&core_state->startup); -- cgit v1.2.3-18-g5258 From 57b59c4a1400fa6c34764eab2e35a8762dc05a09 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: coredump_wait: don't call complete_vfork_done() Now that CLONE_VFORK is killable, coredump_wait() no longer needs complete_vfork_done(). zap_threads() should find and kill all tasks with the same ->mm, this includes our parent if ->vfork_done is set. mm_release() becomes the only caller, unexport complete_vfork_done(). Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index dccdcec913e..153dee14fe5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1926,19 +1926,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state) core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); - if (unlikely(core_waiters < 0)) - goto fail; - - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - if (tsk->vfork_done) - complete_vfork_done(tsk); - - if (core_waiters) + if (core_waiters > 0) wait_for_completion(&core_state->startup); -fail: + return core_waiters; } -- cgit v1.2.3-18-g5258 From 86b62a2cb4fc09037bbce2959d2992962396fd7f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 7 Mar 2012 05:16:35 +0000 Subject: aio: fix io_setup/io_destroy race Have ioctx_alloc() return an extra reference, so that caller would drop it on success and not bother with re-grabbing it on failure exit. The current code is obviously broken - io_destroy() from another thread that managed to guess the address io_setup() would've returned would free ioctx right under us; gets especially interesting if aio_context_t * we pass to io_setup() points to PROT_READ mapping, so put_user() fails and we end up doing io_destroy() on kioctx another thread has just got freed... Signed-off-by: Al Viro Acked-by: Benjamin LaHaise Reviewed-by: Jeff Moyer Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/aio.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 67e4b9047cc..f6578cb22d0 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -273,7 +273,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) mm = ctx->mm = current->mm; atomic_inc(&mm->mm_count); - atomic_set(&ctx->users, 1); + atomic_set(&ctx->users, 2); spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->ring_info.ring_lock); init_waitqueue_head(&ctx->wait); @@ -1338,10 +1338,10 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) ret = PTR_ERR(ioctx); if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); - if (!ret) + if (!ret) { + put_ioctx(ioctx); return 0; - - get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ + } io_destroy(ioctx); } -- cgit v1.2.3-18-g5258 From c7b285550544c22bc005ec20978472c9ac7138c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Mar 2012 17:51:19 +0000 Subject: aio: fix the "too late munmap()" race Current code has put_ioctx() called asynchronously from aio_fput_routine(); that's done *after* we have killed the request that used to pin ioctx, so there's nothing to stop io_destroy() waiting in wait_for_all_aios() from progressing. As the result, we can end up with async call of put_ioctx() being the last one and possibly happening during exit_mmap() or elf_core_dump(), neither of which expects stray munmap() being done to them... We do need to prevent _freeing_ ioctx until aio_fput_routine() is done with that, but that's all we care about - neither io_destroy() nor exit_aio() will progress past wait_for_all_aios() until aio_fput_routine() does really_put_req(), so the ioctx teardown won't be done until then and we don't care about the contents of ioctx past that point. Since actual freeing of these suckers is RCU-delayed, we don't need to bump ioctx refcount when request goes into list for async removal. All we need is rcu_read_lock held just over the ->ctx_lock-protected area in aio_fput_routine(). Signed-off-by: Al Viro Reviewed-by: Jeff Moyer Acked-by: Benjamin LaHaise Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/aio.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index f6578cb22d0..b9d64d89a04 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -228,12 +228,6 @@ static void __put_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, ctx_rcu_free); } -static inline void get_ioctx(struct kioctx *kioctx) -{ - BUG_ON(atomic_read(&kioctx->users) <= 0); - atomic_inc(&kioctx->users); -} - static inline int try_get_ioctx(struct kioctx *kioctx) { return atomic_inc_not_zero(&kioctx->users); @@ -609,11 +603,16 @@ static void aio_fput_routine(struct work_struct *data) fput(req->ki_filp); /* Link the iocb into the context's free list */ + rcu_read_lock(); spin_lock_irq(&ctx->ctx_lock); really_put_req(ctx, req); + /* + * at that point ctx might've been killed, but actual + * freeing is RCU'd + */ spin_unlock_irq(&ctx->ctx_lock); + rcu_read_unlock(); - put_ioctx(ctx); spin_lock_irq(&fput_lock); } spin_unlock_irq(&fput_lock); @@ -644,7 +643,6 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) * this function will be executed w/out any aio kthread wakeup. */ if (unlikely(!fput_atomic(req->ki_filp))) { - get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); spin_unlock(&fput_lock); -- cgit v1.2.3-18-g5258