From 357b66fdc8ad4cea6e6336956a70742f961f0a4d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:34:34 -0500 Subject: ext4: ext4_split_extent should take care of extent zeroout When ext4_split_extent_at() ends up doing zeroout & conversion to initialized instead of split & conversion, ext4_split_extent() gets confused and can wrongly mark the extent back as uninitialized, leaving the end I/O code confused by large unwritten extents, which may result in data loss. An example of the problematic behavior: lblk len lblk len ext4_split_extent() (ex=[1000,30,uninit], map=[1010,10]) ext4_split_extent_at() (split [1000,30,uninit] at 1020) ext4_ext_insert_extent() -> ENOSPC ext4_ext_zeroout() -> extent [1000,30] is now initialized ext4_split_extent_at() (split [1000,30,init] at 1010, MARK_UNINIT1 | MARK_UNINIT2) -> extent is split and parts marked as uninitialized Fix the problem by rechecking the extent type after the first ext4_split_extent_at() returns. None of the split flags may be applied to an initialized extent, so this patch also adds a BUG_ON to prevent similar issues in the future. TESTCASE: https://github.com/dmonakhov/xfstests/commit/b8a55eb5ce28c6ff29e620ab090902fcd5833597 Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 372b2cbee07..bef194a1443 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2943,6 +2943,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_uninitialized(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3061,19 +3065,26 @@ static int ext4_split_extent(handle_t *handle, if (err) goto out; } - + /* + * Update path is required because previous ext4_split_extent_at() may + * result in split of original leaf or extent zeroout. + */ ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + uninitialized = ext4_ext_is_uninitialized(ex); + split_flag1 = 0; if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_DATA_VALID2); - if (uninitialized) + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (uninitialized) { split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT2); + } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); if (err) -- cgit v1.2.3-70-g09d2 From ec22ba8edb507395c95fbc617eea26a6b2d98797 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:36:06 -0500 Subject: ext4: disable merging of uninitialized extents Derived from Jan's patch: http://permalink.gmane.org/gmane.comp.file-systems.ext4/36470 Merging of uninitialized extents creates all sorts of interesting race possibilities when writeback / DIO races with fallocate. Thus ext4_convert_unwritten_extents_endio() has to deal with a case where the extent to be converted needs to be split out first.
That isn't nice for two reasons: 1) it may need allocation of an extent tree block, so ENOSPC is possible, and 2) it complicates the end_io handling code. So we disable merging of uninitialized extents, which allows us to simplify the code. Extents will get merged after they are converted to initialized ones. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bef194a1443..60818ed1f6a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1584,10 +1584,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, unsigned short ext1_ee_len, ext2_ee_len, max_len; /* - * Make sure that either both extents are uninitialized, or - * both are _not_. + * Make sure that both extents are initialized. We don't merge + * uninitialized extents so that we can be sure that end_io code has + * the extent that was written properly split out and conversion to + * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) return 0; if (ext4_ext_is_uninitialized(ex1)) -- cgit v1.2.3-70-g09d2 From ff95ec22cd7faa0d8b58dcc4207f21502df7b00b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:41:05 -0500 Subject: ext4: add warning to ext4_convert_unwritten_extents_endio Splitting extents inside endio is a bad thing, but unfortunately it is still possible. In fact we are pretty close to the moment when all related issues will be fixed. Let's warn the developer if that is still the case. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 60818ed1f6a..265cb0e50c5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3387,8 +3387,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested then split is required */ + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif err = ext4_split_unwritten_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT); if (err < 0) -- cgit v1.2.3-70-g09d2 From de99fcce1da7933a90198b80a2e896754ea3bdc8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Mar 2013 00:43:32 -0500 Subject: ext4: remove unnecessary wait for extent conversion in ext4_fallocate() Now that we don't merge uninitialized extents anymore, ext4_fallocate() is free to operate on the inode while there are still some extent conversions pending - it won't disturb them in any way.
Reviewed-by: Zheng Liu Reviewed-by: Dmitry Monakhov Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 265cb0e50c5..25c86aaa38d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4392,8 +4392,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (len <= EXT_UNINIT_MAX_LEN << blkbits) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Prevent race condition between unwritten */ - ext4_flush_unwritten_io(inode); retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; -- cgit v1.2.3-70-g09d2 From 6ca470d7b5e7639b7925b3202e796282703b6d5d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:50:47 -0500 Subject: ext4: invalidate extent status tree during extent migration mext_replace_branches() will change the inode's extent layout, so we have to drop the corresponding cache. TESTCASE: the 301st xfstest has not yet been accepted into the official xfstests branch and can be found here: https://github.com/dmonakhov/xfstests/commit/7b7efeee30a41109201e2040034e71db9b66ddc0 Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/move_extent.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index d78c33eed7e..c1f15b203e9 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -666,6 +666,14 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, int replaced_count = 0; int dext_alen; + *err = ext4_es_remove_extent(orig_inode, from, count); + if (*err) + goto out; + + *err = ext4_es_remove_extent(donor_inode, from, count); + if (*err) + goto out; + /* Get the original extent for the block "orig_off" */ *err = get_ext_path(orig_inode, orig_off, &orig_path); if (*err) -- cgit v1.2.3-70-g09d2 From bd384364c1185ecb01f3b8242c915ccb5921c60d Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 20:48:59 -0400 Subject: ext4: avoid a potential overflow in ext4_es_can_be_merged() Check the length of an extent to avoid a potential overflow in ext4_es_can_be_merged().
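For intuition, here is a minimal standalone sketch (userspace C, not part of the patch; the values are illustrative) of why the widening cast in the new check matters: extent lengths are 32-bit quantities, so summing them in 32-bit arithmetic can silently wrap and permit a bogus merge.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t ext4_lblk_t;	/* 32-bit, as in the kernel */

    int main(void)
    {
            ext4_lblk_t len1 = 0x80000000U;	/* 2^31 blocks */
            ext4_lblk_t len2 = 0x80000001U;

            /* 32-bit addition wraps around to 1 ... */
            printf("32-bit sum: %u\n", (unsigned)(len1 + len2));

            /* ... while the patch's 64-bit check catches the overflow */
            if ((uint64_t)len1 + len2 > 0xFFFFFFFFULL)
                    printf("merge rejected: combined length overflows\n");
            return 0;
    }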
Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents_status.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 95796a1b752..37f9a2d8fd0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -333,17 +333,27 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { - if (es1->es_lblk + es1->es_len != es2->es_lblk) + if (ext4_es_status(es1) != ext4_es_status(es2)) return 0; - if (ext4_es_status(es1) != ext4_es_status(es2)) + if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL) return 0; - if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && - (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) + if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk) return 0; - return 1; + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2))) + return 1; + + if (ext4_es_is_hole(es1)) + return 1; + + /* we need to check delayed extent is without unwritten status */ + if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1)) + return 1; + + return 0; } static struct extent_status * -- cgit v1.2.3-70-g09d2 From 921f266bc6bfe6ebb599c559f10443af314c19ec Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 10 Mar 2013 21:01:03 -0400 Subject: ext4: add self-testing infrastructure to do a sanity check This commit adds a self-testing infrastructure, like the one the extent tree has, to sanity-check the extent status tree. Now that the status tree acts as an extent cache, we had better make sure that it caches the right result. After applying this commit, we will get a lot of messages when we run xfstests, as below. ... kernel: ES len assertation failed for inode: 230 retval 1 != map->m_len 3 in ext4_map_blocks (allocation) ... kernel: ES cache assertation failed for inode: 230 es_cached ex [974/2/4781/20] != found ex [974/1/4781/1000] ... kernel: ES insert assertation failed for inode: 635 ex_status [0/45/21388/w] != es_status [44/1/21432/u] ... Signed-off-by: Dmitry Monakhov Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 6 ++ fs/ext4/inode.c | 96 ++++++++++++++++++++++++++ 3 files changed, 277 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 37f9a2d8fd0..d2a8cb74676 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -399,6 +399,179 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) return es; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_es_insert_extent_ext_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_ext_path *path = NULL; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + ext4_fsblk_t ee_start; + unsigned short ee_len; + int depth, ee_status, es_status; + + path = ext4_ext_find_extent(inode, es->es_lblk, NULL); + if (IS_ERR(path)) + return; + + depth = ext_depth(inode); + ex = path[depth].p_ext; + + if (ex) { + + ee_block = le32_to_cpu(ex->ee_block); + ee_start = ext4_ext_pblock(ex); + ee_len = ext4_ext_get_actual_len(ex); + + ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0; + es_status = ext4_es_is_unwritten(es) ?
1 : 0; + + /* + * Make sure ex and es are not overlap when we try to insert + * a delayed/hole extent. + */ + if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { + if (in_range(es->es_lblk, ee_block, ee_len)) { + pr_warn("ES insert assertation failed for " + "inode: %lu we can find an extent " + "at block [%d/%d/%llu/%c], but we " + "want to add an delayed/hole extent " + "[%d/%d/%llu/%llx]\n", + inode->i_ino, ee_block, ee_len, + ee_start, ee_status ? 'u' : 'w', + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + goto out; + } + + /* + * We don't check ee_block == es->es_lblk, etc. because es + * might be a part of whole extent, vice versa. + */ + if (es->es_lblk < ee_block || + ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + goto out; + } + + if (ee_status ^ es_status) { + pr_warn("ES insert assertation failed for inode: %lu " + "ex_status [%d/%d/%llu/%c] != " + "es_status [%d/%d/%llu/%c]\n", inode->i_ino, + ee_block, ee_len, ee_start, + ee_status ? 'u' : 'w', es->es_lblk, es->es_len, + ext4_es_pblock(es), es_status ? 'u' : 'w'); + } + } else { + /* + * We can't find an extent on disk. So we need to make sure + * that we don't want to add an written/unwritten extent. + */ + if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "can't find an extent at block %d but we want " + "to add an written/unwritten extent " + "[%d/%d/%llu/%llx]\n", inode->i_ino, + es->es_lblk, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + } + } +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } +} + +static void ext4_es_insert_extent_ind_check(struct inode *inode, + struct extent_status *es) +{ + struct ext4_map_blocks map; + int retval; + + /* + * Here we call ext4_ind_map_blocks to lookup a block mapping because + * 'Indirect' structure is defined in indirect.c. So we couldn't + * access direct/indirect tree from outside. It is too dirty to define + * this function in indirect.c file. + */ + + map.m_lblk = es->es_lblk; + map.m_len = es->es_len; + + retval = ext4_ind_map_blocks(NULL, inode, &map, 0); + if (retval > 0) { + if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) { + /* + * We want to add a delayed/hole extent but this + * block has been allocated. + */ + pr_warn("ES insert assertation failed for inode: %lu " + "We can find blocks but we want to add a " + "delayed/hole extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } else if (ext4_es_is_written(es)) { + if (retval != es->es_len) { + pr_warn("ES insert assertation failed for " + "inode: %lu retval %d != es_len %d\n", + inode->i_ino, retval, es->es_len); + return; + } + if (map.m_pblk != ext4_es_pblock(es)) { + pr_warn("ES insert assertation failed for " + "inode: %lu m_pblk %llu != " + "es_pblk %llu\n", + inode->i_ino, map.m_pblk, + ext4_es_pblock(es)); + return; + } + } else { + /* + * We don't need to check unwritten extent because + * indirect-based file doesn't have it. 
+ */ + BUG_ON(1); + } + } else if (retval == 0) { + if (ext4_es_is_written(es)) { + pr_warn("ES insert assertation failed for inode: %lu " + "We can't find the block but we want to add " + "an written extent [%d/%d/%llu/%llx]\n", + inode->i_ino, es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); + return; + } + } +} + +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ + /* + * We don't need to worry about the race condition because + * caller takes i_data_sem locking. + */ + BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ext4_es_insert_extent_ext_check(inode, es); + else + ext4_es_insert_extent_ind_check(inode, es); +} +#else +static inline void ext4_es_insert_extent_check(struct inode *inode, + struct extent_status *es) +{ +} +#endif + static int __es_insert_extent(struct inode *inode, struct extent_status *newes) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; @@ -481,6 +654,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_store_status(&newes, status); trace_ext4_es_insert_extent(inode, &newes); + ext4_es_insert_extent_check(inode, &newes); + write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end); if (err != 0) diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f190dfe969d..56140ad4150 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -20,6 +20,12 @@ #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +/* + * With ES_AGGRESSIVE_TEST defined, the result of es caching will be + * checked with old map_block's result. + */ +#define ES_AGGRESSIVE_TEST__ + /* * These flags live in the high bits of extent_status.es_pblk */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 95a0c62c568..3186a43fa4b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -482,6 +482,58 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, return num; } +#ifdef ES_AGGRESSIVE_TEST +static void ext4_map_blocks_es_recheck(handle_t *handle, + struct inode *inode, + struct ext4_map_blocks *es_map, + struct ext4_map_blocks *map, + int flags) +{ + int retval; + + map->m_flags = 0; + /* + * There is a race window that the result is not the same. + * e.g. xfstests #223 when dioread_nolock enables. The reason + * is that we lookup a block mapping in extent status tree with + * out taking i_data_sem. So at the time the unwritten extent + * could be converted. + */ + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + down_read((&EXT4_I(inode)->i_data_sem)); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } else { + retval = ext4_ind_map_blocks(handle, inode, map, flags & + EXT4_GET_BLOCKS_KEEP_SIZE); + } + if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) + up_read((&EXT4_I(inode)->i_data_sem)); + /* + * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag + * because it shouldn't be marked in es_map->m_flags. + */ + map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); + + /* + * We don't check m_len because extent will be collpased in status + * tree. So the m_len might not equal. 
+ */ + if (es_map->m_lblk != map->m_lblk || + es_map->m_flags != map->m_flags || + es_map->m_pblk != map->m_pblk) { + printk("ES cache assertation failed for inode: %lu " + "es_cached ex [%d/%d/%llu/%x] != " + "found ex [%d/%d/%llu/%x] retval %d flags %x\n", + inode->i_ino, es_map->m_lblk, es_map->m_len, + es_map->m_pblk, es_map->m_flags, map->m_lblk, + map->m_len, map->m_pblk, map->m_flags, + retval, flags); + } +} +#endif /* ES_AGGRESSIVE_TEST */ + /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -509,6 +561,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, { struct extent_status es; int retval; +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif map->m_flags = 0; ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," @@ -531,6 +588,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } else { BUG_ON(1); } +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(handle, inode, map, + &orig_map, flags); +#endif goto found; } @@ -551,6 +612,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -643,6 +713,15 @@ found: int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (allocation)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -1768,6 +1847,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); +#ifdef ES_AGGRESSIVE_TEST + struct ext4_map_blocks orig_map; + + memcpy(&orig_map, map, sizeof(*map)); +#endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; @@ -1809,6 +1893,9 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, else BUG_ON(1); +#ifdef ES_AGGRESSIVE_TEST + ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); +#endif return retval; } @@ -1873,6 +1960,15 @@ add_delayed: int ret; unsigned long long status; +#ifdef ES_AGGRESSIVE_TEST + if (retval != map->m_len) { + printk("ES len assertation failed for inode: %lu " + "retval %d != map->m_len %d " + "in %s (lookup)\n", inode->i_ino, retval, + map->m_len, __func__); + } +#endif + status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, -- cgit v1.2.3-70-g09d2 From cdee78433c138c2f2018a6884673739af2634787 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:08:52 -0400 Subject: ext4: fix wrong m_len value after unwritten extent conversion The ext4_ext_handle_uninitialized_extents() function was assuming the return value of ext4_ext_map_blocks() is equal to map->m_len. 
This incorrect assumption was harmless until we started to use the status tree as an extent cache, because we need to update the status tree according to the 'm_len' value. Meanwhile this commit sets the EXT4_MAP_MAPPED flag after unwritten extent conversion. It shouldn't cause a bug because we update the status tree by checking the EXT4_MAP_UNWRITTEN flag, but it should be fixed. After applying this commit, the following error message from the self-testing infrastructure disappears. ... kernel: ES len assertation failed for inode: 230 retval 1 != map->m_len 3 in ext4_map_blocks (allocation) ... Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 25c86aaa38d..110e85a1f82 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3650,6 +3650,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path, map->m_len); } else err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; goto out2; } /* buffered IO case */ -- cgit v1.2.3-70-g09d2 From adb2355104b2109e06ba5276485d187d023b2fd2 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:13:05 -0400 Subject: ext4: update extent status tree after an extent is zeroed out When we try to split an extent, this extent could be zeroed out and marked as initialized instead. But we don't know this in ext4_map_blocks because it only returns the length of the allocated extent. Meanwhile we will mark this extent as uninitialized because we only check m_flags. This commit updates the extent status tree when we try to split an unwritten extent. We don't need to worry about the status of this extent because we always mark it as initialized.
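The idea can be modeled in a few lines of standalone C (a toy sketch with made-up types, not the kernel implementation): once a range has been zeroed out on disk it contains real initialized data, so any cached record covering it must flip from unwritten to written.

    #include <stdio.h>

    enum es_status { ES_WRITTEN, ES_UNWRITTEN };

    struct extent_status {
            unsigned int lblk, len;
            enum es_status status;
    };

    /* zeroout wrote real zeros, so the cached range becomes written data */
    static void es_zeroout(struct extent_status *es)
    {
            if (es->len == 0)
                    return;		/* nothing was zeroed, nothing to update */
            es->status = ES_WRITTEN;
    }

    int main(void)
    {
            struct extent_status es = { 48, 16, ES_UNWRITTEN };

            es_zeroout(&es);
            printf("[%u/%u] is now %s\n", es.lblk, es.len,
                   es.status == ES_WRITTEN ? "written" : "unwritten");
            return 0;
    }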
Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents.c | 35 +++++++++++++++++++++++++++++++---- fs/ext4/extents_status.c | 17 +++++++++++++++++ fs/ext4/extents_status.h | 3 +++ fs/ext4/inode.c | 10 ++++++++++ 4 files changed, 61 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 110e85a1f82..7e37018d175 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2925,7 +2925,7 @@ static int ext4_split_extent_at(handle_t *handle, { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; @@ -2996,12 +2996,26 @@ static int ext4_split_extent_at(handle_t *handle, err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { - if (split_flag & EXT4_EXT_DATA_VALID1) + if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); - else + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex2); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { err = ext4_ext_zeroout(inode, ex); - } else + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } if (err) goto fix_extent_len; @@ -3009,6 +3023,12 @@ static int ext4_split_extent_at(handle_t *handle, ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_es_zeroout(inode, &zero_ex); + goto out; } else if (err) goto fix_extent_len; @@ -3150,6 +3170,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3247,6 +3268,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = ext4_ext_zeroout(inode, ex); if (err) goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3305,6 +3329,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = allocated; out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_es_zeroout(inode, &zero_ex); return err ? 
err : allocated; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d2a8cb74676..fe3337a85ed 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -854,6 +854,23 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } +int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + ext4_lblk_t ee_block; + ext4_fsblk_t ee_pblock; + unsigned int ee_len; + + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext4_ext_pblock(ex); + + if (ee_len == 0) + return 0; + + return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); +} + static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct ext4_sb_info *sbi = container_of(shrink, diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 56140ad4150..d8e2d4dc311 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -39,6 +39,8 @@ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE) +struct ext4_extent; + struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ @@ -64,6 +66,7 @@ extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex); static inline int ext4_es_is_written(struct extent_status *es) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3186a43fa4b..4f1d54a88d8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -722,6 +722,15 @@ found: } #endif + /* + * If the extent has been zeroed out, we don't need to update + * extent status tree. + */ + if ((flags & EXT4_GET_BLOCKS_PRE_IO) && + ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es)) + goto has_zeroout; + } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && @@ -734,6 +743,7 @@ found: retval = ret; } +has_zeroout: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); -- cgit v1.2.3-70-g09d2 From 3a2256702e47f68f921dfad41b1764d05c572329 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:20:23 -0400 Subject: ext4: fix the wrong number of the allocated blocks in ext4_split_extent() This commit fixes a wrong return value of the number of the allocated blocks in ext4_split_extent. When the length of blocks we want to allocate is greater than the length of the current extent, we return a wrong number. Let's see what happens in the following case when we call ext4_split_extent(). map: [48, 72] ex: [32, 64, u] 'ex' will be split into two parts: ex1: [32, 47, u] ex2: [48, 64, w] 'map->m_len' is returned from this function, and the value is 24. But the real length is 16. So it should be fixed. Meanwhile in this commit we use right length of the allocated blocks when get_reserved_cluster_alloc in ext4_ext_handle_uninitialized_extents is called. 
Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7e37018d175..69df02ff96a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3067,6 +3067,7 @@ static int ext4_split_extent(handle_t *handle, int err = 0; int uninitialized; int split_flag1, flags1; + int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3086,6 +3087,8 @@ static int ext4_split_extent(handle_t *handle, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; + } else { + allocated = ee_len - (map->m_lblk - ee_block); } /* * Update path is required because previous ext4_split_extent_at() may @@ -3115,7 +3118,7 @@ static int ext4_split_extent(handle_t *handle, ext4_ext_show_leaf(inode, path); out: - return err ? err : map->m_len; + return err ? err : allocated; } /* @@ -3730,6 +3733,7 @@ out: allocated - map->m_len); allocated = map->m_len; } + map->m_len = allocated; /* * If we have done fallocate with the offset that is already -- cgit v1.2.3-70-g09d2 From e1c36595bedc2e1b4112f01256cb30f4d9f9ae46 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 10 Mar 2013 22:19:00 -0400 Subject: ext4: fix WARN_ON from ext4_releasepage() ext4_releasepage() warns when it is passed a page with PageChecked set. However this can correctly happen when invalidate_inode_pages2_range() invalidates pages - and we should fail the release in that case. Since the page was dirty anyway, it won't be discarded and no harm has happened but it's good to be safe. Also remove bogus page_has_buffers() check - we are guaranteed page has buffers in this function. Reported-by: Zheng Liu Tested-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Signed-off-by: Jan Kara --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4f1d54a88d8..117a9e7aa4a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3018,8 +3018,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) trace_ext4_releasepage(page); - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) + /* Page has dirty journalled data -> cannot release */ + if (PageChecked(page)) return 0; if (journal) return jbd2_journal_try_to_free_buffers(journal, page, wait); -- cgit v1.2.3-70-g09d2 From e3d85c366089015805f175324bb1780249f44669 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:21:49 -0400 Subject: ext4: remove unused variable in ext4_free_blocks() Remove unused variable 'freed' in ext4_free_blocks(). 
Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7bb713a46fe..75e05f3a730 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4464,7 +4464,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; - unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; @@ -4672,8 +4671,6 @@ do_more: ext4_mb_unload_buddy(&e4b); - freed += count; - if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); -- cgit v1.2.3-70-g09d2 From bb8b20ed94bc69120e31399c43cb336300dea109 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:28:09 -0400 Subject: ext4: do not use yield() Using yield() is strongly discouraged (see sched/core.c), especially since we can just use cond_resched(). Replace all uses of yield() with cond_resched(). Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- fs/ext4/mballoc.c | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 117a9e7aa4a..48fc023ab0a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1352,7 +1352,7 @@ repeat: ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - yield(); + cond_resched(); goto repeat; } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 75e05f3a730..8b2ea9f7500 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3692,11 +3692,7 @@ repeat: if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); - /* - * Yield the CPU here so that we don't get soft lockup - * in non preempt case. - */ - yield(); + cond_resched(); goto repeat; } @@ -4246,7 +4242,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ - yield(); + cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { -- cgit v1.2.3-70-g09d2 From 232ec8720d4e45405e37144c67053042c6b886d3 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:46:30 -0400 Subject: ext4: update reserved space after the 'correction' Currently in ext4_ext_map_blocks() in delayed allocation writeback we would update the reservation and after that check whether we claimed a cluster outside of the range of the allocation and if so, we'll give the block back to the reservation pool. However this also means that if the number of reserved data blocks dropped to zero before the correction, we would release all the metadata reservation as well, however we might still need it because we're not done with the delayed allocation and there might be more blocks to come. This will result in error messages such as: EXT4-fs warning (device sdb): ext4_da_update_reserve_space:361: ino 12, allocated 1 with only 0 reserved metadata blocks (releasing 1 blocks with reserved 1 data blocks) This will only happen on a bigalloc file system and it can be easily reproduced using fiemap-tester from xfstests like this: ./src/fiemap-tester -m DHDHDHDHD -S -p0 /mnt/test/file Or using xfstests such as 225.
Fix this by doing the correction first and updating the reservation after that so that we do not accidentally decrease i_reserved_data_blocks to zero. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 69df02ff96a..bd69e906bd9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4165,9 +4165,6 @@ got_allocated_blocks: } } else { BUG_ON(allocated_clusters < reserved_clusters); - /* We will claim quota for all newly allocated blocks.*/ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - @@ -4218,6 +4215,15 @@ got_allocated_blocks: ei->i_reserved_data_blocks += reservation; spin_unlock(&ei->i_block_reservation_lock); } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); } } -- cgit v1.2.3-70-g09d2 From 386ad67c9ac043890121c066186883d1640348a4 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:50:00 -0400 Subject: ext4: reserve metadata block for every delayed write Currently we only reserve space (data+metadata) in delayed allocation if we're allocating from a new cluster (which is always the case on a non-bigalloc file system), which is ok for data blocks, because we reserve the whole cluster. However we have to reserve metadata for every delayed block we're going to write because every block could potentially require a metadata block when we need to grow the extent tree. Signed-off-by: Lukas Czerner --- fs/ext4/inode.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 48fc023ab0a..65bbc9339ac 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1304,6 +1304,55 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } +/* + * Reserve a metadata for a single block located at lblock + */ +static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) +{ + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int md_needed; + ext4_lblk_t save_last_lblock; + int save_len; + + /* + * recalculate the amount of metadata blocks to reserve + * in order to allocate nrblocks + * worse case is one extent per block + */ +repeat: + spin_lock(&ei->i_block_reservation_lock); + /* + * ext4_calc_metadata_amount() has side effects, which we have + * to be prepared undo if we fail to claim space. + */ + save_len = ei->i_da_metadata_calc_len; + save_last_lblock = ei->i_da_metadata_calc_last_lblock; + md_needed = EXT4_NUM_B2C(sbi, + ext4_calc_metadata_amount(inode, lblock)); + trace_ext4_da_reserve_space(inode, md_needed); + + /* + * We do still charge estimated metadata to the sb though; + * we cannot afford to run out of free blocks.
+ */ + if (ext4_claim_free_clusters(sbi, md_needed, 0)) { + ei->i_da_metadata_calc_len = save_len; + ei->i_da_metadata_calc_last_lblock = save_last_lblock; + spin_unlock(&ei->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + cond_resched(); + goto repeat; + } + return -ENOSPC; + } + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ +} + /* * Reserve a single cluster located at lblock */ @@ -1940,8 +1989,11 @@ add_delayed: * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - /* If the block was allocated from previously allocated cluster, - * then we dont need to reserve it again. */ + /* + * If the block was allocated from previously allocated cluster, + * then we don't need to reserve it again. However we still need + * to reserve metadata for every block we're going to write. + */ if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { ret = ext4_da_reserve_space(inode, iblock); if (ret) { @@ -1949,6 +2001,13 @@ add_delayed: retval = ret; goto out_unlock; } + } else { + ret = ext4_da_reserve_metadata(inode, iblock); + if (ret) { + /* not enough space to reserve */ + retval = ret; + goto out_unlock; + } } ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, -- cgit v1.2.3-70-g09d2 From ad56edad089b56300fd13bb9eeb7d0424d978239 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Mar 2013 13:24:56 -0400 Subject: jbd2: fix use after free in jbd2_journal_dirty_metadata() jbd2_journal_dirty_metadata() didn't get a reference to the journal_head it was working with. This is OK in most of the cases since the journal head should be attached to a transaction but in rare occasions when we are journalling data, __ext4_journalled_writepage() can race with jbd2_journal_invalidatepage() stripping buffers from a page and thus the journal head can be freed out from under jbd2_journal_dirty_metadata(). Fix the problem by getting our own journal head reference in jbd2_journal_dirty_metadata() (and also in jbd2_journal_set_triggers() which can possibly have the same issue).
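The discipline the fix applies can be sketched generically (hypothetical names and a userspace toy, not the jbd2 API, and without the locking the real race needs): take your own counted reference before touching the object, and treat a failed grab as "the object is already gone".

    #include <stdio.h>
    #include <stdlib.h>

    struct jh {			/* stand-in for struct journal_head */
            int refcount;
    };

    static struct jh *grab(struct jh *j)	/* like grab_journal_head() */
    {
            if (!j)
                    return NULL;	/* raced with teardown: already gone */
            j->refcount++;
            return j;
    }

    static void put(struct jh *j)		/* like put_journal_head() */
    {
            if (--j->refcount == 0)
                    free(j);
    }

    static void dirty_metadata(struct jh *maybe)
    {
            struct jh *j = grab(maybe);	/* pin it before touching it */

            if (!j) {
                    puts("bailing out: buffer lost its journal head");
                    return;
            }
            /* ... safe to work with j here ... */
            put(j);				/* drop our reference when done */
    }

    int main(void)
    {
            struct jh *j = calloc(1, sizeof(*j));

            j->refcount = 1;
            dirty_metadata(j);
            put(j);				/* final put frees the object */
            return 0;
    }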
Reported-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/jbd2/transaction.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6ee5aed56b..325bc019ed8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1065,9 +1065,12 @@ out: void jbd2_journal_set_triggers(struct buffer_head *bh, struct jbd2_buffer_trigger_type *type) { - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh = jbd2_journal_grab_journal_head(bh); + if (WARN_ON(!jh)) + return; jh->b_triggers = type; + jbd2_journal_put_journal_head(jh); } void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, @@ -1119,17 +1122,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - struct journal_head *jh = bh2jh(bh); + struct journal_head *jh; int ret = 0; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); if (is_handle_aborted(handle)) goto out; - if (!buffer_jbd(bh)) { + jh = jbd2_journal_grab_journal_head(bh); + if (!jh) { ret = -EUCLEAN; goto out; } + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); jbd_lock_bh_state(bh); @@ -1220,6 +1224,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); + jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); WARN_ON(ret); /* All errors are bugs, so dump the stack */ -- cgit v1.2.3-70-g09d2 From 90ba983f6889e65a3b506b30dc606aa9d1d46cd2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 11 Mar 2013 23:39:59 -0400 Subject: ext4: use atomic64_t for the per-flexbg free_clusters count A user using an 8TB+ file system with a very large flexbg size (> 65536) could cause the atomic_t used in the struct flex_groups to overflow. This was detected by the PaX security patchset: http://forums.grsecurity.net/viewtopic.php?f=3&t=3289&p=12551#p12551 This bug was introduced in commit 9f24e4208f7e, so it's been around since 2.6.30. :-( Fix this by using an atomic64_t for struct orlov_stats's free_clusters.
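Back-of-the-envelope arithmetic (a standalone sketch assuming a 4 KiB block size; the numbers are illustrative) shows why a signed 32-bit counter is too small once a single flex group aggregates the free clusters of a large slice of an 8TB+ file system.

    #include <stdio.h>

    int main(void)
    {
            unsigned long long fs_size = 8ULL << 40;	/* 8 TiB */
            unsigned long long block  = 4096;		/* 4 KiB blocks */
            unsigned long long blocks = fs_size / block;	/* 2^31 */

            /* atomic_t is a signed 32-bit counter: INT_MAX = 2^31 - 1 */
            printf("blocks = %llu, INT_MAX = %d\n", blocks, 0x7fffffff);
            if (blocks > 0x7fffffffULL)
                    printf("a per-flexbg atomic_t free-cluster count can overflow\n");
            return 0;
    }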
Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 6 +++--- fs/ext4/ialloc.c | 4 ++-- fs/ext4/mballoc.c | 12 ++++++------ fs/ext4/resize.c | 4 ++-- fs/ext4/super.c | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4a01ba31526..167ff564bbf 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -335,9 +335,9 @@ struct ext4_group_desc */ struct flex_groups { - atomic_t free_inodes; - atomic_t free_clusters; - atomic_t used_dirs; + atomic64_t free_clusters; + atomic_t free_inodes; + atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 32fd2b9075d..6c5bb8d993f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -324,8 +324,8 @@ error_return: } struct orlov_stats { + __u64 free_clusters; __u32 free_inodes; - __u32 free_clusters; __u32 used_dirs; }; @@ -342,7 +342,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, if (flex_size > 1) { stats->free_inodes = atomic_read(&flex_group[g].free_inodes); - stats->free_clusters = atomic_read(&flex_group[g].free_clusters); + stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); stats->used_dirs = atomic_read(&flex_group[g].used_dirs); return; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8b2ea9f7500..ee6614bdb63 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2804,8 +2804,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); - atomic_sub(ac->ac_b_ex.fe_len, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_sub(ac->ac_b_ex.fe_len, + &sbi->s_flex_groups[flex_group].free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -4661,8 +4661,8 @@ do_more: if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(count_clusters, - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(count_clusters, + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); @@ -4804,8 +4804,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic_add(EXT4_NUM_B2C(sbi, blocks_freed), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed), + &sbi->s_flex_groups[flex_group].free_clusters); } ext4_mb_unload_buddy(&e4b); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b2c8ee56eb9..c169477a62c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1360,8 +1360,8 @@ static void ext4_update_super(struct super_block *sb, sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); - atomic_add(EXT4_NUM_B2C(sbi, free_blocks), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, &sbi->s_flex_groups[flex_group].free_inodes); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9379b7fbfd9..d1ee6a84338 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1923,8 +1923,8 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group = ext4_flex_group(sbi, i); atomic_add(ext4_free_inodes_count(sb, gdp), 
&sbi->s_flex_groups[flex_group].free_inodes); - atomic_add(ext4_free_group_clusters(sb, gdp), - &sbi->s_flex_groups[flex_group].free_clusters); + atomic64_add(ext4_free_group_clusters(sb, gdp), + &sbi->s_flex_groups[flex_group].free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &sbi->s_flex_groups[flex_group].used_dirs); } -- cgit v1.2.3-70-g09d2 From 4f42f80a8f08d4c3f52c4267361241885d5dee3a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 12 Mar 2013 12:40:04 -0400 Subject: ext4: use s_extent_max_zeroout_kb value as number of kb Currently, when converting an extent to initialized, we have to decide whether to zeroout part/all of the uninitialized extent in order to avoid the extent tree growing rapidly. The decision is made by comparing the size of the extent with the configurable value s_extent_max_zeroout_kb, which is in kibibyte units. However, when converting it to a number of blocks we currently use it as if it were in bytes. This is obviously a bug and it will result in ext4 _never_ zeroing out extents, but rather always splitting and converting parts to initialized while leaving the rest uninitialized, in the default setting. Fix this by using s_extent_max_zeroout_kb as kibibytes. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bd69e906bd9..e2bb929bea9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3264,7 +3264,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> - inode->i_sb->s_blocksize_bits; + (inode->i_sb->s_blocksize_bits - 10); /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { -- cgit v1.2.3-70-g09d2 From 47c78f4a70d791ff44cab3254b489605a52e3181 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Mon, 11 Mar 2013 13:08:49 +0000 Subject: cifs: map NT_STATUS_SHARING_VIOLATION to EBUSY instead of ETXTBSY NT_SHARING_VIOLATION errors are mapped to ETXTBSY which is unexpected for operations such as unlink where we can hit these errors. The patch maps the error NT_SHARING_VIOLATION to EBUSY instead. The patch also replaces all instances of ETXTBSY in cifs_rename_pending_delete() with EBUSY.
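For illustration only, the mapping the patch touches boils down to a table lookup; a minimal userspace model follows (the ERRbadshare value here is made up for the sketch, not the real SMB constant table).

    #include <errno.h>
    #include <stdio.h>

    struct smb_to_posix_error {
            int smb_err;
            int posix_err;
    };

    #define ERRbadshare 32	/* illustrative value only */

    static const struct smb_to_posix_error mapping_table[] = {
            /* after the patch: a sharing violation maps to "device busy" */
            { ERRbadshare, -EBUSY },	/* was -ETXTBSY */
    };

    int main(void)
    {
            printf("ERRbadshare -> %d (EBUSY is %d)\n",
                   mapping_table[0].posix_err, EBUSY);
            return 0;
    }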
Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 10 ++++------ fs/cifs/netmisc.c | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 0079696305c..20887bf6312 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1043,7 +1043,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_setattr; } @@ -1062,7 +1062,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, if (rc == -ENOENT) rc = 0; else if (rc != 0) { - rc = -ETXTBSY; + rc = -EBUSY; goto undo_rename; } cifsInode->delete_pending = true; @@ -1169,15 +1169,13 @@ psx_del_no_retry: cifs_drop_nlink(inode); } else if (rc == -ENOENT) { d_drop(dentry); - } else if (rc == -ETXTBSY) { + } else if (rc == -EBUSY) { if (server->ops->rename_pending_delete) { rc = server->ops->rename_pending_delete(full_path, dentry, xid); if (rc == 0) cifs_drop_nlink(inode); } - if (rc == -ETXTBSY) - rc = -EBUSY; } else if ((rc == -EACCES) && (dosattr == 0) && inode) { attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (attrs == NULL) { @@ -1518,7 +1516,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, * source. Note that cross directory moves do not work with * rename by filehandle to various Windows servers. */ - if (rc == 0 || rc != -ETXTBSY) + if (rc == 0 || rc != -EBUSY) goto do_rename_exit; /* open-file renames don't work across directories */ diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index a82bc51fdc8..c0b25b28be6 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -62,7 +62,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { {ERRdiffdevice, -EXDEV}, {ERRnofiles, -ENOENT}, {ERRwriteprot, -EROFS}, - {ERRbadshare, -ETXTBSY}, + {ERRbadshare, -EBUSY}, {ERRlock, -EACCES}, {ERRunsup, -EINVAL}, {ERRnosuchshare, -ENXIO}, -- cgit v1.2.3-70-g09d2 From 24261fc23db950951760d00c188ba63cc756b932 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 8 Mar 2013 16:30:03 +0100 Subject: cifs: delay super block destruction until all cifsFileInfo objects are gone cifsFileInfo objects hold references to dentries and it is possible that these will still be around in workqueues when VFS decides to kill super block during unmount. This results in panics like this one: BUG: Dentry ffff88001f5e76c0{i=66b4a,n=1M-2} still in use (1) [unmount of cifs cifs] ------------[ cut here ]------------ kernel BUG at fs/dcache.c:943! [..] Process umount (pid: 1781, threadinfo ffff88003d6e8000, task ffff880035eeaec0) [..] Call Trace: [] shrink_dcache_for_umount+0x33/0x60 [] generic_shutdown_super+0x2c/0xe0 [] kill_anon_super+0x16/0x30 [] cifs_kill_sb+0x1a/0x30 [cifs] [] deactivate_locked_super+0x57/0x80 [] deactivate_super+0x4e/0x70 [] mntput_no_expire+0xd7/0x130 [] sys_umount+0x9c/0x3c0 [] system_call_fastpath+0x16/0x1b Fix this by making each cifsFileInfo object hold a reference to cifs super block, which implicitly keeps VFS super block around as well. 
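The refcounting trick can be modeled with plain counters (a userspace toy, not the VFS API): the per-superblock cifs counter translates any number of open file handles into exactly one extra reference on the VFS super block, so it cannot be torn down while a cifsFileInfo survives.

    #include <stdio.h>

    static int s_active = 1;	/* stand-in for the VFS sb refcount */
    static int cifs_active;	/* the counter the patch introduces */

    static void cifs_sb_active(void)
    {
            if (++cifs_active == 1)
                    s_active++;	/* first file handle pins the sb once */
    }

    static void cifs_sb_deactive(void)
    {
            if (--cifs_active == 0)
                    s_active--;	/* last file handle drops that pin */
    }

    int main(void)
    {
            cifs_sb_active();	/* e.g. two open files on the mount */
            cifs_sb_active();
            cifs_sb_deactive();
            cifs_sb_deactive();
            printf("s_active back to %d\n", s_active);
            return 0;
    }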
Signed-off-by: Mateusz Guzik Reviewed-by: Jeff Layton Cc: Reported-and-Tested-by: Ben Greear Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 24 ++++++++++++++++++++++++ fs/cifs/cifsfs.h | 4 ++++ fs/cifs/file.c | 6 +++++- 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1a052c0eee8..054b90b682a 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -91,6 +91,30 @@ struct workqueue_struct *cifsiod_wq; __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE]; #endif +/* + * Bumps refcount for cifs super block. + * Note that it should be only called if a referece to VFS super block is + * already held, e.g. in open-type syscalls context. Otherwise it can race with + * atomic_dec_and_test in deactivate_locked_super. + */ +void +cifs_sb_active(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); +} + +void +cifs_sb_deactive(struct super_block *sb) +{ + struct cifs_sb_info *server = CIFS_SB(sb); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); +} + static int cifs_read_super(struct super_block *sb) { diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7163419cecd..0e32c3446ce 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,6 +41,10 @@ extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; extern const struct address_space_operations cifs_addr_ops_smallbuf; +/* Functions related to super block operations */ +extern void cifs_sb_active(struct super_block *sb); +extern void cifs_sb_deactive(struct super_block *sb); + /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c0d8557731..7a0dd99e450 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -300,6 +300,8 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + cifs_sb_active(inode->i_sb); + /* * If the server returned a read oplock and we have mandatory brlocks, * set oplock level to None. @@ -349,7 +351,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifsLockInfo *li, *tmp; struct cifs_fid fid; struct cifs_pending_open open; @@ -414,6 +417,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) cifs_put_tlink(cifs_file->tlink); dput(cifs_file->dentry); + cifs_sb_deactive(sb); kfree(cifs_file); } -- cgit v1.2.3-70-g09d2 From 67e753ca41782913d805ff4a8a2b0f60b26b7915 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 14 Mar 2013 10:49:23 +0200 Subject: UBIFS: make space fixup work in the remount case The UBIFS space fixup is a useful feature which allows fixing up the "broken" flash space at the time of the first mount. The "broken" space is usually the result of using a "dumb" industrial flasher which is not able to skip empty NAND pages and just writes all 0xFFs to the empty space, which has grave side-effects for UBIFS when UBIFS tries to write useful data to those empty pages. The fix-up feature works roughly like this: 1.
mkfs.ubifs sets the fixup flag in UBIFS superblock when creating the image (see -F option) 2. when the file-system is mounted for the first time, UBIFS notices the fixup flag and re-writes the entire media atomically, which may take quite a lot of time. 3. UBIFS clears the fixup flag in the superblock. This works fine when the file system is mounted R/W for the very first time. But it did not really work in the case when we first mount the file-system R/O, and then re-mount R/W. The reason was that we started the fixup procedure too late; the fixup has to happen before the space starts being used. Signed-off-by: Artem Bityutskiy Reported-by: Mark Jackson Cc: stable@vger.kernel.org # 3.0+ --- fs/ubifs/super.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ac838b84493..f21acf0ef01 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1568,6 +1568,12 @@ static int ubifs_remount_rw(struct ubifs_info *c) c->remounting_rw = 1; c->ro_mount = 0; + if (c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + return err; + } + err = check_free_space(c); if (err) goto out; @@ -1684,12 +1690,6 @@ static int ubifs_remount_rw(struct ubifs_info *c) err = dbg_check_space_info(c); } - if (c->space_fixup) { - err = ubifs_fixup_free_space(c); - if (err) - goto out; - } - mutex_unlock(&c->umount_mutex); return err; -- cgit v1.2.3-70-g09d2 From 0e401101db49959f5783f6ee9e676124b5a183ac Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 18 Mar 2013 11:40:19 -0400 Subject: ext4: fix memory leakage in mext_check_coverage The regression was introduced by the following commit: 8c854473 TESTCASE (git://oss.sgi.com/xfs/cmds/xfstests.git): #while true;do ./check 301 || break ;done Also fix potential memory leakage in get_ext_path() once ext4_ext_find_extent() has failed.
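The shape of the get_ext_path() fix, reduced to a minimal self-contained sketch with hypothetical names: the out parameter is assigned only after the result has been fully validated, so an error path can never leave the caller holding a half-valid pointer.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical lookup; returns NULL on failure. */
static int *find_extent(int key)
{
    return key >= 0 ? malloc(sizeof(int)) : NULL;
}

static int get_path(int key, int **orig_path)
{
    int *path = find_extent(key);

    if (!path)
        return -1;      /* *orig_path is left untouched */
    *orig_path = path;  /* published only on success */
    return 0;
}

int main(void)
{
    int *path = NULL;

    if (get_path(-1, &path) != 0 && path == NULL)
        puts("error path did not clobber the caller's pointer");
    free(path);
    return 0;
}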
Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c1f15b203e9..bbae4ed15c3 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,16 +32,18 @@ */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **path) + struct ext4_ext_path **orig_path) { int ret = 0; + struct ext4_ext_path *path; - *path = ext4_ext_find_extent(inode, lblock, *path); - if (IS_ERR(*path)) { - ret = PTR_ERR(*path); - *path = NULL; - } else if ((*path)[ext_depth(inode)].p_ext == NULL) + path = ext4_ext_find_extent(inode, lblock, *orig_path); + if (IS_ERR(path)) + ret = PTR_ERR(path); + else if (path[ext_depth(inode)].p_ext == NULL) ret = -ENODATA; + else + *orig_path = path; return ret; } @@ -611,24 +613,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; + int ret = 0; ext4_lblk_t last = from + count; while (from < last) { *err = get_ext_path(inode, from, &path); if (*err) - return 0; + goto out; ext = path[ext_depth(inode)].p_ext; - if (!ext) { - ext4_ext_drop_refs(path); - return 0; - } - if (uninit != ext4_ext_is_uninitialized(ext)) { - ext4_ext_drop_refs(path); - return 0; - } + if (uninit != ext4_ext_is_uninitialized(ext)) + goto out; from += ext4_ext_get_actual_len(ext); ext4_ext_drop_refs(path); } - return 1; + ret = 1; +out: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + return ret; } /** -- cgit v1.2.3-70-g09d2 From 83cdadd8b0559c93728d065d23ca3485fa567e54 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 22 Feb 2013 13:32:56 -0500 Subject: xfs: fix potential infinite loop in xfs_iomap_prealloc_size() If freesp == 0, we could end up in an infinite loop while squashing the preallocation. Break the loop when we've killed the prealloc entirely. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit e78c420bfc2608bb5f9a0b9165b1071c1e31166a) --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 912d83d8860..b0b0f448e84 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -413,7 +413,7 @@ xfs_iomap_prealloc_size( * have a large file on a small filesystem and the above * lowspace thresholds are smaller than MAXEXTLEN. */ - while (alloc_blocks >= freesp) + while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; } -- cgit v1.2.3-70-g09d2 From 3325beed46d8d14d873e94d89ea57ee900dec942 Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Sun, 24 Feb 2013 13:04:37 -0600 Subject: xfs: fix xfs_iomap_eof_prealloc_initial_size type Fix the return type of xfs_iomap_eof_prealloc_initial_size() to xfs_fsblock_t to reflect the fact that the return value may be an unsigned 64 bits if XFS_BIG_BLKNOS is defined. 
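The bug class is easy to demonstrate in isolation: a 64-bit block count returned through an int is silently truncated. A minimal sketch, unrelated to the XFS code itself; on common ABIs only the low 32 bits survive, so this prints 0 for the narrow return:

#include <stdio.h>
#include <stdint.h>

static int narrow_ret(uint64_t blocks) { return (int)blocks; } /* truncates */
static uint64_t wide_ret(uint64_t blocks) { return blocks; }

int main(void)
{
    uint64_t nblocks = 1ULL << 33;  /* needs more than 32 bits */

    printf("int return: %d, uint64_t return: %llu\n",
           narrow_ret(nblocks), (unsigned long long)wide_ret(nblocks));
    return 0;
}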
Signed-off-by: Mark Tinguely Reviewed-by: Dave Chinner Signed-off-by: Ben Myers (cherry picked from commit e8108cedb1c5d1dc359690d18ca997e97a0061d2) --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index b0b0f448e84..5a30dd899d2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -325,7 +325,7 @@ xfs_iomap_eof_want_preallocate( * rather than falling short due to things like stripe unit/width alignment of * real extents. */ -STATIC int +STATIC xfs_fsblock_t xfs_iomap_eof_prealloc_initial_size( struct xfs_mount *mp, struct xfs_inode *ip, -- cgit v1.2.3-70-g09d2 From e001873853d87674dd5b3cfa2851885023616695 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 12 Mar 2013 23:30:34 +1100 Subject: xfs: ensure we capture IO errors correctly Failed buffer readahead can leave the buffer in the cache marked with an error. Most callers that then issue a subsequent read on the buffer do not zero the b_error field out, and so we may incorrectly detect an error during IO completion due to the stale error value left on the buffer. Avoid this problem by zeroing the error before IO submission. This ensures that the only IO errors that are detected are those captured from bio submission or completion. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit c163f9a1760229a95d04e37b332de7d5c1c225cd) --- fs/xfs/xfs_buf.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4e8f0df82d0..8459b5d8cb7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1334,6 +1334,12 @@ _xfs_buf_ioapply( int size; int i; + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; + if (bp->b_flags & XBF_WRITE) { if (bp->b_flags & XBF_SYNCIO) rw = WRITE_SYNC; -- cgit v1.2.3-70-g09d2 From a517b608fa3d9b65930ef53ffe4a2f9800e10f7d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 18 Mar 2013 10:49:07 -0400 Subject: nfsd: only unhash DRC entries that are in the hashtable It's not safe to call hlist_del() on a newly initialized hlist_node. That leads to a NULL pointer dereference. Only do that if the entry is hashed. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 62c1ee128ae..18509bd6f58 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -102,7 +102,8 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { if (rp->c_type == RC_REPLBUFF) kfree(rp->c_replvec.iov_base); - hlist_del(&rp->c_hash); + if (!hlist_unhashed(&rp->c_hash)) + hlist_del(&rp->c_hash); list_del(&rp->c_lru); --num_drc_entries; kmem_cache_free(drc_slab, rp); -- cgit v1.2.3-70-g09d2 From ac534ff2d5508bdff1358a55d88053da729ff46b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 15 Mar 2013 09:16:29 -0400 Subject: nfsd: fix startup order in nfsd_reply_cache_init If we end up doing "goto out_nomem" in this function, we'll call nfsd_reply_cache_shutdown. That will attempt to walk the LRU list and free entries, but that list may not be initialized yet if the server is starting up for the first time. It's also possible for the shrinker to kick in before we've initialized the LRU list.
Rearrange the initialization so that the LRU list_head and cache size are initialized before doing any of the allocations that might fail. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 18509bd6f58..ca05f6dc354 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -119,6 +119,10 @@ nfsd_reply_cache_free(struct svc_cacherep *rp) int nfsd_reply_cache_init(void) { + INIT_LIST_HEAD(&lru_head); + max_drc_entries = nfsd_cache_size_limit(); + num_drc_entries = 0; + register_shrinker(&nfsd_reply_cache_shrinker); drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), 0, 0, NULL); @@ -129,10 +133,6 @@ int nfsd_reply_cache_init(void) if (!cache_hash) goto out_nomem; - INIT_LIST_HEAD(&lru_head); - max_drc_entries = nfsd_cache_size_limit(); - num_drc_entries = 0; - return 0; out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); -- cgit v1.2.3-70-g09d2 From 1ada47d9468fe3907f7f9e00179168f5e2f90803 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 20 Mar 2013 09:39:42 -0400 Subject: ext4: fix ext4_evict_inode() racing against workqueue processing code Commit 84c17543ab56 (ext4: move work from io_end to inode) triggered a regression when running xfstest #270 when the file system is mounted with dioread_nolock. The problem is that after ext4_evict_inode() calls ext4_ioend_wait(), this guarantees that last io_end structure has been freed, but it does not guarantee that the workqueue structure, which was moved into the inode by commit 84c17543ab56, is actually finished. Once ext4_flush_completed_IO() calls ext4_free_io_end() on CPU #1, this will allow ext4_ioend_wait() to return on CPU #2, at which point the evict_inode() codepath can race against the workqueue code on CPU #1 accessing EXT4_I(inode)->i_unwritten_work to find the next item of work to do. Fix this by calling cancel_work_sync() in ext4_ioend_wait(), which will be renamed ext4_ioend_shutdown(), since it is only used by ext4_evict_inode(). Also, move the call to ext4_ioend_shutdown() until after truncate_inode_pages() and filemap_write_and_wait() are called, to make sure all dirty pages have been written back and flushed from the page cache first. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] cwq_activate_delayed_work+0x3b/0x7e *pdpt = 0000000030bc3001 *pde = 0000000000000000 Oops: 0000 [#1] SMP DEBUG_PAGEALLOC Modules linked in: Pid: 6, comm: kworker/u:0 Not tainted 3.8.0-rc3-00013-g84c1754-dirty #91 Bochs Bochs EIP: 0060:[] EFLAGS: 00010046 CPU: 0 EIP is at cwq_activate_delayed_work+0x3b/0x7e EAX: 00000000 EBX: 00000000 ECX: f505fe54 EDX: 00000000 ESI: ed5b697c EDI: 00000006 EBP: f64b7e8c ESP: f64b7e84 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: 00000000 CR3: 30bc2000 CR4: 000006f0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 Process kworker/u:0 (pid: 6, ti=f64b6000 task=f64b4160 task.ti=f64b6000) Stack: f505fe00 00000006 f64b7e9c c01de3d7 f6435540 00000003 f64b7efc c01def1d f6435540 00000002 00000000 0000008a c16d0808 c040a10b c16d07d8 c16d08b0 f505fe00 c16d0780 00000000 00000000 ee153df4 c1ce4a30 c17d0e30 00000000 Call Trace: [] cwq_dec_nr_in_flight+0x71/0xfb [] process_one_work+0x5d8/0x637 [] ? ext4_end_bio+0x300/0x300 [] worker_thread+0x249/0x3ef [] kthread+0xd8/0xeb [] ? manage_workers+0x4bb/0x4bb [] ? 
trace_hardirqs_on+0x27/0x37 [] ret_from_kernel_thread+0x1b/0x28 [] ? __init_kthread_worker+0x71/0x71 Code: 01 83 15 ac ff 6c c1 00 31 db 89 c6 8b 00 a8 04 74 12 89 c3 30 db 83 05 b0 ff 6c c1 01 83 15 b4 ff 6c c1 00 89 f0 e8 42 ff ff ff <8b> 13 89 f0 83 05 b8 ff 6c c1 6c c1 00 31 c9 83 EIP: [] cwq_activate_delayed_work+0x3b/0x7e SS:ESP 0068:f64b7e84 CR2: 0000000000000000 ---[ end trace a1923229da53d8a4 ]--- Signed-off-by: "Theodore Ts'o" Cc: Jan Kara --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 4 ++-- fs/ext4/page-io.c | 12 +++++++++++- 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 167ff564bbf..3b83cd60479 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2617,7 +2617,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, extern int __init ext4_init_pageio(void); extern void ext4_add_complete_io(ext4_io_end_t *io_end); extern void ext4_exit_pageio(void); -extern void ext4_ioend_wait(struct inode *); +extern void ext4_ioend_shutdown(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern void ext4_end_io_work(struct work_struct *work); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 65bbc9339ac..ea5f24ffa60 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -185,8 +185,6 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); - ext4_ioend_wait(inode); - if (inode->i_nlink) { /* * When journalling data dirty buffers are tracked only in the @@ -216,6 +214,7 @@ void ext4_evict_inode(struct inode *inode) filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); goto no_delete; } @@ -225,6 +224,7 @@ void ext4_evict_inode(struct inode *inode) if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages(&inode->i_data, 0); + ext4_ioend_shutdown(inode); if (is_bad_inode(inode)) goto no_delete; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 809b31003ec..047a6de04a0 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -50,11 +50,21 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } -void ext4_ioend_wait(struct inode *inode) +/* + * This function is called by ext4_evict_inode() to make sure there is + * no more pending I/O completion work left to do. + */ +void ext4_ioend_shutdown(struct inode *inode) { wait_queue_head_t *wq = ext4_ioend_wq(inode); wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); + /* + * We need to make sure the work structure is finished being + * used before we let the inode get destroyed. + */ + if (work_pending(&EXT4_I(inode)->i_unwritten_work)) + cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); } static void put_io_page(struct ext4_io_page *io_page) -- cgit v1.2.3-70-g09d2 From 2b405bfa84063bfa35621d2d6879f52693c614b0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 20 Mar 2013 09:42:11 -0400 Subject: ext4: fix data=journal fast mount/umount hang In data=journal mode, if we unmount the file system before a transaction has a chance to complete, when the journal inode is being evicted, we can end up calling into jbd2_log_wait_commit() for the last transaction, after the journalling machinery has been shut down. 
Arguably we should adjust ext4_should_journal_data() to return FALSE for the journal inode, but the only place it matters is ext4_evict_inode(), and so to save a bit of CPU time, and to make the patch much more obviously correct by inspection(tm), we'll fix it by explicitly not trying to wait for a journal commit when we are evicting the journal inode, since it's guaranteed to never succeed in this case. This can be easily replicated via: mount -t ext4 -o data=journal /dev/vdb /vdb ; umount /vdb ------------[ cut here ]------------ WARNING: at /usr/projects/linux/ext4/fs/jbd2/journal.c:542 __jbd2_log_start_commit+0xba/0xcd() Hardware name: Bochs JBD2: bad log_start_commit: 3005630206 3005630206 0 0 Modules linked in: Pid: 2909, comm: umount Not tainted 3.8.0-rc3 #1020 Call Trace: [] warn_slowpath_common+0x68/0x7d [] ? __jbd2_log_start_commit+0xba/0xcd [] warn_slowpath_fmt+0x2b/0x2f [] __jbd2_log_start_commit+0xba/0xcd [] jbd2_log_start_commit+0x24/0x34 [] ext4_evict_inode+0x71/0x2e3 [] evict+0x94/0x135 [] iput+0x10a/0x110 [] jbd2_journal_destroy+0x190/0x1ce [] ? bit_waitqueue+0x50/0x50 [] ext4_put_super+0x52/0x294 [] generic_shutdown_super+0x48/0xb4 [] kill_block_super+0x22/0x60 [] deactivate_locked_super+0x22/0x49 [] deactivate_super+0x30/0x33 [] mntput_no_expire+0x107/0x10c [] sys_umount+0x2cf/0x2e0 [] sys_oldumount+0x12/0x14 [] syscall_call+0x7/0xb ---[ end trace 6a954cc790501c1f ]--- jbd2_log_wait_commit: error: j_commit_request=-1289337090, tid=0 Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ea5f24ffa60..85e41a2a39a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -205,7 +205,8 @@ void ext4_evict_inode(struct inode *inode) * don't use page cache. */ if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && + inode->i_ino != EXT4_JOURNAL_INO) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; -- cgit v1.2.3-70-g09d2 From cf4ab538f1516606d3ae730dce15d6f33d96b7e1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Mar 2013 12:56:37 -0500 Subject: NFSv4: Fix the string length returned by the idmapper Functions like nfs_map_uid_to_name() and nfs_map_gid_to_group() are expected to return a string without any terminating NUL character. The regression was introduced by commit 57e62324e469e092ecc6c94a7a86fe4bd6ac5172 (NFS: Store the legacy idmapper result in the keyring).
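The asymmetry the patch encodes can be shown with plain libc calls; a minimal sketch, with values invented for the example: the numeric-id payload deliberately includes its NUL terminator, while the name payload must not.

#include <stdio.h>
#include <string.h>

int main(void)
{
    char id_str[16];
    const char *name = "alice";

    /* name -> id: cache the string form of the id, NUL included */
    size_t id_len = sprintf(id_str, "%d", 1000) + 1;  /* "1000" + NUL = 5 */

    /* id -> name: the caller expects no terminating NUL */
    size_t name_len = strlen(name);                   /* 5, no NUL */

    printf("id payload: %zu bytes, name payload: %zu bytes\n",
           id_len, name_len);
    return 0;
}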
Reported-by: Dave Chiluk Signed-off-by: Trond Myklebust Cc: Bryan Schumaker Cc: stable@vger.kernel.org [>=3.4] --- fs/nfs/idmap.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index dc0f98dfa71..c516da5873f 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -726,9 +726,9 @@ out1: return ret; } -static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data) +static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) { - return key_instantiate_and_link(key, data, strlen(data) + 1, + return key_instantiate_and_link(key, data, datalen, id_resolver_cache->thread_keyring, authkey); } @@ -738,6 +738,7 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; + size_t len; int ret = -ENOKEY; /* ret = -ENOKEY */ @@ -747,13 +748,15 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, case IDMAP_CONV_NAMETOID: if (strcmp(upcall->im_name, im->im_name) != 0) break; - sprintf(id_str, "%d", im->im_id); - ret = nfs_idmap_instantiate(key, authkey, id_str); + /* Note: here we store the NUL terminator too */ + len = sprintf(id_str, "%d", im->im_id) + 1; + ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: if (upcall->im_id != im->im_id) break; - ret = nfs_idmap_instantiate(key, authkey, im->im_name); + len = strlen(im->im_name); + ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); break; default: ret = -EINVAL; -- cgit v1.2.3-70-g09d2 From 991f76f837bf22c5bb07261cfd86525a0a96650c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Mar 2013 23:25:24 +0800 Subject: sysfs: fix race between readdir and lseek While readdir() is running, lseek() may set filp->f_pos as zero, then may leave filp->private_data pointing to one sysfs_dirent object without holding its reference counter, so the sysfs_dirent object may be used after free in next readdir(). This patch holds inode->i_mutex to avoid the problem since the lock is always held in readdir path. 
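As a minimal model of the fix, with pthreads standing in for i_mutex and invented names: the seek path takes the same lock the readdir path holds, so f_pos and the cached cursor can never be observed out of sync.

#include <pthread.h>
#include <stddef.h>

struct dir_file {
    pthread_mutex_t i_mutex;    /* stands in for inode->i_mutex */
    long f_pos;
    void *private_data;         /* cached sysfs_dirent-like cursor */
};

static long dir_llseek(struct dir_file *f, long offset)
{
    pthread_mutex_lock(&f->i_mutex);
    f->f_pos = offset;
    f->private_data = NULL;     /* cursor dropped under the lock */
    pthread_mutex_unlock(&f->i_mutex);
    return offset;
}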
Reported-by: Dave Jones Tested-by: Sasha Levin Cc: Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fbdff6be25..c9e16608f48 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1058,10 +1058,21 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; } +static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, .readdir = sysfs_readdir, .release = sysfs_dir_release, - .llseek = generic_file_llseek, + .llseek = sysfs_dir_llseek, }; -- cgit v1.2.3-70-g09d2 From e5110f411d2ee35bf8d202ccca2e89c633060dca Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Mar 2013 23:25:25 +0800 Subject: sysfs: handle failure path correctly for readdir() In case of 'if (filp->f_pos == 0 or 1)' of sysfs_readdir(), the failure from filldir() isn't handled, and the reference counter of the sysfs_dirent object pointed by filp->private_data will be released without clearing filp->private_data, so a use-after-free bug will be triggered later. This patch returns immediately in that situation to fix the bug, and it is reasonable to return from readdir() when filldir() fails. Reported-by: Dave Jones Tested-by: Sasha Levin Cc: Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c9e16608f48..e14512678c9 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1020,6 +1020,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } if (filp->f_pos == 1) { if (parent_sd->s_parent) ino = parent_sd->s_parent->s_ino; else ino = parent_sd->s_ino; if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } mutex_lock(&sysfs_mutex); for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); -- cgit v1.2.3-70-g09d2 From 4376c94618c26225e69e17b7c91169c45a90b292 Mon Sep 17 00:00:00 2001 From: fanchaoting Date: Thu, 21 Mar 2013 09:15:30 +0800 Subject: pnfs-block: removing DM device may cause oops when calling dev_remove When pNFS block uses device mapper, a later umount may cause an oops. We allocate "1 + sizeof(bl_umount_request)" bytes of memory for msg->data, and that memory may overflow when we do "memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request))", because the size of bl_msg is more than 1 byte.
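The arithmetic behind the overflow, as a stand-alone sketch with invented structure layouts (actual sizes vary by ABI): the allocation counted one byte for the header where the copies need sizeof(header) bytes.

#include <stdio.h>

struct msg_hdr { int type; int totallen; };     /* stand-in for bl_msg */
struct umount_req { long dev; };                /* stand-in for bl_umount_request */

int main(void)
{
    size_t allocated = 1 + sizeof(struct umount_req);
    size_t needed = sizeof(struct msg_hdr) + sizeof(struct umount_req);

    printf("allocated %zu bytes, copies write %zu bytes: overflow of %zu\n",
           allocated, needed, needed - allocated);
    return 0;
}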
Signed-off-by: fanchaoting Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayoutdm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c index 737d839bc17..6fc7b5cae92 100644 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ b/fs/nfs/blocklayout/blocklayoutdm.c @@ -55,7 +55,8 @@ static void dev_remove(struct net *net, dev_t dev) bl_pipe_msg.bl_wq = &nn->bl_wq; memset(msg, 0, sizeof(*msg)); - msg->data = kzalloc(1 + sizeof(bl_umount_request), GFP_NOFS); + msg->len = sizeof(bl_msg) + bl_msg.totallen; + msg->data = kzalloc(msg->len, GFP_NOFS); if (!msg->data) goto out; @@ -66,7 +67,6 @@ static void dev_remove(struct net *net, dev_t dev) memcpy(msg->data, &bl_msg, sizeof(bl_msg)); dataptr = (uint8_t *) msg->data; memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); - msg->len = sizeof(bl_msg) + bl_msg.totallen; add_wait_queue(&nn->bl_wq, &wq); if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { -- cgit v1.2.3-70-g09d2 From a073dbff359f4741013ae4b8395f5364c5e00b48 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 12:34:32 -0400 Subject: NFSv4.1: Fix a race in pNFS layoutcommit We need to clear the NFS_LSEG_LAYOUTCOMMIT bits atomically with the NFS_INO_LAYOUTCOMMIT bit, otherwise we may end up with situations where the two are out of sync. The first half of the problem is to ensure that pnfs_layoutcommit_inode clears the NFS_LSEG_LAYOUTCOMMIT bit through pnfs_list_write_lseg. We still need to keep the reference to those segments until the RPC call is finished, so in order to make it clear _where_ those references come from, we add a helper pnfs_list_write_lseg_done() that cleans up after pnfs_list_write_lseg. 
Signed-off-by: Trond Myklebust Acked-by: Benny Halevy Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 14 -------------- fs/nfs/pnfs.c | 19 ++++++++++++++++++- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b2671cb0f90..6ccdd4fd9b5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6416,22 +6416,8 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; - struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); - /* Matched by references in pnfs_set_layoutcommit */ - list_for_each_entry_safe(lseg, tmp, &data->lseg_list, pls_lc_list) { - list_del_init(&lseg->pls_lc_list); - if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, - &lseg->pls_flags)) - pnfs_put_lseg(lseg); - } - - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); - put_rpccred(data->cred); kfree(data); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 48ac5aad625..3d900916fd4 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1746,11 +1746,27 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { if (lseg->pls_range.iomode == IOMODE_RW && - test_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) list_add(&lseg->pls_lc_list, listp); } } +static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) +{ + struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(inode)->flags; + + /* Matched by references in pnfs_set_layoutcommit */ + list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { + list_del_init(&lseg->pls_lc_list); + pnfs_put_lseg(lseg); + } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) { pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); @@ -1795,6 +1811,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) if (nfss->pnfs_curr_ld->cleanup_layoutcommit) nfss->pnfs_curr_ld->cleanup_layoutcommit(data); + pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); } /* -- cgit v1.2.3-70-g09d2 From 24956804349ca0eadcdde032d65e8c00b4214096 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 13:03:00 -0400 Subject: NFSv4.1: Always clear the NFS_INO_LAYOUTCOMMIT in layoutreturn Note that clearing NFS_INO_LAYOUTCOMMIT is tricky, since it requires you to also clear the NFS_LSEG_LAYOUTCOMMIT bits from the layout segments. The only two sites that need to do this are the ones that call pnfs_return_layout() without first doing a layout commit. 
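Both sites rely on the same ownership idiom, shown here in a reduced form with C11 atomics standing in for test_and_clear_bit(): exactly one thread observes the bit as set-and-cleared, and that thread alone performs the cleanup, so the flag and the per-segment state cannot drift apart.

#include <stdatomic.h>
#include <stdbool.h>

#define LAYOUTCOMMIT 0x1u

static _Atomic unsigned int flags;

/* Returns true for exactly one concurrent caller; that caller owns
 * the cleanup (clearing per-segment bits, dropping references). */
static bool test_and_clear_layoutcommit(void)
{
    return atomic_fetch_and(&flags, ~LAYOUTCOMMIT) & LAYOUTCOMMIT;
}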
Signed-off-by: Trond Myklebust Acked-by: Benny Halevy Cc: stable@vger.kernel.org --- fs/nfs/nfs4filelayout.c | 1 - fs/nfs/pnfs.c | 35 +++++++++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 49eeb044c10..4fb234d3aef 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -129,7 +129,6 @@ static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo) { if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return; - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags); pnfs_return_layout(inode); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3d900916fd4..5044142c121 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -417,6 +417,16 @@ should_free_lseg(struct pnfs_layout_range *lseg_range, lo_seg_intersecting(lseg_range, recall_range); } +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list) +{ + if (!atomic_dec_and_test(&lseg->pls_refcount)) + return false; + pnfs_layout_remove_lseg(lseg->pls_layout, lseg); + list_add(&lseg->pls_list, tmp_list); + return true; +} + /* Returns 1 if lseg is removed from list, 0 otherwise */ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) @@ -430,11 +440,8 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, */ dprintk("%s: lseg %p ref %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - pnfs_layout_remove_lseg(lseg->pls_layout, lseg); - list_add(&lseg->pls_list, tmp_list); + if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) rv = 1; - } } return rv; } @@ -777,6 +784,21 @@ send_layoutget(struct pnfs_layout_hdr *lo, return lseg; } +static void pnfs_clear_layoutcommit(struct inode *inode, + struct list_head *head) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg, *tmp; + + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return; + list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { + if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + continue; + pnfs_lseg_dec_and_remove_zero(lseg, head); + } +} + /* * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr * when the layout segment list is empty. 
@@ -808,6 +830,7 @@ _pnfs_return_layout(struct inode *ino) /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); + pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { @@ -820,8 +843,6 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)); - lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; @@ -1458,7 +1479,6 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data) dprintk("pnfs write error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) @@ -1613,7 +1633,6 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data) dprintk("pnfs read error = %d\n", hdr->pnfs_error); if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & PNFS_LAYOUTRET_ON_ERROR) { - clear_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(hdr->inode)->flags); pnfs_return_layout(hdr->inode); } if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) -- cgit v1.2.3-70-g09d2 From 240286725d854331422cb15957f8d9bf2741d4e3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 20 Mar 2013 13:23:33 -0400 Subject: NFSv4.1: Add a helper pnfs_commit_and_return_layout In order to be able to safely return the layout in nfs4_proc_setattr, we need to block new uses of the layout, wait for all outstanding users of the layout to complete, commit the layout and then return it. This patch adds a helper in order to do all this safely. 
Signed-off-by: Trond Myklebust Cc: Boaz Harrosh --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pnfs.c | 27 +++++++++++++++++++++++++++ fs/nfs/pnfs.h | 6 ++++++ 3 files changed, 34 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6ccdd4fd9b5..26431cf62dd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2632,7 +2632,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, int status; if (pnfs_ld_layoutret_on_setattr(inode)) - pnfs_return_layout(inode); + pnfs_commit_and_return_layout(inode); nfs_fattr_init(fattr); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5044142c121..4bdffe0ba02 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -866,6 +866,33 @@ out: } EXPORT_SYMBOL_GPL(_pnfs_return_layout); +int +pnfs_commit_and_return_layout(struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + int ret; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo == NULL) { + spin_unlock(&inode->i_lock); + return 0; + } + pnfs_get_layout_hdr(lo); + /* Block new layoutgets and read/write to ds */ + lo->plh_block_lgets++; + spin_unlock(&inode->i_lock); + filemap_fdatawait(inode->i_mapping); + ret = pnfs_layoutcommit_inode(inode, true); + if (ret == 0) + ret = _pnfs_return_layout(inode); + spin_lock(&inode->i_lock); + lo->plh_block_lgets--; + spin_unlock(&inode->i_lock); + pnfs_put_layout_hdr(lo); + return ret; +} + bool pnfs_roc(struct inode *ino) { struct pnfs_layout_hdr *lo; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 94ba8041774..f5f8a470a64 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -219,6 +219,7 @@ void pnfs_set_layoutcommit(struct nfs_write_data *wdata); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); +int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_write_data *); void pnfs_ld_read_done(struct nfs_read_data *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, @@ -407,6 +408,11 @@ static inline int pnfs_return_layout(struct inode *ino) return 0; } +static inline int pnfs_commit_and_return_layout(struct inode *inode) +{ + return 0; +} + static inline bool pnfs_ld_layoutret_on_setattr(struct inode *inode) { -- cgit v1.2.3-70-g09d2 From 06ae43f34bcc07a0b6be8bf78a1c895bcd12c839 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Mar 2013 13:19:30 -0400 Subject: Don't bother with redoing rw_verify_area() from default_file_splice_from() default_file_splice_from() ends up calling vfs_write() (via a very convoluted callchain). It's overkill, since we have already done rw_verify_area() in the caller, and by the time we call vfs_write() we are under set_fs(KERNEL_DS), so access_ok() is also pointless. Add a new helper (__kernel_write()), use it instead of kernel_write() in there.
Signed-off-by: Al Viro --- fs/internal.h | 5 +++++ fs/read_write.c | 25 +++++++++++++++++++++++++ fs/splice.c | 4 +++- 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index 507141fceb9..4be78237d89 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,3 +125,8 @@ extern int invalidate_inodes(struct super_block *, bool); * dcache.c */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); + +/* + * read_write.c + */ +extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); diff --git a/fs/read_write.c b/fs/read_write.c index a698eff457f..f7b5a23b804 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -17,6 +17,7 @@ #include #include #include "read_write.h" +#include "internal.h" #include #include @@ -417,6 +418,30 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof EXPORT_SYMBOL(do_sync_write); +ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + const char __user *p; + ssize_t ret; + + old_fs = get_fs(); + set_fs(get_ds()); + p = (__force const char __user *)buf; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + if (file->f_op->write) + ret = file->f_op->write(file, p, count, pos); + else + ret = do_sync_write(file, p, count, pos); + set_fs(old_fs); + if (ret > 0) { + fsnotify_modify(file); + add_wchar(current, ret); + } + inc_syscw(current); + return ret; +} + ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; diff --git a/fs/splice.c b/fs/splice.c index 718bd005638..29e394e49dd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -31,6 +31,7 @@ #include #include #include +#include "internal.h" /* * Attempt to steal a page from a pipe buffer. This should perhaps go into @@ -1048,9 +1049,10 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, { int ret; void *data; + loff_t tmp = sd->pos; data = buf->ops->map(pipe, buf, 0); - ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); buf->ops->unmap(pipe, buf, data); return ret; -- cgit v1.2.3-70-g09d2 From f853c616883a8de966873a1dab283f1369e275a1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Mar 2013 09:52:19 -0400 Subject: cifs: ignore everything in SPNEGO blob after mechTypes We've had several reports of people attempting to mount Windows 8 shares and getting failures with a return code of -EINVAL. The default sec= mode changed recently to sec=ntlmssp. With that, we expect and parse a SPNEGO blob from the server in the NEGOTIATE reply. The current decode_negTokenInit function first parses all of the mechTypes and then tries to parse the rest of the negTokenInit reply. The parser however currently expects a mechListMIC or nothing to follow the mechTypes, but Windows 8 puts a mechToken field there instead to carry some info for the new NegoEx stuff. In practice, we don't do anything with the fields after the mechTypes anyway so I don't see any real benefit in continuing to parse them. This patch just has the kernel ignore the fields after the mechTypes. We'll probably need to reinstate some of this if we ever want to support NegoEx. 
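The parsing strategy can be modelled without any ASN.1: consume the one field you need and deliberately ignore whatever trails it. A toy sketch, not real SPNEGO, with an invented tag byte:

#include <stdio.h>

/* Toy blob: first byte is a tag we require, second its value;
 * anything after that (mechListMIC, mechToken, ...) is ignored. */
static int parse_blob(const unsigned char *p, size_t len, int *mech)
{
    if (len < 2 || p[0] != 0x01)
        return -1;
    *mech = p[1];
    return 0;   /* bytes p[2..len) intentionally not parsed */
}

int main(void)
{
    const unsigned char blob[] = { 0x01, 0x2a, 0xde, 0xad, 0xbe, 0xef };
    int mech;

    if (parse_blob(blob, sizeof(blob), &mech) == 0)
        printf("mech %d, %zu trailing bytes ignored\n",
               mech, sizeof(blob) - 2);
    return 0;
}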
Reported-by: Jason Burgess Reported-by: Yan Li Signed-off-by: Jeff Layton Cc: Signed-off-by: Steve French --- fs/cifs/asn1.c | 53 +++++------------------------------------------------ 1 file changed, 5 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index cfd1ce34e0b..1d36db11477 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -614,53 +614,10 @@ decode_negTokenInit(unsigned char *security_blob, int length, } } - /* mechlistMIC */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - /* Check if we have reached the end of the blob, but with - no mechListMic (e.g. NTLMSSP instead of KRB5) */ - if (ctx.error == ASN1_ERR_DEC_EMPTY) - goto decode_negtoken_exit; - cFYI(1, "Error decoding last part negTokenInit exit3"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - /* tag = 3 indicating mechListMIC */ - cFYI(1, "Exit 4 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* sequence */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit5"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cFYI(1, "cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - } - - /* sequence of */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit 7"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) { - cFYI(1, "Exit 8 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - - /* general string */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, "Error decoding last part negTokenInit exit9"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_PRI) - || (tag != ASN1_GENSTR)) { - cFYI(1, "Exit10 cls = %d con = %d tag = %d end = %p (%d)", - cls, con, tag, end, *end); - return 0; - } - cFYI(1, "Need to call asn1_octets_decode() function for %s", - ctx.pointer); /* is this UTF-8 or ASCII? */ -decode_negtoken_exit: + /* + * We currently ignore anything at the end of the SPNEGO blob after + * the mechTypes have been parsed, since none of that info is + * used at the moment. + */ return 1; } -- cgit v1.2.3-70-g09d2 From d763448286377b8a0e3f179372e9e292bef3c337 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 11 Mar 2013 09:20:00 +0000 Subject: Btrfs: update to use fs_state bit Now that we use bit operation to check fs_state, update btrfs_free_fs_root()'s checker, otherwise we get back to memory leak case. 
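The underlying mistake generalizes: once a word is managed with set_bit()/test_bit(), which take a bit number, testing it against a flag mask from another namespace silently checks the wrong bit. A stand-alone sketch with invented values:

#include <stdio.h>

#define FS_STATE_ERROR_BIT  0           /* a bit number, used with set_bit() */
#define SUPER_FLAG_ERROR    (1UL << 2)  /* an unrelated on-disk flag mask */

int main(void)
{
    unsigned long fs_state = 1UL << FS_STATE_ERROR_BIT; /* what set_bit() did */

    printf("mask test (wrong): %d\n", !!(fs_state & SUPER_FLAG_ERROR));
    printf("bit test (right):  %d\n",
           !!(fs_state & (1UL << FS_STATE_ERROR_BIT)));
    return 0;
}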
Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7d84651e850..127b23e8323 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3253,7 +3253,7 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); btrfs_free_log_root_tree(NULL, fs_info); } -- cgit v1.2.3-70-g09d2 From 835d974fabfa9bff4d173ad03c054ac2f673263f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 19 Mar 2013 12:13:25 -0400 Subject: Btrfs: handle a bogus chunk tree nicely If you restore a btrfs-image file system and try to mount that file system we'll panic. That's because btrfs-image restores and just makes one big chunk to envelope the whole disk, since they are really only meant to be messed with by our btrfs-progs. So fix up btrfs_rmap_block and the callers of it for mount so that we no longer panic but instead just return an error and fail to mount. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 35 ++++++++++++++++++++++++++++++----- fs/btrfs/volumes.c | 13 ++++++++++++- 2 files changed, 42 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 350b9b18140..a8ff25aedca 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -257,7 +257,8 @@ static int exclude_super_stripes(struct btrfs_root *root, cache->bytes_super += stripe_len; ret = add_excluded_extent(root, cache->key.objectid, stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -265,13 +266,17 @@ static int exclude_super_stripes(struct btrfs_root *root, ret = btrfs_rmap_block(&root->fs_info->mapping_tree, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; while (nr--) { cache->bytes_super += stripe_len; ret = add_excluded_extent(root, logical[nr], stripe_len); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + kfree(logical); + return ret; + } } kfree(logical); @@ -7964,7 +7969,17 @@ int btrfs_read_block_groups(struct btrfs_root *root) * info has super bytes accounted for, otherwise we'll think * we have more space than we actually do. */ - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. + */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + goto error; + } /* * check for two cases, either we are full, and therefore @@ -8106,7 +8121,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - exclude_super_stripes(root, cache); + ret = exclude_super_stripes(root, cache); + if (ret) { + /* + * We may have excluded something, so call this just in + * case. 
+ */ + free_excluded_extents(root, cache); + kfree(cache->free_space_ctl); + kfree(cache); + return ret; + } add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5989a92236f..2854c824ab6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4935,7 +4935,18 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, em = lookup_extent_mapping(em_tree, chunk_start, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start != chunk_start); + if (!em) { + printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n", + chunk_start); + return -EIO; + } + + if (em->start != chunk_start) { + printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n", + em->start, chunk_start); + free_extent_map(em); + return -EIO; + } map = (struct map_lookup *)em->bdev; length = em->len; -- cgit v1.2.3-70-g09d2 From 6113077cd319e747875ec71227d2b5cb54e08c76 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 19 Mar 2013 10:57:14 +0000 Subject: Btrfs: fix missing qgroup reservation before fallocating Steps to reproduce: mkfs.btrfs mount btrfs quota enable btrfs sub create /subv btrfs qgroup limit 10M /subv fallocate --length 20M /subv/data For the above example, fallocating will return successfully which is not expected, we try to fix it by doing qgroup reservation before fallocating. Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7bdb47faa12..1be25b92d63 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2142,6 +2142,7 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file->f_path.dentry->d_inode; struct extent_state *cached_state = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2169,6 +2170,11 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) return ret; + if (root->fs_info->quota_enabled) { + ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); + if (ret) + goto out_reserve_fail; + } /* * wait for ordered IO before we have any locks. We'll loop again @@ -2272,6 +2278,9 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); + if (root->fs_info->quota_enabled) + btrfs_qgroup_free(root, alloc_end - alloc_start); +out_reserve_fail: /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; -- cgit v1.2.3-70-g09d2 From d9abbf1c3131b679379762700201ae69367f3f62 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 20 Mar 2013 13:49:48 +0000 Subject: Btrfs: fix locking on ROOT_REPLACE operations in tree mod log To resolve backrefs, ROOT_REPLACE operations in the tree mod log are required to be tied to at least one KEY_REMOVE_WHILE_FREEING operation. Therefore, those operations must be enclosed by tree_mod_log_write_lock() and tree_mod_log_write_unlock() calls. Those calls are private to the tree_mod_log_* functions, which means that removal of the elements of an old root node must be logged from tree_mod_log_insert_root. This partly reverts and corrects commit ba1bfbd5 (Btrfs: fix a tree mod logging issue for root replacement operations). This fixes the brand-new version of xfstest 276 as of commit cfe73f71. 
Cc: stable@vger.kernel.org Signed-off-by: Jan Schmidt Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ecd25a1b4e5..ca9d8f1a3bb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -651,6 +651,8 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (tree_mod_dont_log(fs_info, NULL)) return 0; + __tree_mod_log_free_eb(fs_info, old_root); + ret = tree_mod_alloc(fs_info, flags, &tm); if (ret < 0) goto out; @@ -736,7 +738,7 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) static noinline void tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) + unsigned long src_offset, int nr_items, int log_removal) { int ret; int i; @@ -750,10 +752,12 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, } for (i = 0; i < nr_items; i++) { - ret = tree_mod_log_insert_key_locked(fs_info, src, - i + src_offset, - MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (log_removal) { + ret = tree_mod_log_insert_key_locked(fs_info, src, + i + src_offset, + MOD_LOG_KEY_REMOVE); + BUG_ON(ret < 0); + } ret = tree_mod_log_insert_key_locked(fs_info, dst, i + dst_offset, MOD_LOG_KEY_ADD); @@ -927,7 +931,6 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1, 1); BUG_ON(ret); /* -ENOMEM */ } - tree_mod_log_free_eb(root->fs_info, buf); clean_tree_block(trans, root, buf); *last_ref = 1; } @@ -1046,6 +1049,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); + tree_mod_log_free_eb(root->fs_info, buf); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); } @@ -1750,7 +1754,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - tree_mod_log_free_eb(root->fs_info, root->node); tree_mod_log_set_root_pointer(root, child); rcu_assign_pointer(root->node, child); @@ -2995,7 +2998,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, push_items = min(src_nritems - 8, push_items); tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, - push_items); + push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_nritems), btrfs_node_key_ptr_offset(0), @@ -3066,7 +3069,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, sizeof(struct btrfs_key_ptr)); tree_mod_log_eb_copy(root->fs_info, dst, src, 0, - src_nritems - push_items, push_items); + src_nritems - push_items, push_items, 1); copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(src_nritems - push_items), @@ -3218,12 +3221,18 @@ static noinline int split_node(struct btrfs_trans_handle *trans, int mid; int ret; u32 c_nritems; + int tree_mod_log_removal = 1; c = path->nodes[level]; WARN_ON(btrfs_header_generation(c) != trans->transid); if (c == root->node) { /* trying to split the root, lets make a new one */ ret = insert_new_root(trans, root, path, level + 1); + /* + * removal of root nodes has been logged by + * tree_mod_log_set_root_pointer due to locking + */ + tree_mod_log_removal = 0; if (ret) return ret; } else { @@ -3261,7 +3270,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, 
(unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); + tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid, + tree_mod_log_removal); copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(mid), -- cgit v1.2.3-70-g09d2 From 1dd05682b3ef6e70409e130bfd83e91770801589 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 21 Mar 2013 04:32:32 +0000 Subject: Btrfs: fix memory leak in btrfs_create_tree() We should free leaf and root before returning from the error handling code. Signed-off-by: Tsutomu Itoh Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 127b23e8323..6d19a0a554a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1291,6 +1291,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); + leaf = NULL; goto fail; } @@ -1334,11 +1335,16 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_tree_unlock(leaf); + return root; + fail: - if (ret) - return ERR_PTR(ret); + if (leaf) { + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + } + kfree(root); - return root; + return ERR_PTR(ret); } static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, -- cgit v1.2.3-70-g09d2 From 51f0885e5415b4cc6535e9cdcc5145bfbc134353 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 22 Mar 2013 11:44:04 -0700 Subject: vfs,proc: guarantee unique inodes in /proc Dave Jones found another /proc issue with his Trinity tool: thanks to the namespace model, we can have multiple /proc dentries that point to the same inode, aliasing directories in /proc/<pid>/net/ for example. This ends up being a total disaster, because it acts like hardlinked directories, and causes locking problems. We rely on the topological sort of the inodes pointed to by dentries, and if we have aliased directories, that ordering becomes unreliable. In short: don't do this. Multiple dentries with the same (directory) inode is just a bad idea, and the namespace code should never have exposed things this way. But we're kind of stuck with it. This solves things by just always allocating a new inode during /proc dentry lookup, instead of using "iget_locked()" to look up existing inodes by superblock and number. That actually simplifies the code a bit, at the cost of potentially doing more inode [de]allocations. That said, the inode lookup wasn't free either (and did a lot of locking of inodes), so it is probably not that noticeable. We could easily keep the old lookup model for non-directory entries, but rather than try to be excessively clever this just implements the minimal and simplest workaround for the problem.
Reported-and-tested-by: Dave Jones Analyzed-by: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a86aebc9ba7..869116c2afb 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -446,9 +446,10 @@ static const struct file_operations proc_reg_file_ops_no_compat = { struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { - struct inode *inode = iget_locked(sb, de->low_ino); + struct inode *inode = new_inode_pseudo(sb); - if (inode && (inode->i_state & I_NEW)) { + if (inode) { + inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->pde = de; @@ -476,7 +477,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_fop = de->proc_fops; } } - unlock_new_inode(inode); } else pde_put(de); return inode; -- cgit v1.2.3-70-g09d2 From e49dbbf3e770aa590a8a464ac4978a09027060b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 22 Mar 2013 11:18:24 -0700 Subject: nfsd: fix bad offset use vfs_writev() updates the offset argument - but the code then passes the offset to vfs_fsync_range(). Since offset now points to the offset after what was just written, this is probably not what was intended Introduced by face15025ffdf664de95e86ae831544154d26c9c "nfsd: use vfs_fsync_range(), not O_SYNC, for stable writes". Signed-off-by: Kent Overstreet Cc: Al Viro Cc: "Eric W. Biederman" Cc: stable@vger.kernel.org Reviewed-by: Zach Brown Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2a7eb536de0..2b2e2396a86 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1013,6 +1013,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int host_err; int stable = *stablep; int use_wgather; + loff_t pos = offset; dentry = file->f_path.dentry; inode = dentry->d_inode; @@ -1025,7 +1026,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); + host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &pos); set_fs(oldfs); if (host_err < 0) goto out_nfserr; -- cgit v1.2.3-70-g09d2 From 4adaa611020fa6ac65b0ac8db78276af4ec04e63 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 26 Mar 2013 13:07:00 -0400 Subject: Btrfs: fix race between mmap writes and compression Btrfs uses page_mkwrite to ensure stable pages during crc calculations and mmap workloads. We call clear_page_dirty_for_io before we do any crcs, and this forces any application with the file mapped to wait for the crc to finish before it is allowed to change the file. With compression on, the clear_page_dirty_for_io step is happening after we've compressed the pages. This means the applications might be changing the pages while we are compressing them, and some of those modifications might not hit the disk. This commit adds the clear_page_dirty_for_io before compression starts and makes sure to redirty the page if we have to fallback to uncompressed IO as well. 
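The ordering the fix establishes, as a hedged sketch (declarations only; the helper names are invented stand-ins, the real ones live in mm/ and fs/btrfs/): write-protect the pages before their contents are read for compression, and restore the dirty state only on the fallback path.

struct page;

/* invented stand-ins for the page-cache and compression helpers */
void clear_page_dirty_for_io(struct page *p);
void redirty_page(struct page *p);
int compress(struct page *p);
void submit_compressed(struct page *p);
void submit_uncompressed(struct page *p);

static void write_one_page(struct page *p)
{
    clear_page_dirty_for_io(p); /* mmap writers now fault and wait */
    if (compress(p)) {
        submit_compressed(p);
    } else {
        redirty_page(p);        /* fallback must undo the clear */
        submit_uncompressed(p);
    }
}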
Signed-off-by: Chris Mason Reported-by: Alexandre Oliva cc: stable@vger.kernel.org --- fs/btrfs/extent_io.c | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/inode.c | 14 ++++++++++++++ 3 files changed, 49 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5af646..cdee391fc7b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1257,6 +1257,39 @@ int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) GFP_NOFS); } +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + clear_page_dirty_for_io(page); + page_cache_release(page); + index++; + } + return 0; +} + +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + BUG_ON(!page); /* Pages should be in the extent_io_tree */ + account_page_redirty(page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + return 0; +} + /* * helper function to set both pages and extents in the tree writeback */ diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6068a198556..258c9215685 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -325,6 +325,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long *map_len); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); +int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f26888825e..6a6e13c5308 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -353,6 +353,7 @@ static noinline int compress_file_range(struct inode *inode, int i; int will_compress; int compress_type = root->fs_info->compress_type; + int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ if ((end - start + 1) < 16 * 1024 && @@ -415,6 +416,17 @@ again: if (BTRFS_I(inode)->force_compress) compress_type = BTRFS_I(inode)->force_compress; + /* + * we need to call clear_page_dirty_for_io on each + * page in the range. Otherwise applications with the file + * mmap'd can wander in and change the page contents while + * we are compressing them. + * + * If the compression fails for any reason, we set the pages + * dirty again later on. + */ + extent_range_clear_dirty_for_io(inode, start, end); + redirty = 1; ret = btrfs_compress_pages(compress_type, inode->i_mapping, start, total_compressed, pages, @@ -554,6 +566,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } + if (redirty) + extent_range_redirty_for_io(inode, start, end); add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; -- cgit v1.2.3-70-g09d2 From 64a817cfbded8674f345d1117b117f942a351a69 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Tue, 26 Mar 2013 14:11:13 -0400 Subject: nfsd4: reject "negative" acl lengths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we only enforce an upper bound, not a lower bound, a "negative" length can get through here. The symptom seen was a warning when we attempt to a kmalloc with an excessive size. Reported-by: Toralf Förster Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 01168865dd3..a2720071f28 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -264,7 +264,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, iattr->ia_valid |= ATTR_SIZE; } if (bmval[0] & FATTR4_WORD0_ACL) { - int nace; + u32 nace; struct nfs4_ace *ace; READ_BUF(4); len += 4; -- cgit v1.2.3-70-g09d2 From 7ea600b5314529f9d1b9d6d3c41cb26fce6a7a4a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Mar 2013 18:25:57 -0400 Subject: Nest rename_lock inside vfsmount_lock ... lest we get livelocks between path_is_under() and d_path() and friends. The thing is, wrt fairness lglocks are more similar to rwsems than to rwlocks; it is possible to have thread B spin on attempt to take lock shared while thread A is already holding it shared, if B is on lower-numbered CPU than A and there's a thread C spinning on attempt to take the same lock exclusive. As the result, we need consistent ordering between vfsmount_lock (lglock) and rename_lock (seq_lock), even though everything that takes both is going to take vfsmount_lock only shared. Spotted-by: Brad Spengler Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/dcache.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index fbfae008ba4..e8bc3420d63 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2542,7 +2542,6 @@ static int prepend_path(const struct path *path, bool slash = false; int error = 0; - br_read_lock(&vfsmount_lock); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; @@ -2572,8 +2571,6 @@ static int prepend_path(const struct path *path, if (!error && !slash) error = prepend(buffer, buflen, "/", 1); -out: - br_read_unlock(&vfsmount_lock); return error; global_root: @@ -2590,7 +2587,7 @@ global_root: error = prepend(buffer, buflen, "/", 1); if (!error) error = is_mounted(vfsmnt) ? 
1 : 2; - goto out; + return error; } /** @@ -2617,9 +2614,11 @@ char *__d_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) return ERR_PTR(error); @@ -2636,9 +2635,11 @@ char *d_absolute_path(const struct path *path, int error; prepend(&res, &buflen, "\0", 1); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = prepend_path(path, &root, &res, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error > 1) error = -EINVAL; @@ -2702,11 +2703,13 @@ char *d_path(const struct path *path, char *buf, int buflen) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); error = path_with_deleted(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) res = ERR_PTR(error); - write_sequnlock(&rename_lock); path_put(&root); return res; } @@ -2830,6 +2833,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) get_fs_root_and_pwd(current->fs, &root, &pwd); error = -ENOENT; + br_read_lock(&vfsmount_lock); write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; @@ -2839,6 +2843,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) prepend(&cwd, &buflen, "\0", 1); error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); if (error < 0) goto out; @@ -2859,6 +2864,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) } } else { write_sequnlock(&rename_lock); + br_read_unlock(&vfsmount_lock); } out: -- cgit v1.2.3-70-g09d2 From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 15 Mar 2013 01:45:51 -0700 Subject: userns: Don't allow creation if the user is chrooted Guarantee that the policy of which files may be accessed that is established by setting the root directory will not be violated by user namespaces by verifying that the root directory points to the root of the mount namespace at the time of user namespace creation. Changing the root is a privileged operation, and as a matter of policy it serves to limit unprivileged processes to files below the current root directory. For reasons of simplicity and comprehensibility the privilege to change the root directory is gated solely on the CAP_SYS_CHROOT capability in the user namespace. Therefore when creating a user namespace we must ensure that the policy of which files may be accessed cannot be violated by changing the root directory. Anyone who runs a process in a chroot and would like to use user namespaces can set up the same view of filesystems with a mount namespace instead. With the result that this is not a practical limitation for using user namespaces. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W.
Biederman" --- fs/namespace.c | 24 ++++++++++++++++++++++++ include/linux/fs_struct.h | 2 ++ kernel/user_namespace.c | 9 +++++++++ 3 files changed, 35 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb4..a3035223d42 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 729eded4b24..2b93a9a5a1e 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -50,4 +50,6 @@ static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, spin_unlock(&fs->lock); } +extern bool current_chrooted(void); + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b14f4d34204..0f1e4288457 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -61,6 +61,15 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + /* + * Verify that we can not violate the policy of which files + * may be accessed that is specified by the root directory, + * by verifing that the root directory is at the root of the + * mount namespace which allows all files to be accessed. + */ + if (current_chrooted()) + return -EPERM; + /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. -- cgit v1.2.3-70-g09d2 From 90563b198e4c6674c63672fae1923da467215f45 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Mar 2013 03:10:15 -0700 Subject: vfs: Add a mount flag to lock read only bind mounts When a read-only bind mount is copied from mount namespace in a higher privileged user namespace to a mount namespace in a lesser privileged user namespace, it should not be possible to remove the the read-only restriction. Add a MNT_LOCK_READONLY mount flag to indicate that a mount must remain read-only. CC: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. 
Biederman" --- fs/namespace.c | 3 +++ include/linux/mount.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index a3035223d42..8505b5ece5d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1713,6 +1713,9 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; + if (mnt->mnt_flags & MNT_LOCK_READONLY) + return -EPERM; + if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else diff --git a/include/linux/mount.h b/include/linux/mount.h index d7029f4a191..73005f9957e 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -47,6 +47,8 @@ struct mnt_namespace; #define MNT_INTERNAL 0x4000 +#define MNT_LOCK_READONLY 0x400000 + struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ -- cgit v1.2.3-70-g09d2 From 132c94e31b8bca8ea921f9f96a57d684fa4ae0a9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Mar 2013 04:08:05 -0700 Subject: vfs: Carefully propogate mounts across user namespaces As a matter of policy MNT_READONLY should not be changable if the original mounter had more privileges than creator of the mount namespace. Add the flag CL_UNPRIVILEGED to note when we are copying a mount from a mount namespace that requires more privileges to a mount namespace that requires fewer privileges. When the CL_UNPRIVILEGED flag is set cause clone_mnt to set MNT_NO_REMOUNT if any of the mnt flags that should never be changed are set. This protects both mount propagation and the initial creation of a less privileged mount namespace. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Reported-by: Andy Lutomirski Signed-off-by: "Eric W. 
Biederman" --- fs/namespace.c | 6 +++++- fs/pnode.c | 6 ++++++ fs/pnode.h | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 8505b5ece5d..968d4c5eae0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -798,6 +798,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + /* Don't allow unprivileged users to change mount flags */ + if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); @@ -2342,7 +2346,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_ALL | CL_EXPIRE; if (user_ns != mnt_ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { up_write(&namespace_sem); diff --git a/fs/pnode.c b/fs/pnode.c index 3e000a51ac0..8b29d2164da 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "internal.h" #include "pnode.h" @@ -220,6 +221,7 @@ static struct mount *get_source(struct mount *dest, int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, struct mount *source_mnt, struct list_head *tree_list) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mount *m, *child; int ret = 0; struct mount *prev_dest_mnt = dest_mnt; @@ -237,6 +239,10 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry, source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(source, source->mnt.mnt_root, type); if (IS_ERR(child)) { ret = PTR_ERR(child); diff --git a/fs/pnode.h b/fs/pnode.h index 19b853a3445..a0493d5ebfb 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -23,6 +23,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 +#define CL_UNPRIVILEGED 0x40 static inline void set_mnt_shared(struct mount *mnt) { -- cgit v1.2.3-70-g09d2 From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 24 Mar 2013 14:28:27 -0700 Subject: userns: Restrict when proc and sysfs can be mounted Only allow unprivileged mounts of proc and sysfs if they are already mounted when the user namespace is created. proc and sysfs are interesting because they have content that is per namespace, and so fresh mounts are needed when new namespaces are created while at the same time proc and sysfs have content that is shared between every instance. Respect the policy of who may see the shared content of proc and sysfs by only allowing new mounts if there was an existing mount at the time the user namespace was created. In practice there are only two interesting cases: proc and sysfs are mounted at their usual places, proc and sysfs are not mounted at all (some form of mount namespace jail). Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. 
Biederman" --- fs/namespace.c | 21 +++++++++++++++++++++ fs/proc/root.c | 4 ++++ fs/sysfs/mount.c | 4 ++++ include/linux/user_namespace.h | 4 ++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 2 ++ 6 files changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 968d4c5eae0..d581e45c0a9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2763,6 +2763,27 @@ bool current_chrooted(void) return chrooted; } +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26ba..9c7fab1d23f 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec73..afd83273e6c 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4ce00932493..b6b215f13b4 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -26,6 +26,8 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + bool may_mount_sysfs; + bool may_mount_proc; }; extern struct user_namespace init_user_ns; @@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns) #endif +void update_mnt_policy(struct user_namespace *userns); + #endif /* _LINUX_USER_H */ diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03..8e635a18ab5 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .may_mount_sysfs = true, + .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0f1e4288457..a54f26f82eb 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -96,6 +96,8 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); + update_mnt_policy(ns); + return 0; } -- cgit v1.2.3-70-g09d2 From 3e84f48edfd33b2e209a117c11fb9ce637cc9b67 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Mar 2013 15:20:30 +0000 Subject: vfs/splice: Fix missed checks in new __kernel_write() helper Commit 06ae43f34bcc ("Don't bother with redoing rw_verify_area() from default_file_splice_from()") lost the checks to test existence of the 
write/aio_write methods. My apologies ;-/ Eventually, we want that in fs/splice.c side of things (no point repeating it for every buffer, after all), but for now this is the obvious minimal fix. Reported-by: Dave Jones Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/read_write.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index f7b5a23b804..e6ddc8dceb9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -424,6 +424,9 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t const char __user *p; ssize_t ret; + if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + return -EINVAL; + old_fs = get_fs(); set_fs(get_ds()); p = (__force const char __user *)buf; -- cgit v1.2.3-70-g09d2 From adaa4b8e4d47eeb114513c2f7a172929154b94bd Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 21 Mar 2013 14:30:23 +0000 Subject: Btrfs: fix EIO from btrfs send in is_extent_unchanged for punched holes When you take a snapshot, punch a hole where there has been data, then take another snapshot and try to send an incremental stream, btrfs send would give you EIO. That is because is_extent_unchanged had no support for holes being punched. With this patch, instead of returning EIO we just return 0 (== the extent is not unchanged) and we're good. Signed-off-by: Jan Schmidt Cc: Alexander Block Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 68da757615a..ed897dc1135 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3945,12 +3945,10 @@ static int is_extent_unchanged(struct send_ctx *sctx, found_key.type != key.type) { key.offset += right_len; break; - } else { - if (found_key.offset != key.offset + right_len) { - /* Should really not happen */ - ret = -EIO; - goto out; - } + } + if (found_key.offset != key.offset + right_len) { + ret = 0; + goto out; } key = found_key; } -- cgit v1.2.3-70-g09d2 From f4881bc7a83eff263789dd524b7c269d138d4af5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Mar 2013 16:03:35 -0400 Subject: Btrfs: fix space leak when we fail to reserve metadata space Dave reported a warning when running xfstest 275. We have been leaking delalloc metadata space when our reservations fail. This is because we were improperly calculating how much space to free for our checksum reservations. The problem is we would sometimes free up space that had already been freed in another thread and we would end up with negative usage for the delalloc space. This patch fixes the problem by calculating how much space the other threads would have already freed, and then calculate how much space we need to free had we not done the reservation at all, and then freeing any excess space. This makes xfstests 275 no longer have leaked space. Thanks Cc: stable@vger.kernel.org Reported-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a8ff25aedca..a22b5cc921a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4815,14 +4815,49 @@ out_fail: * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers * so we can just reduce our inodes csum bytes and carry on. 
- * Otherwise we have to do the normal free thing to account for - * the case that the free side didn't free up its reserve - * because of this outstanding reservation. */ - if (BTRFS_I(inode)->csum_bytes == csum_bytes) + if (BTRFS_I(inode)->csum_bytes == csum_bytes) { calc_csum_metadata_size(inode, num_bytes, 0); - else - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + } else { + u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes; + u64 bytes; + + /* + * This is tricky, but first we need to figure out how much we + * free'd from any free-ers that occured during this + * reservation, so we reset ->csum_bytes to the csum_bytes + * before we dropped our lock, and then call the free for the + * number of bytes that were freed while we were trying our + * reservation. + */ + bytes = csum_bytes - BTRFS_I(inode)->csum_bytes; + BTRFS_I(inode)->csum_bytes = csum_bytes; + to_free = calc_csum_metadata_size(inode, bytes, 0); + + + /* + * Now we need to see how much we would have freed had we not + * been making this reservation and our ->csum_bytes were not + * artificially inflated. + */ + BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes; + bytes = csum_bytes - orig_csum_bytes; + bytes = calc_csum_metadata_size(inode, bytes, 0); + + /* + * Now reset ->csum_bytes to what it should be. If bytes is + * more than to_free then we would have free'd more space had we + * not had an artificially high ->csum_bytes, so we need to free + * the remainder. If bytes is the same or less then we don't + * need to do anything, the other free-ers did the correct + * thing. + */ + BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes; + if (bytes > to_free) + to_free = bytes - to_free; + else + to_free = 0; + } spin_unlock(&BTRFS_I(inode)->lock); if (dropped) to_free += btrfs_calc_trans_metadata_size(root, dropped); -- cgit v1.2.3-70-g09d2 From 6e137ed3f30574f314733d4b7a86ea6523232b14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:26:55 -0400 Subject: Btrfs: fix space accounting for unlink and rename We are way over-reserving for unlink and rename. Rename is just some random huge number and unlink accounts for tree log operations that don't actually happen during unlink, not to mention the tree log doesn't take from the trans block rsv anyway so it's completely useless. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6a6e13c5308..8cab424c75f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3693,11 +3693,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, * 1 for the dir item * 1 for the dir index * 1 for the inode ref - * 1 for the inode ref in the tree log - * 2 for the dir entries in the log * 1 for the inode */ - trans = btrfs_start_transaction(root, 8); + trans = btrfs_start_transaction(root, 5); if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) return trans; @@ -8141,7 +8139,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. 
*/ - trans = btrfs_start_transaction(root, 20); + trans = btrfs_start_transaction(root, 11); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; -- cgit v1.2.3-70-g09d2 From db1d607d3ca5cbb283cbb17d648cd7e8dc67cc7b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:29:11 -0400 Subject: Btrfs: hold the ordered operations mutex when waiting on ordered extents We need to hold the ordered_operations mutex while waiting on ordered extents since we splice and run the ordered extents list. We need to make sure anybody else who wants to wait on ordered extents does actually wait for them to be completed. This will keep us from bailing out of flushing in case somebody is already waiting on ordered extents to complete. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index dc08d77b717..005c45db699 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -557,6 +557,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); + mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->fs_info->ordered_extent_lock); list_splice_init(&root->fs_info->ordered_extents, &splice); while (!list_empty(&splice)) { @@ -600,6 +601,7 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) cond_resched(); } + mutex_unlock(&root->fs_info->ordered_operations_mutex); } /* -- cgit v1.2.3-70-g09d2 From fdf30d1c1b386e1b73116cc7e0fb14e962b763b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Mar 2013 15:31:45 -0400 Subject: Btrfs: limit the global reserve to 512mb A user reported a problem where he was getting early ENOSPC with hundreds of gigs of free data space and 6 gigs of free metadata space. This is because the global block reserve was taking up the entire free metadata space. This is ridiculous; we have infrastructure in place to throttle if we start using too much of the global reserve, so instead of letting it get this huge just limit it to 512mb so that users can still get work done. This allowed the user to complete his rsync without issues. Thanks Cc: stable@vger.kernel.org Reported-and-tested-by: Stefan Priebe Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a22b5cc921a..0d8478700d7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4460,7 +4460,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = num_bytes; + block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + -- cgit v1.2.3-70-g09d2 From a7975026ff9ddf91ba190ae2b71699dd156395e3 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Mon, 25 Mar 2013 11:08:23 +0000 Subject: Btrfs: fix double free in the btrfs_qgroup_account_ref() The function btrfs_find_all_roots is responsible for allocating memory for 'roots' and freeing it if errors happen, so the caller should not free it again since the work has been done. Besides, 'tmp' is allocated after the function btrfs_find_all_roots, so we can return directly if btrfs_find_all_roots() fails.
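The ownership rule this fix enforces, reduced to a hypothetical userspace sketch (find_all() and do_work() are invented names, not the btrfs code): on failure the callee cleans up what it allocated, so the caller frees only on success:

#include <stdlib.h>

static int do_work(int *p) { *p = 0; return 0; }	/* stand-in for the real work */

static int find_all(int **out)
{
	int *p = malloc(sizeof(*p));

	if (!p)
		return -1;
	if (do_work(p) < 0) {
		free(p);		/* the callee owns 'p' on the error path... */
		return -1;
	}
	*out = p;
	return 0;
}

static void caller(void)
{
	int *roots = NULL;

	if (find_all(&roots) < 0)
		return;			/* ...so no free() here: that was the double free */
	free(roots);			/* success: ownership has passed to the caller */
}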
Signed-off-by: Wang Shilong Reviewed-by: Miao Xie Reviewed-by: Jan Schmidt Signed-off-by: Josef Bacik --- fs/btrfs/qgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5471e47d655..b44124dd237 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1153,7 +1153,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, sgn > 0 ? node->seq - 1 : node->seq, &roots); if (ret < 0) - goto out; + return ret; spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -1275,7 +1275,6 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans, ret = 0; unlock: spin_unlock(&fs_info->qgroup_lock); -out: ulist_free(roots); ulist_free(tmp); -- cgit v1.2.3-70-g09d2 From 39847c4d3d91f487f9ab3d083ee5d0f8419f105c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Mar 2013 08:08:20 +0000 Subject: Btrfs: fix wrong reservation of csums We reserve the space for csums only when we write data into a file; in the other cases, such as tree log and log replay, we don't do reservation, so we can use the reservation of the transaction handle just for the former. And for the latter, we should use the tree's own reservation. But btrfs_csum_file_blocks() didn't differentiate between these two types of cases; fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 2 -- fs/btrfs/inode.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ec160202be3..b7e529d2860 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -728,7 +728,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, return -ENOMEM; sector_sum = sums->sums; - trans->adding_csums = 1; again: next_offset = (u64)-1; found_next = 0; @@ -899,7 +898,6 @@ next_sector: goto again; } out: - trans->adding_csums = 0; btrfs_free_path(path); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8cab424c75f..b88381582da 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1757,8 +1757,10 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sum; list_for_each_entry(sum, list, list) { + trans->adding_csums = 1; btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); + trans->adding_csums = 0; } return 0; } -- cgit v1.2.3-70-g09d2 From 82d130ff390be67d980d8b6f39e921c0b1d8d8e0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 28 Mar 2013 08:12:15 +0000 Subject: Btrfs: fix wrong return value of btrfs_lookup_csum() If we don't find the expected csum item, but find a csum item which is adjacent to the specified extent, we should return -EFBIG; otherwise we should return -ENOENT. But btrfs_lookup_csum() returns -EFBIG even if the csum item is not adjacent to the specified extent. Fix it.
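With illustrative numbers (invented for this note, not from the commit): suppose the found item holds checksums for blocks 100 through 107, so csums_in_item is 8. A lookup for block 108 computes csum_offset == 8, exactly one past the end; the item is adjacent and the caller may extend it, which is what -EFBIG signals. A lookup for block 110 computes csum_offset == 10; a gap separates the item from the extent, so -ENOENT is the honest answer. In sketch form:

/* simplified; in the real function the plain 'goto fail' path ends up
 * returning ERR_PTR(-ENOENT) via the fail label */
if (csum_offset == csums_in_item)
	ret = -EFBIG;		/* adjacent: the item can grow to cover the extent */
else if (csum_offset > csums_in_item)
	ret = -ENOENT;		/* a hole: the csum is genuinely absent */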
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b7e529d2860..c4628a201cb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -118,9 +118,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; - if (csum_offset >= csums_in_item) { + if (csum_offset == csums_in_item) { ret = -EFBIG; goto fail; + } else if (csum_offset > csums_in_item) { + goto fail; } } item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); -- cgit v1.2.3-70-g09d2 From d8fe29e9dea8d7d61fd140d8779326856478fc62 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 29 Mar 2013 08:09:34 -0600 Subject: Btrfs: don't drop path when printing out tree errors in scrub A user reported a panic where we were panicking somewhere in tree_backref_for_extent from scrub_print_warning. He only captured the trace, but looking at scrub_print_warning we drop the path right before we mess with the extent buffer to print out a bunch of stuff, which isn't right. So fix this by dropping the path after we use the eb if we need to. Thanks, Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 53c3501fa4c..85e072b956d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -542,7 +542,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) eb = path->nodes[0]; ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); item_size = btrfs_item_size_nr(eb, path->slots[0]); - btrfs_release_path(path); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { do { @@ -558,7 +557,9 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret < 0 ? -1 : ref_level, ret < 0 ? -1 : ref_root); } while (ret != 1); + btrfs_release_path(path); } else { + btrfs_release_path(path); swarn.path = path; swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, -- cgit v1.2.3-70-g09d2 From 35e5cbc0af240778e61113286c019837e06aeec6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 29 Mar 2013 15:39:16 +0100 Subject: reiserfs: Fix warning and inode leak when deleting inode with xattrs After commit 21d8a15a (lookup_one_len: don't accept . and ..) reiserfs started failing to delete xattrs from an inode. This was due to a buggy test for '.' and '..' in fill_with_dentries() which resulted in passing '.' and '..' entries to lookup_one_len() in some cases. That returned an error and so we failed to iterate over all xattrs of an inode. Fix the test in fill_with_dentries() along the lines of the one in lookup_one_len(). Reported-by: Pawel Zawora CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/reiserfs/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index c196369fe40..4cce1d9552f 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -187,8 +187,8 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; - if (name[0] == '.' && (name[1] == '\0' || - (name[1] == '.' && name[2] == '\0'))) + if (name[0] == '.'
&& (namelen < 2 || + (namelen == 2 && name[1] == '.'))) return 0; dentry = lookup_one_len(name, dbuf->xadir, namelen); -- cgit v1.2.3-70-g09d2 From c1681bf8a7b1b98edee8b862a42c19c4e53205fd Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Mon, 1 Apr 2013 09:47:56 -0700 Subject: loop: prevent bdev freeing while device in use struct block_device lifecycle is defined by its inode (see fs/block_dev.c) - block_device allocated first time we access /dev/loopXX and deallocated on bdev_destroy_inode. When we create the device "losetup /dev/loopXX afile" we want that block_device stay alive until we destroy the loop device with "losetup -d". But because we do not hold /dev/loopXX inode its counter goes 0, and inode/bdev can be destroyed at any moment. Usually it happens at memory pressure or when user drops inode cache (like in the test below). When later in loop_clr_fd() we want to use bdev we have use-after-free error with following stack: BUG: unable to handle kernel NULL pointer dereference at 0000000000000280 bd_set_size+0x10/0xa0 loop_clr_fd+0x1f8/0x420 [loop] lo_ioctl+0x200/0x7e0 [loop] lo_compat_ioctl+0x47/0xe0 [loop] compat_blkdev_ioctl+0x341/0x1290 do_filp_open+0x42/0xa0 compat_sys_ioctl+0xc1/0xf20 do_sys_open+0x16e/0x1d0 sysenter_dispatch+0x7/0x1a To prevent use-after-free we need to grab the device in loop_set_fd() and put it later in loop_clr_fd(). The issue is reprodusible on current Linus head and v3.3. Here is the test: dd if=/dev/zero of=loop.file bs=1M count=1 while [ true ]; do losetup /dev/loop0 loop.file echo 2 > /proc/sys/vm/drop_caches losetup -d /dev/loop0 done [ Doing bdgrab/bput in loop_set_fd/loop_clr_fd is safe, because every time we call loop_set_fd() we check that loop_device->lo_state is Lo_unbound and set it to Lo_bound If somebody will try to set_fd again it will get EBUSY. And if we try to loop_clr_fd() on unbound loop device we'll get ENXIO. loop_set_fd/loop_clr_fd (and any other loop ioctl) is called under loop_device->lo_ctl_mutex. ] Signed-off-by: Anatol Pomozov Cc: Al Viro Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 9 ++++++++- fs/block_dev.c | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fe5f6403417..2c127f9c3f3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -922,6 +922,11 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) ioctl_by_bdev(bdev, BLKRRPART, 0); + + /* Grab the block_device to prevent its destruction after we + * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). 
+ */ + bdgrab(bdev); return 0; out_clr: @@ -1031,8 +1036,10 @@ static int loop_clr_fd(struct loop_device *lo) memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); - if (bdev) + if (bdev) { + bdput(bdev); invalidate_bdev(bdev); + } set_capacity(lo->lo_disk, 0); loop_sysfs_exit(lo); if (bdev) { diff --git a/fs/block_dev.c b/fs/block_dev.c index aea605c98ba..aae187a7f94 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -551,6 +551,7 @@ struct block_device *bdgrab(struct block_device *bdev) ihold(bdev->bd_inode); return bdev; } +EXPORT_SYMBOL(bdgrab); long nr_blockdev_pages(void) { -- cgit v1.2.3-70-g09d2 From 8cde7ad17e4f4ff8d12ff60dd09c0a291cb0b61c Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Wed, 3 Apr 2013 12:27:18 -0400 Subject: ext4: fix big-endian bugs which could cause fs corruptions When an extent was zeroed out, we forgot to convert from cpu to le16. It could make us hit a BUG_ON when we try to write dirty pages out. So fix it. [ Also fix a bug found by Dmitry Monakhov where we were missing le32_to_cpu() calls in the new indirect punch hole code. There are a number of other big endian warnings found by static code analyzers, but we'll wait for the next merge window to fix them all up. These fixes are designed to be Obviously Correct by code inspection, and easy to demonstrate that it won't make any difference (and hence, won't introduce any bugs) on little endian architectures such as x86. --tytso ] Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Reported-by: CAI Qian Reported-by: Christian Kujau Cc: Dmitry Monakhov --- fs/ext4/extents.c | 11 +++++++---- fs/ext4/indirect.c | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 56efcaadf84..9c6d06dcef8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2999,20 +2999,23 @@ static int ext4_split_extent_at(handle_t *handle, if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); zero_ex.ee_block = ex2->ee_block; - zero_ex.ee_len = ext4_ext_get_actual_len(ex2); + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex2)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex2)); } else { err = ext4_ext_zeroout(inode, ex); zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = ext4_ext_get_actual_len(ex); + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); } } else { err = ext4_ext_zeroout(inode, &orig_ex); zero_ex.ee_block = orig_ex.ee_block; - zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); + zero_ex.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(&orig_ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(&orig_ex)); } @@ -3272,7 +3275,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (err) goto out; zero_ex.ee_block = ex->ee_block; - zero_ex.ee_len = ext4_ext_get_actual_len(ex); + zero_ex.ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index b505a145a59..a04183127ef 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1539,9 +1539,9 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode, blk = *i_data; if (level > 0) { ext4_lblk_t first2; - bh = sb_bread(inode->i_sb, blk); + bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, blk, +
EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), "Read failure"); return -EIO; } -- cgit v1.2.3-70-g09d2 From 57c7310b8eb96b0fe3b0aaa8dc194adbae03bef3 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 5 Mar 2013 16:01:47 -0500 Subject: GFS2: use kmalloc for lvb bitmap The temp lvb bitmap was on the stack, which could be an alignment problem for __set_bit_le. Use kmalloc for it instead. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/lock_dlm.c | 31 ++++++++++++++++++------------- 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 156e42ec84e..5c29216e9cc 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -588,6 +588,7 @@ struct lm_lockstruct { struct dlm_lksb ls_control_lksb; /* control_lock */ char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + char *ls_lvb_bits; spinlock_t ls_recover_spin; /* protects following fields */ unsigned long ls_recover_flags; /* DFL_ */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 9802de0f85e..b15bb45911c 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -580,7 +580,6 @@ static void gfs2_control_func(struct work_struct *work) { struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); struct lm_lockstruct *ls = &sdp->sd_lockstruct; - char lvb_bits[GDLM_LVB_SIZE]; uint32_t block_gen, start_gen, lvb_gen, flags; int recover_set = 0; int write_lvb = 0; @@ -634,7 +633,7 @@ static void gfs2_control_func(struct work_struct *work) return; } - control_lvb_read(ls, &lvb_gen, lvb_bits); + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); spin_lock(&ls->ls_recover_spin); if (block_gen != ls->ls_recover_block || @@ -664,10 +663,10 @@ static void gfs2_control_func(struct work_struct *work) ls->ls_recover_result[i] = 0; - if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) + if (!test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) continue; - __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + __clear_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); write_lvb = 1; } } @@ -691,7 +690,7 @@ static void gfs2_control_func(struct work_struct *work) continue; if (ls->ls_recover_submit[i] < start_gen) { ls->ls_recover_submit[i] = 0; - __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET); + __set_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); } } /* even if there are no bits to set, we need to write the @@ -705,7 +704,7 @@ static void gfs2_control_func(struct work_struct *work) spin_unlock(&ls->ls_recover_spin); if (write_lvb) { - control_lvb_write(ls, start_gen, lvb_bits); + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK; } else { flags = DLM_LKF_CONVERT; @@ -725,7 +724,7 @@ static void gfs2_control_func(struct work_struct *work) */ for (i = 0; i < recover_size; i++) { - if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) { + if (test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) { fs_info(sdp, "recover generation %u jid %d\n", start_gen, i); gfs2_recover_set(sdp, i); @@ -758,7 +757,6 @@ static void gfs2_control_func(struct work_struct *work) static int control_mount(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - char lvb_bits[GDLM_LVB_SIZE]; uint32_t start_gen, block_gen, mount_gen, lvb_gen; int mounted_mode; int retries = 0; @@ -857,7 +855,7 @@ locks_done: * lvb_gen will be non-zero. 
*/ - control_lvb_read(ls, &lvb_gen, lvb_bits); + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); if (lvb_gen == 0xFFFFFFFF) { /* special value to force mount attempts to fail */ @@ -887,7 +885,7 @@ locks_done: * and all lvb bits to be clear (no pending journal recoveries.) */ - if (!all_jid_bits_clear(lvb_bits)) { + if (!all_jid_bits_clear(ls->ls_lvb_bits)) { /* journals need recovery, wait until all are clear */ fs_info(sdp, "control_mount wait for journal recovery\n"); goto restart; @@ -949,7 +947,6 @@ static int dlm_recovery_wait(void *word) static int control_first_done(struct gfs2_sbd *sdp) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - char lvb_bits[GDLM_LVB_SIZE]; uint32_t start_gen, block_gen; int error; @@ -991,8 +988,8 @@ restart: memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); spin_unlock(&ls->ls_recover_spin); - memset(lvb_bits, 0, sizeof(lvb_bits)); - control_lvb_write(ls, start_gen, lvb_bits); + memset(ls->ls_lvb_bits, 0, GDLM_LVB_SIZE); + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT); if (error) @@ -1022,6 +1019,12 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, uint32_t old_size, new_size; int i, max_jid; + if (!ls->ls_lvb_bits) { + ls->ls_lvb_bits = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); + if (!ls->ls_lvb_bits) + return -ENOMEM; + } + max_jid = 0; for (i = 0; i < num_slots; i++) { if (max_jid < slots[i].slot - 1) @@ -1057,6 +1060,7 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, static void free_recover_size(struct lm_lockstruct *ls) { + kfree(ls->ls_lvb_bits); kfree(ls->ls_recover_submit); kfree(ls->ls_recover_result); ls->ls_recover_submit = NULL; @@ -1205,6 +1209,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) ls->ls_recover_size = 0; ls->ls_recover_submit = NULL; ls->ls_recover_result = NULL; + ls->ls_lvb_bits = NULL; error = set_recover_size(sdp, NULL, 0); if (error) -- cgit v1.2.3-70-g09d2 From 4146c3d469dc400eefa253fb37aa1e74fb5e41f8 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 7 Mar 2013 23:42:52 +0900 Subject: GFS2: use memchr_inv Use memchr_inv to verify that the specified memory range is cleared. Signed-off-by: Akinobu Mita Cc: Steven Whitehouse Cc: cluster-devel@redhat.com Cc: Christine Caulfield Cc: David Teigland --- fs/gfs2/lock_dlm.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index b15bb45911c..c8423d6de6c 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -483,12 +483,8 @@ static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, static int all_jid_bits_clear(char *lvb) { - int i; - for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) { - if (lvb[i]) - return 0; - } - return 1; + return !memchr_inv(lvb + JID_BITMAP_OFFSET, 0, + GDLM_LVB_SIZE - JID_BITMAP_OFFSET); } static void sync_wait_cb(void *arg) -- cgit v1.2.3-70-g09d2 From 441362d06be349430d06e37286adce4b90e6ce96 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 11 Mar 2013 23:01:37 +0800 Subject: GFS2: return error if malloc failed in gfs2_rs_alloc() The error code in gfs2_rs_alloc() is set to ENOMEM on error but never used; instead, gfs2_rs_alloc() always returns 0. Fix it to return 'error'.
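The bug class in miniature (a hypothetical userspace reduction, not the GFS2 code; res_holder and rs_alloc_sketch are invented names): an error code is computed and then shadowed by a hard-coded return, so callers proceed as if the allocation succeeded:

#include <errno.h>
#include <stdlib.h>

struct res_holder { void *res; };	/* stand-in for the inode's reservation slot */

static int rs_alloc_sketch(struct res_holder *ip)
{
	int error = 0;

	ip->res = calloc(1, 64);
	if (!ip->res)
		error = -ENOMEM;
	/* ... */
	return 0;	/* BUG: discards 'error'; the fix is 'return error;' */
}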
Signed-off-by: Wei Yongjun Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index d1f51fd73f8..70d1cd0b5f3 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -576,7 +576,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip) RB_CLEAR_NODE(&ip->i_res->rs_node); out: up_write(&ip->i_rw_mutex); - return 0; + return error; } static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) -- cgit v1.2.3-70-g09d2 From c2952d202f710d326ac36a8ea6bd216b20615ec8 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 14 Mar 2013 15:49:59 +0000 Subject: GFS2: Fix unlock of fcntl locks during withdrawn state When withdraw occurs, we need to continue to allow unlocks of fcntl locks to occur, however these will only be local, since the node has withdrawn from the cluster. This prevents triggering a VFS level bug trap due to locks remaining when a file is closed. Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 019f45e4509..d79c2dadc53 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -923,8 +923,11 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) cmd = F_SETLK; fl->fl_type = F_UNLCK; } - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { + if (fl->fl_type == F_UNLCK) + posix_lock_file_wait(file, fl); return -EIO; + } if (IS_GETLK(cmd)) return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); else if (fl->fl_type == F_UNLCK) -- cgit v1.2.3-70-g09d2 From b2c87cae0edb1a99f7dd2751d5beb2cb97926514 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 22 Mar 2013 10:07:24 -0400 Subject: GFS2: Issue discards in 512b sectors This patch changes GFS2's discard issuing code so that it calls function sb_issue_discard rather than blkdev_issue_discard. The code was calling blkdev_issue_discard and specifying the correct sector offset and sector size, but blkdev_issue_discard expects these values to be in terms of 512 byte sectors, even if the native sector size for the device is different. Calling sb_issue_discard with the BLOCK size instead ensures the correct block-to-512b-sector translation. I verified that "minlen" is specified in blocks, so comparing it to a number of blocks is correct. 
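The unit conversion at the heart of this, worked through: blkdev_issue_discard() always takes 512-byte sectors, while the old code converted blocks into device logical sectors, so on a drive with 4096-byte logical sectors sects_per_blk = 4096 / 4096 = 1 and every offset and length passed down was 8 times too small. sb_issue_discard() is essentially the following wrapper, which converts filesystem blocks straight to 512-byte units:

static inline int sb_issue_discard_sketch(struct super_block *sb,
					  sector_t block, sector_t nr_blks)
{
	/* s_blocksize_bits - 9 converts fs blocks to 512-byte sectors;
	 * e.g. with 4096-byte blocks: block << (12 - 9) == block * 8 */
	return blkdev_issue_discard(sb->s_bdev,
				    block << (sb->s_blocksize_bits - 9),
				    nr_blks << (sb->s_blocksize_bits - 9),
				    GFP_NOFS, 0);
}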
Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 70d1cd0b5f3..5a51265a434 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1181,12 +1181,9 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed) { struct super_block *sb = sdp->sd_vfs; - struct block_device *bdev = sb->s_bdev; - const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / - bdev_logical_block_size(sb->s_bdev); u64 blk; sector_t start = 0; - sector_t nr_sects = 0; + sector_t nr_blks = 0; int rv; unsigned int x; u32 trimmed = 0; @@ -1206,35 +1203,34 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, if (diff == 0) continue; blk = offset + ((bi->bi_start + x) * GFS2_NBBY); - blk *= sects_per_blk; /* convert to sectors */ while(diff) { if (diff & 1) { - if (nr_sects == 0) + if (nr_blks == 0) goto start_new_extent; - if ((start + nr_sects) != blk) { - if (nr_sects >= minlen) { - rv = blkdev_issue_discard(bdev, - start, nr_sects, + if ((start + nr_blks) != blk) { + if (nr_blks >= minlen) { + rv = sb_issue_discard(sb, + start, nr_blks, GFP_NOFS, 0); if (rv) goto fail; - trimmed += nr_sects; + trimmed += nr_blks; } - nr_sects = 0; + nr_blks = 0; start_new_extent: start = blk; } - nr_sects += sects_per_blk; + nr_blks++; } diff >>= 2; - blk += sects_per_blk; + blk++; } } - if (nr_sects >= minlen) { - rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0); + if (nr_blks >= minlen) { + rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0); if (rv) goto fail; - trimmed += nr_sects; + trimmed += nr_blks; } if (ptrimmed) *ptrimmed = trimmed; -- cgit v1.2.3-70-g09d2 From b193d59a4863ea670872d76dc99231ddeb598625 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Apr 2013 15:55:00 -0400 Subject: NFSv4: Fix a memory leak in nfs4_discover_server_trunking When we assign a new rpc_client to clp->cl_rpcclient, we need to destroy the old one. Signed-off-by: Trond Myklebust Cc: Chuck Lever Cc: stable@vger.kernel.org [>=3.7] --- fs/nfs/nfs4state.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6ace365c633..d41a3518509 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1886,7 +1886,13 @@ again: status = PTR_ERR(clnt); break; } - clp->cl_rpcclient = clnt; + /* Note: this is safe because we haven't yet marked the + * client as ready, so we are the only user of + * clp->cl_rpcclient + */ + clnt = xchg(&clp->cl_rpcclient, clnt); + rpc_shutdown_client(clnt); + clnt = clp->cl_rpcclient; goto again; case -NFS4ERR_MINOR_VERS_MISMATCH: -- cgit v1.2.3-70-g09d2 From 7b1f1fd1842e6ede25183c267ae733a7f67f00bc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 5 Apr 2013 16:11:11 -0400 Subject: NFSv4/4.1: Fix bugs in nfs4[01]_walk_client_list It is unsafe to use list_for_each_entry_safe() here, because when we drop the nn->nfs_client_lock, we pin the _current_ list entry and ensure that it stays in the list, but we don't do the same for the _next_ list entry. Use of list_for_each_entry() is therefore the correct thing to do. Also fix the refcounting in nfs41_walk_client_list(). Finally, ensure that the nfs_client has finished being initialised and, in the case of NFSv4.1, that the session is set up. 
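Why the _safe variant is the wrong tool once the lock can be dropped, as an illustrative sketch (wait_for_init() stands in for the blocking calls): list_for_each_entry_safe() caches the next entry up front, and nothing protects that cached pointer while the lock is released:

list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
	/* 'n' was captured as pos's successor while the lock was held */
	spin_unlock(&nn->nfs_client_lock);
	wait_for_init(pos);		/* may sleep; 'n' can be freed meanwhile */
	spin_lock(&nn->nfs_client_lock);
	/* the iterator now advances to the possibly stale 'n' */
}

/* list_for_each_entry() instead re-reads pos->cl_share_link.next after
 * relocking, and the code pins 'pos' itself with atomic_inc(&pos->cl_count),
 * so the current entry is guaranteed to stay on the list. */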
Signed-off-by: Trond Myklebust Cc: Chuck Lever Cc: Bryan Schumaker Cc: stable@vger.kernel.org [>= 3.7] --- fs/nfs/nfs4client.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index ac4fc9a8fdb..c7b346f8cc4 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -300,7 +300,7 @@ int nfs40_walk_client_list(struct nfs_client *new, struct rpc_cred *cred) { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); - struct nfs_client *pos, *n, *prev = NULL; + struct nfs_client *pos, *prev = NULL; struct nfs4_setclientid_res clid = { .clientid = new->cl_clientid, .confirm = new->cl_confirm, @@ -308,10 +308,23 @@ int nfs40_walk_client_list(struct nfs_client *new, int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); - list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { + list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos" */ - if (pos->cl_cons_state < NFS_CS_READY) + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + if (prev) + nfs_put_client(prev); + prev = pos; + + status = nfs_wait_client_init_complete(pos); + spin_lock(&nn->nfs_client_lock); + if (status < 0) + continue; + } + if (pos->cl_cons_state != NFS_CS_READY) continue; if (pos->rpc_ops != new->rpc_ops) @@ -423,16 +436,16 @@ int nfs41_walk_client_list(struct nfs_client *new, struct rpc_cred *cred) { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); - struct nfs_client *pos, *n, *prev = NULL; + struct nfs_client *pos, *prev = NULL; int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); - list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { + list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos", especially the client * ID and serverowner fields. Wait for CREATE_SESSION * to finish. */ - if (pos->cl_cons_state < NFS_CS_READY) { + if (pos->cl_cons_state > NFS_CS_READY) { atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); @@ -440,18 +453,17 @@ int nfs41_walk_client_list(struct nfs_client *new, nfs_put_client(prev); prev = pos; - nfs4_schedule_lease_recovery(pos); status = nfs_wait_client_init_complete(pos); - if (status < 0) { - nfs_put_client(pos); - spin_lock(&nn->nfs_client_lock); - continue; + if (status == 0) { + nfs4_schedule_lease_recovery(pos); + status = nfs4_wait_clnt_recover(pos); } - status = pos->cl_cons_state; spin_lock(&nn->nfs_client_lock); if (status < 0) continue; } + if (pos->cl_cons_state != NFS_CS_READY) + continue; if (pos->rpc_ops != new->rpc_ops) continue; @@ -469,17 +481,17 @@ int nfs41_walk_client_list(struct nfs_client *new, continue; atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); + *result = pos; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); - - *result = pos; - return 0; + break; } /* No matching nfs_client found. 
*/ spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s status = %d\n", __func__, status); + if (prev) + nfs_put_client(prev); return status; } #endif /* CONFIG_NFS_V4_1 */ -- cgit v1.2.3-70-g09d2 From fa332941c0c7c00e3420078268b7558d0ef792b5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 Apr 2013 12:56:52 -0400 Subject: NFSv4: Fix another potential state manager deadlock Don't hold the NFSv4 sequence id while we check for open permission. The call to ACCESS may block due to reboot recovery. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 26431cf62dd..0ad025eb523 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1046,6 +1046,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) /* Save the delegation */ nfs4_stateid_copy(&stateid, &delegation->stateid); rcu_read_unlock(); + nfs_release_seqid(opendata->o_arg.seqid); ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); if (ret != 0) goto out; -- cgit v1.2.3-70-g09d2 From 52f21999c7b921a0390708b66ed286282c2e4bee Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 28 Mar 2013 13:30:23 -0400 Subject: ecryptfs: close rmmod race Signed-off-by: Al Viro --- fs/ecryptfs/miscdev.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 412e6eda25f..e4141f25749 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -80,13 +80,6 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) int rc; mutex_lock(&ecryptfs_daemon_hash_mux); - rc = try_module_get(THIS_MODULE); - if (rc == 0) { - rc = -EIO; - printk(KERN_ERR "%s: Error attempting to increment module use " - "count; rc = [%d]\n", __func__, rc); - goto out_unlock_daemon_list; - } rc = ecryptfs_find_daemon_by_euid(&daemon); if (!rc) { rc = -EINVAL; @@ -96,7 +89,7 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) if (rc) { printk(KERN_ERR "%s: Error attempting to spawn daemon; " "rc = [%d]\n", __func__, rc); - goto out_module_put_unlock_daemon_list; + goto out_unlock_daemon_list; } mutex_lock(&daemon->mux); if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) { @@ -108,9 +101,6 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file) atomic_inc(&ecryptfs_num_miscdev_opens); out_unlock_daemon: mutex_unlock(&daemon->mux); -out_module_put_unlock_daemon_list: - if (rc) - module_put(THIS_MODULE); out_unlock_daemon_list: mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; @@ -147,7 +137,6 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file) "bug.\n", __func__, rc); BUG(); } - module_put(THIS_MODULE); return rc; } @@ -471,6 +460,7 @@ out_free: static const struct file_operations ecryptfs_miscdev_fops = { + .owner = THIS_MODULE, .open = ecryptfs_miscdev_open, .poll = ecryptfs_miscdev_poll, .read = ecryptfs_miscdev_read, -- cgit v1.2.3-70-g09d2 From 8ce584c7416d8a85a6f3edc17d1cddefe331e87e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Mar 2013 20:13:46 -0400 Subject: procfs: add proc_remove_subtree() just what it sounds like; do that only to procfs subtrees you've created - doing that to something shared with another driver is not only antisocial, but might cause interesting races with proc_create() and its ilk. 
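A hypothetical usage sketch (module and file names invented for illustration): a driver that created its own subtree can now tear the whole thing down in one call, rather than removing each entry bottom-up with remove_proc_entry():

	static struct proc_dir_entry *mydrv_dir;

	static int __init mydrv_init(void)
	{
		mydrv_dir = proc_mkdir("mydrv", NULL);
		if (!mydrv_dir)
			return -ENOMEM;
		/* mydrv_stats_fops is an assumed file_operations instance */
		proc_create("stats", 0444, mydrv_dir, &mydrv_stats_fops);
		return 0;
	}

	static void __exit mydrv_exit(void)
	{
		/* only safe on a subtree this module created itself */
		remove_proc_subtree("mydrv", NULL);
	}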
Signed-off-by: Al Viro --- fs/proc/generic.c | 119 ++++++++++++++++++++++++++++++++++++------------ include/linux/proc_fs.h | 2 + 2 files changed, 91 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4b3b3ffb52f..21e1a8f1659 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -755,37 +755,8 @@ void pde_put(struct proc_dir_entry *pde) free_proc_entry(pde); } -/* - * Remove a /proc entry and free it if it's not currently in use. - */ -void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +static void entry_rundown(struct proc_dir_entry *de) { - struct proc_dir_entry **p; - struct proc_dir_entry *de = NULL; - const char *fn = name; - unsigned int len; - - spin_lock(&proc_subdir_lock); - if (__xlate_proc_name(name, &parent, &fn) != 0) { - spin_unlock(&proc_subdir_lock); - return; - } - len = strlen(fn); - - for (p = &parent->subdir; *p; p=&(*p)->next ) { - if (proc_match(len, fn, *p)) { - de = *p; - *p = de->next; - de->next = NULL; - break; - } - } - spin_unlock(&proc_subdir_lock); - if (!de) { - WARN(1, "name '%s'\n", name); - return; - } - spin_lock(&de->pde_unload_lock); /* * Stop accepting new callers into module. If you're @@ -817,6 +788,40 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) spin_lock(&de->pde_unload_lock); } spin_unlock(&de->pde_unload_lock); +} + +/* + * Remove a /proc entry and free it if it's not currently in use. + */ +void remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry **p; + struct proc_dir_entry *de = NULL; + const char *fn = name; + unsigned int len; + + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); + return; + } + len = strlen(fn); + + for (p = &parent->subdir; *p; p=&(*p)->next ) { + if (proc_match(len, fn, *p)) { + de = *p; + *p = de->next; + de->next = NULL; + break; + } + } + spin_unlock(&proc_subdir_lock); + if (!de) { + WARN(1, "name '%s'\n", name); + return; + } + + entry_rundown(de); if (S_ISDIR(de->mode)) parent->nlink--; @@ -827,3 +832,57 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) pde_put(de); } EXPORT_SYMBOL(remove_proc_entry); + +int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) +{ + struct proc_dir_entry **p; + struct proc_dir_entry *root = NULL, *de, *next; + const char *fn = name; + unsigned int len; + + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; + } + len = strlen(fn); + + for (p = &parent->subdir; *p; p=&(*p)->next ) { + if (proc_match(len, fn, *p)) { + root = *p; + *p = root->next; + root->next = NULL; + break; + } + } + if (!root) { + spin_unlock(&proc_subdir_lock); + return -ENOENT; + } + de = root; + while (1) { + next = de->subdir; + if (next) { + de->subdir = next->next; + next->next = NULL; + de = next; + continue; + } + spin_unlock(&proc_subdir_lock); + + entry_rundown(de); + next = de->parent; + if (S_ISDIR(de->mode)) + next->nlink--; + de->nlink = 0; + if (de == root) + break; + pde_put(de); + + spin_lock(&proc_subdir_lock); + de = next; + } + pde_put(root); + return 0; +} +EXPORT_SYMBOL(remove_proc_subtree); diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 8307f2f94d8..94dfb2aa553 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -117,6 +117,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, const struct 
file_operations *proc_fops, void *data); extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); +extern int remove_proc_subtree(const char *name, struct proc_dir_entry *parent); struct pid_namespace; @@ -202,6 +203,7 @@ static inline struct proc_dir_entry *proc_create_data(const char *name, return NULL; } #define remove_proc_entry(name, parent) do {} while (0) +#define remove_proc_subtree(name, parent) do {} while (0) static inline struct proc_dir_entry *proc_symlink(const char *name, struct proc_dir_entry *parent,const char *dest) {return NULL;} -- cgit v1.2.3-70-g09d2 From e9c5d8a562f01b211926d70443378eb14b29a676 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 9 Apr 2013 17:33:29 +0400 Subject: mnt: release locks on error path in do_loopback do_loopback calls lock_mount(path) and forgets to unlock_mount if clone_mnt or copy_mnt fails.

[ 77.661566] ================================================
[ 77.662939] [ BUG: lock held when returning to user space! ]
[ 77.664104] 3.9.0-rc5+ #17 Not tainted
[ 77.664982] ------------------------------------------------
[ 77.666488] mount/514 is leaving the kernel with locks still held!
[ 77.668027] 2 locks held by mount/514:
[ 77.668817] #0: (&sb->s_type->i_mutex_key#7){+.+.+.}, at: [] lock_mount+0x32/0xe0
[ 77.671755] #1: (&namespace_sem){+++++.}, at: [] lock_mount+0x4a/0xe0

Signed-off-by: Andrey Vagin Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb4..6c7d31eebba 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1686,7 +1686,7 @@ static int do_loopback(struct path *path, const char *old_name, if (IS_ERR(mnt)) { err = PTR_ERR(mnt); - goto out; + goto out2; } err = graft_tree(mnt, path); -- cgit v1.2.3-70-g09d2 From eb04e0ac198cec3bab407ad220438dfa65c19c67 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 10 Apr 2013 12:44:18 -0400 Subject: NFSv4: Doh! Typo in the fix to nfs41_walk_client_list Make sure that we set the status to 0 on success. Missed in testing because it never appears when doing multiple mounts to _different_ servers. Signed-off-by: Trond Myklebust Cc: # 3.7.x: 7b1f1fd: NFSv4/4.1: Fix bugs in nfs4[01]_walk_client_list --- fs/nfs/nfs4client.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index c7b346f8cc4..66b6664dcd4 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -482,6 +482,7 @@ int nfs41_walk_client_list(struct nfs_client *new, atomic_inc(&pos->cl_count); *result = pos; + status = 0; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); break; -- cgit v1.2.3-70-g09d2 From c369c9a4a7c82d33329d869cbaf93304cc7a0c40 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Tue, 9 Apr 2013 18:17:41 +0100 Subject: cifs: Allow passwords which begin with a delimiter Fixes a regression in cifs_parse_mount_options where a password which begins with a delimiter is parsed incorrectly as being a blank password.
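A worked example may help (illustrative only, assuming ',' as the delimiter and that the option string has been tokenized in place, as cifs does): cifs escapes a delimiter inside the password by doubling it, so for the options string

	user=foo,pass=,,secret,domain=BAR

the tokenizer splits at the first ',' after "pass=" and hands the parser an apparently blank value. The fix peeks one character past the '=' in the original buffer before concluding the password is blank:

	tmp_end = strchr(data, '=');
	tmp_end++;	/* lands on the '\0' that replaced the first delimiter */
	if (!(tmp_end < end && tmp_end[1] == delim)) {
		vol->password = NULL;	/* genuinely blank password */
		break;
	}
	/* password really begins with the delimiter: fall through to Opt_pass */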
Signed-off-by: Sachin Prabhu Acked-by: Jeff Layton Cc: Signed-off-by: Steve French --- fs/cifs/connect.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 991c63c6bdd..21b3a291c32 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1575,14 +1575,24 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } break; case Opt_blank_pass: - vol->password = NULL; - break; - case Opt_pass: /* passwords have to be handled differently * to allow the character used for deliminator * to be passed within them */ + /* + * Check if this is a case where the password + * starts with a delimiter + */ + tmp_end = strchr(data, '='); + tmp_end++; + if (!(tmp_end < end && tmp_end[1] == delim)) { + /* No it is not. Set the password to NULL */ + vol->password = NULL; + break; + } + /* Yes it is. Drop down to Opt_pass below.*/ + case Opt_pass: /* Obtain the value string */ value = strchr(data, '='); value++; -- cgit v1.2.3-70-g09d2 From f2530dc71cf0822f90bb63ea4600caaef33a66bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Apr 2013 09:33:34 +0200 Subject: kthread: Prevent unpark race which puts threads on the wrong cpu The smpboot threads rely on the park/unpark mechanism which binds per cpu threads on a particular core. However, the functionality is racy:

CPU0                    CPU1                    CPU2
unpark(T)
                        wake_up_process(T)
clear(SHOULD_PARK)
                        T runs
                        leave parkme() due to !SHOULD_PARK
bind_to(CPU2)
                        BUG_ON(wrong CPU)

We cannot let the tasks move themselves to the target CPU as one of those tasks is actually the migration thread itself, which requires that it starts running on the target cpu right away. The solution to this problem is to prevent wakeups in park mode which are not from unpark(). That way we can guarantee that the association of the task to the target cpu is working correctly. Add a new task state (TASK_PARKED) which prevents other wakeups and use this state explicitly for the unpark wakeup. Peter noticed: Also, since the task state is visible to userspace and all the parked tasks are still in the PID space, it's a good hint in ps and friends that these tasks aren't really there for the moment. The migration thread has another related issue.

CPU0                    CPU1
Bring up CPU2
create_thread(T)
park(T)
wait_for_completion()
                        parkme()
                        complete()
sched_set_stop_task()
                        schedule(TASK_PARKED)

The sched_set_stop_task() call is issued while the task is on the runqueue of CPU1 and that confuses the hell out of the stop_task class on that cpu. So we need the same synchronization before sched_set_stop_task(). Reported-by: Dave Jones Reported-and-tested-by: Dave Hansen Reported-and-tested-by: Borislav Petkov Acked-by: Peter Zijlstra Cc: Srivatsa S.
Bhat Cc: dhillf@gmail.com Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304091635430.21884@ionos Signed-off-by: Thomas Gleixner --- fs/proc/array.c | 1 + include/linux/sched.h | 5 +++-- include/trace/events/sched.h | 2 +- kernel/kthread.c | 52 ++++++++++++++++++++++++-------------------- kernel/smpboot.c | 14 ++++++++++-- 5 files changed, 45 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/proc/array.c b/fs/proc/array.c index f7ed9ee46eb..cbd0f1b324b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -143,6 +143,7 @@ static const char * const task_state_array[] = { "x (dead)", /* 64 */ "K (wakekill)", /* 128 */ "W (waking)", /* 256 */ + "P (parked)", /* 512 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbf..e692a022527 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -163,9 +163,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_DEAD 64 #define TASK_WAKEKILL 128 #define TASK_WAKING 256 -#define TASK_STATE_MAX 512 +#define TASK_PARKED 512 +#define TASK_STATE_MAX 1024 -#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" +#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 5a8671e8a67..e5586caff67 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -147,7 +147,7 @@ TRACE_EVENT(sched_switch, __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, { 16, "Z" }, { 32, "X" }, { 64, "x" }, - { 128, "W" }) : "R", + { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9ba..9eb7fed0bba 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -124,12 +124,12 @@ void *kthread_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) complete(&self->parked); schedule(); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); } clear_bit(KTHREAD_IS_PARKED, &self->flags); __set_current_state(TASK_RUNNING); @@ -256,8 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu) +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) { + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, state)) { + WARN_ON(1); + return; + } /* It's safe because the task is inactive. 
*/ do_set_cpus_allowed(p, cpumask_of(cpu)); p->flags |= PF_THREAD_BOUND; @@ -274,12 +279,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - __kthread_bind(p, cpu); + __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); @@ -324,6 +324,22 @@ static struct kthread *task_get_live_kthread(struct task_struct *k) return NULL; } +static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) +{ + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu, TASK_PARKED); + wake_up_state(k, TASK_PARKED); + } +} + /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). @@ -336,20 +352,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = task_get_live_kthread(k); - if (kthread) { - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - /* - * We clear the IS_PARKED bit here as we don't wait - * until the task has left the park code. So if we'd - * park before that happens we'd see the IS_PARKED bit - * which might be about to be cleared. - */ - if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) - __kthread_bind(k, kthread->cpu); - wake_up_process(k); - } - } + if (kthread) + __kthread_unpark(k, kthread); put_task_struct(k); } @@ -407,7 +411,7 @@ int kthread_stop(struct task_struct *k) trace_sched_kthread_stop(k); if (kthread) { set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 8eaed9aa9cf..02fc5c93367 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) } get_task_struct(tsk); *per_cpu_ptr(ht->store, cpu) = tsk; - if (ht->create) - ht->create(cpu); + if (ht->create) { + /* + * Make sure that the task has actually scheduled out + * into park position, before calling the create + * callback. At least the migration thread callback + * requires that the task is off the runqueue. + */ + if (!wait_task_inactive(tsk, TASK_PARKED)) + WARN_ON(1); + else + ht->create(cpu); + } return 0; } -- cgit v1.2.3-70-g09d2 From 4bc4bee4595662d8bff92180d5c32e3313a704b0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 5 Apr 2013 20:50:09 +0000 Subject: Btrfs: make sure nbytes are right after log replay While trying to track down a tree log replay bug I noticed that fsck was always complaining about nbytes not being right for our fsynced file. That is because the new fsync stuff doesn't wait for ordered extents to complete, so the inodes nbytes are not necessarily updated properly when we log it. So to fix this we need to set nbytes to whatever it is on the inode that is on disk, so when we replay the extents we can just add the bytes that are being added as we replay the extent. 
This makes it work for the case that we have the wrong nbytes or the case that we logged everything and nbytes is actually correct. With this I'm no longer getting nbytes errors out of btrfsck. Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 48 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 451fad96ecd..ef96381569a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -317,6 +317,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, unsigned long src_ptr; unsigned long dst_ptr; int overwrite_root = 0; + bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) overwrite_root = 1; @@ -326,6 +327,9 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, /* look for the key in the destination tree */ ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + if (ret == 0) { char *src_copy; char *dst_copy; @@ -367,6 +371,30 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, return 0; } + /* + * We need to load the old nbytes into the inode so when we + * replay the extents we've logged we get the right nbytes. + */ + if (inode_item) { + struct btrfs_inode_item *item; + u64 nbytes; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + nbytes = btrfs_inode_nbytes(path->nodes[0], item); + item = btrfs_item_ptr(eb, slot, + struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, nbytes); + } + } else if (inode_item) { + struct btrfs_inode_item *item; + + /* + * New inode, set nbytes to 0 so that the nbytes comes out + * properly when we replay the extents. + */ + item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); + btrfs_set_inode_nbytes(eb, item, 0); } insert: btrfs_release_path(path); @@ -486,7 +514,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, int found_type; u64 extent_end; u64 start = key->offset; - u64 saved_nbytes; + u64 nbytes = 0; struct btrfs_file_extent_item *item; struct inode *inode = NULL; unsigned long size; @@ -496,10 +524,19 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(eb, item); if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) - extent_end = start + btrfs_file_extent_num_bytes(eb, item); - else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + nbytes = btrfs_file_extent_num_bytes(eb, item); + extent_end = start + nbytes; + + /* + * We don't add to the inodes nbytes if we are prealloc or a + * hole. 
+ */ + if (btrfs_file_extent_disk_bytenr(eb, item) == 0) + nbytes = 0; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size = btrfs_file_extent_inline_len(eb, item); + nbytes = btrfs_file_extent_ram_bytes(eb, item); extent_end = ALIGN(start + size, root->sectorsize); } else { ret = 0; @@ -548,7 +585,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1); BUG_ON(ret); @@ -635,7 +671,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } - inode_set_bytes(inode, saved_nbytes); + inode_add_bytes(inode, nbytes); ret = btrfs_update_inode(trans, root, inode); out: if (inode) -- cgit v1.2.3-70-g09d2 From 5b55d708335a9e3e4f61f2dadf7511502205ccd1 Mon Sep 17 00:00:00 2001 From: Suleiman Souhlal Date: Sat, 13 Apr 2013 16:03:06 -0700 Subject: vfs: Revert spurious fix to spinning prevention in prune_icache_sb Revert commit 62a3ddef6181 ("vfs: fix spinning prevention in prune_icache_sb"). This commit doesn't look right: since we are looking at the tail of the list (sb->s_inode_lru.prev) if we want to skip an inode, we should put it back at the head of the list instead of the tail, otherwise we will keep spinning on it. Discovered when investigating why prune_icache_sb came top in perf reports of a swapping load. Signed-off-by: Suleiman Souhlal Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org # v3.2+ Signed-off-by: Linus Torvalds --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index f5f7c06c36f..a898b3d43cc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -725,7 +725,7 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) * inode to the back of the list so we don't spin on it. */ if (!spin_trylock(&inode->i_lock)) { - list_move_tail(&inode->i_lru, &sb->s_inode_lru); + list_move(&inode->i_lru, &sb->s_inode_lru); continue; } -- cgit v1.2.3-70-g09d2 From a2fce9143057f4eb7675a21cca1b6beabe585c8b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 17 Apr 2013 15:58:27 -0700 Subject: hugetlbfs: stop setting VM_DONTDUMP in initializing vma(VM_HUGETLB) Currently we fail to include any data on hugepages into coredump, because VM_DONTDUMP is set on hugetlbfs's vma. This behavior was recently introduced by commit 314e51b9851b ("mm: kill vma flag VM_RESERVED and mm->reserved_vm counter"). This looks to me a serious regression, so let's fix it. Signed-off-by: Naoya Horiguchi Acked-by: Konstantin Khlebnikov Acked-by: Michal Hocko Reviewed-by: Rik van Riel Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Cc: [3.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 84e3d856e91..523464e6284 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -110,7 +110,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap_pgoff unwinds (may be important on powerpc * and ia64). 
*/ - vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; vma->vm_ops = &hugetlb_vm_ops; if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) -- cgit v1.2.3-70-g09d2 From 23d9e482136e31c9d287633a6e473daa172767c4 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 17 Apr 2013 15:58:28 -0700 Subject: fs/binfmt_elf.c: fix hugetlb memory check in vma_dump_size() Documentation/filesystems/proc.txt says about coredump_filter bitmask, Note bit 0-4 doesn't effect any hugetlb memory. hugetlb memory are only effected by bit 5-6. However current code can go into the subsequent flag checks of bit 0-4 for vma(VM_HUGETLB). So this patch inserts 'return' and makes it work as written in the document. Signed-off-by: Naoya Horiguchi Reviewed-by: Rik van Riel Acked-by: Michal Hocko Reviewed-by: HATAYAMA Daisuke Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Cc: [3.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3939829f6c5..86af964c242 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1137,6 +1137,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, goto whole; if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) goto whole; + return 0; } /* Do not dump I/O mapped devices or special mappings */ -- cgit v1.2.3-70-g09d2 From 12f267a20aecf8b84a2a9069b9011f1661c779b4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Wed, 17 Apr 2013 15:58:33 -0700 Subject: hfsplus: fix potential overflow in hfsplus_file_truncate() Change a u32 to loff_t hfsplus_file_truncate(). Signed-off-by: Vyacheslav Dubeyko Cc: Christoph Hellwig Cc: Al Viro Cc: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index a94f0f779d5..fe0a76213d9 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -533,7 +533,7 @@ void hfsplus_file_truncate(struct inode *inode) struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - u32 size = inode->i_size; + loff_t size = inode->i_size; res = pagecache_write_begin(NULL, mapping, size, 0, AOP_FLAG_UNINTERRUPTIBLE, -- cgit v1.2.3-70-g09d2 From 0a82a8d132b26d438eb90b3ab35a7016e7227a1d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 18 Apr 2013 09:00:26 -0700 Subject: Revert "block: add missing block_bio_complete() tracepoint" This reverts commit 3a366e614d0837d9fc23f78cdb1a1186ebc3387f. Wanlong Gao reports that it causes a kernel panic on his machine several minutes after boot. Reverting it removes the panic. Jens says: "It's not quite clear why that is yet, so I think we should just revert the commit for 3.9 final (which I'm assuming is pretty close). The wifi is crap at the LSF hotel, so sending this email instead of queueing up a revert and pull request." 
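For context, a sketch of the convention the revert restores (driver names invented for illustration): bio-based drivers once again emit the completion tracepoint themselves, immediately before completing the bio, because bio_endio() no longer traces:

	/* illustrative only -- "mydrv" stands in for a bio-based driver */
	static void mydrv_end_io(struct mydrv_io *io, int error)
	{
		struct bio *bio = io->orig_bio;

		trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
					 bio, error);
		bio_endio(bio, error);
	}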
Reported-by: Wanlong Gao Requested-by: Jens Axboe Cc: Tejun Heo Cc: Steven Rostedt Signed-off-by: Linus Torvalds --- block/blk-core.c | 1 + drivers/md/dm.c | 1 + drivers/md/raid5.c | 11 ++++++++++- fs/bio.c | 2 -- include/linux/blktrace_api.h | 1 - include/trace/events/block.h | 8 ++++---- kernel/trace/blktrace.c | 26 +++----------------------- 7 files changed, 19 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/block/blk-core.c b/block/blk-core.c index 074b758efc4..7c288358a74 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7e469260fe5..9a0bdad9ad8 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -611,6 +611,7 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ + trace_block_bio_complete(md->queue, bio, io_error); bio_endio(bio, io_error); } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 24909eb13fe..f4e87bfc756 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -184,6 +184,8 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); bi = return_bi; } @@ -3914,6 +3916,8 @@ static void raid5_align_endio(struct bio *bi, int error) rdev_dec_pending(rdev, conf->mddev); if (!error && uptodate) { + trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), + raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); @@ -4382,6 +4386,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); } } @@ -4758,8 +4764,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) handled++; } remaining = raid5_dec_bi_active_stripes(raid_bio); - if (remaining == 0) + if (remaining == 0) { + trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), + raid_bio, 0); bio_endio(raid_bio, 0); + } if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; diff --git a/fs/bio.c b/fs/bio.c index bb5768f59b3..b96fc6ce485 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1428,8 +1428,6 @@ void bio_endio(struct bio *bio, int error) else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - trace_block_bio_complete(bio, error); - if (bio->bi_end_io) bio->bi_end_io(bio, error); } diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 0ea61e07a91..7c2e030e72f 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -12,7 +12,6 @@ struct blk_trace { int trace_state; - bool rq_based; struct rchan *rchan; unsigned long __percpu *sequence; unsigned char __percpu *msg_data; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 9961726523d..9c1467357b0 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -257,6 +257,7 @@ TRACE_EVENT(block_bio_bounce, /** * block_bio_complete - completed all work on the block operation + * @q: queue holding the block operation * @bio: block operation completed * @error: io error value * @@ -265,9 +266,9 @@ 
TRACE_EVENT(block_bio_bounce, */ TRACE_EVENT(block_bio_complete, - TP_PROTO(struct bio *bio, int error), + TP_PROTO(struct request_queue *q, struct bio *bio, int error), - TP_ARGS(bio, error), + TP_ARGS(q, bio, error), TP_STRUCT__entry( __field( dev_t, dev ) @@ -278,8 +279,7 @@ TRACE_EVENT(block_bio_complete, ), TP_fast_assign( - __entry->dev = bio->bi_bdev ? - bio->bi_bdev->bd_dev : 0; + __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_sector; __entry->nr_sector = bio->bi_size >> 9; __entry->error = error; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9e5b8c272ee..5a0f781cd72 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, struct request *rq) { - struct blk_trace *bt = q->blk_trace; - - /* if control ever passes through here, it's a request based driver */ - if (unlikely(bt && !bt->rq_based)) - bt->rq_based = true; - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); } @@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore, blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } -static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio, + int error) { - struct request_queue *q; - struct blk_trace *bt; - - if (!bio->bi_bdev) - return; - - q = bdev_get_queue(bio->bi_bdev); - bt = q->blk_trace; - - /* - * Request based drivers will generate both rq and bio completions. - * Ignore bio ones. - */ - if (likely(!bt) || bt->rq_based) - return; - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } -- cgit v1.2.3-70-g09d2 From a0a9434dd50aac5971d63207ff1e25e69c9abdb3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 23 Apr 2013 11:20:35 +0100 Subject: ARM: 7701/1: mm: Allow arch code to control the user page table ceiling On architectures where a pgd entry may be shared between user and kernel (e.g. ARM+LPAE), freeing page tables needs a ceiling other than 0. This patch introduces a generic USER_PGTABLES_CEILING that arch code can override. It is the responsibility of the arch code setting the ceiling to ensure the complete freeing of the page tables (usually in pgd_free()). [catalin.marinas@arm.com: commit log; shift_arg_pages(), asm-generic/pgtables.h changes] Signed-off-by: Hugh Dickins Signed-off-by: Catalin Marinas Cc: Andrew Morton Cc: # 3.3+ Signed-off-by: Russell King --- fs/exec.c | 4 ++-- include/asm-generic/pgtable.h | 10 ++++++++++ mm/mmap.c | 4 ++-- 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index a96a4885bbb..87e731f020f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -613,7 +613,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : 0); + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch @@ -622,7 +622,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : 0); + vma->vm_next ? 
vma->vm_next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb, new_end, old_end); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index bfd87685fc1..a59ff51b016 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -7,6 +7,16 @@ #include #include +/* + * On almost all architectures and configurations, 0 can be used as the + * upper ceiling to free_pgtables(): on many architectures it has the same + * effect as using TASK_SIZE. However, there is one configuration which + * must impose a more careful limit, to avoid freeing kernel pgtables. + */ +#ifndef USER_PGTABLES_CEILING +#define USER_PGTABLES_CEILING 0UL +#endif + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, diff --git a/mm/mmap.c b/mm/mmap.c index 2664a47cec9..067ac4d1f29 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2303,7 +2303,7 @@ static void unmap_region(struct mm_struct *mm, update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, - next ? next->vm_start : 0); + next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, start, end); } @@ -2683,7 +2683,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); /* -- cgit v1.2.3-70-g09d2
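To make the override hook concrete, a hypothetical arch-side definition (architecture and config names invented; ARM+LPAE is the configuration this series actually targets): an architecture whose top-level page table is shared between user and kernel mappings defines the ceiling in its pgtable header, and free_pgtables() then stops short of the kernel entries:

	/* arch/xyz/include/asm/pgtable.h -- illustrative sketch only */
	#ifdef CONFIG_XYZ_SHARED_USER_KERNEL_PGD
	/* never free pgd entries covering kernel mappings */
	#define USER_PGTABLES_CEILING	TASK_SIZE
	#endif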