From 357b66fdc8ad4cea6e6336956a70742f961f0a4d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:34:34 -0500 Subject: ext4: ext4_split_extent should take care of extent zeroout When ext4_split_extent_at() ends up doing zeroout & conversion to initialized instead of split & conversion, ext4_split_extent() gets confused and can wrongly mark the extent back as uninitialized resulting in end IO code getting confused from large unwritten extents and may result in data loss. The example of problematic behavior is: lblk len lblk len ext4_split_extent() (ex=[1000,30,uninit], map=[1010,10]) ext4_split_extent_at() (split [1000,30,uninit] at 1020) ext4_ext_insert_extent() -> ENOSPC ext4_ext_zeroout() -> extent [1000,30] is now initialized ext4_split_extent_at() (split [1000,30,init] at 1010, MARK_UNINIT1 | MARK_UNINIT2) -> extent is split and parts marked as uninitialized Fix the problem by rechecking extent type after the first ext4_split_extent_at() returns. None of split_flags can not be applied to initialized extent so this patch also add BUG_ON to prevent similar issues in future. TESTCASE: https://github.com/dmonakhov/xfstests/commit/b8a55eb5ce28c6ff29e620ab090902fcd5833597 Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 372b2cbee07..bef194a1443 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2943,6 +2943,10 @@ static int ext4_split_extent_at(handle_t *handle, newblock = split - ee_block + ext4_ext_pblock(ex); BUG_ON(split < ee_block || split >= (ee_block + ee_len)); + BUG_ON(!ext4_ext_is_uninitialized(ex) && + split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT1 | + EXT4_EXT_MARK_UNINIT2)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3061,19 +3065,26 @@ static int ext4_split_extent(handle_t *handle, if (err) goto out; } - + /* + * Update path is required because previous ext4_split_extent_at() may + * result in split of original leaf or extent zeroout. + */ ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, path); if (IS_ERR(path)) return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + uninitialized = ext4_ext_is_uninitialized(ex); + split_flag1 = 0; if (map->m_lblk >= ee_block) { - split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT | - EXT4_EXT_DATA_VALID2); - if (uninitialized) + split_flag1 = split_flag & EXT4_EXT_DATA_VALID2; + if (uninitialized) { split_flag1 |= EXT4_EXT_MARK_UNINIT1; - if (split_flag & EXT4_EXT_MARK_UNINIT2) - split_flag1 |= EXT4_EXT_MARK_UNINIT2; + split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | + EXT4_EXT_MARK_UNINIT2); + } err = ext4_split_extent_at(handle, inode, path, map->m_lblk, split_flag1, flags); if (err) -- cgit v1.2.3-18-g5258 From ec22ba8edb507395c95fbc617eea26a6b2d98797 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:36:06 -0500 Subject: ext4: disable merging of uninitialized extents Derived from Jan's patch:http://permalink.gmane.org/gmane.comp.file-systems.ext4/36470 Merging of uninitialized extents creates all sorts of interesting race possibilities when writeback / DIO races with fallocate. Thus ext4_convert_unwritten_extents_endio() has to deal with a case where extent to be converted needs to be split out first. That isn't nice for two reasons: 1) It may need allocation of extent tree block so ENOSPC is possible. 2) It complicates end_io handling code So we disable merging of uninitialized extents which allows us to simplify the code. Extents will get merged after they are converted to initialized ones. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bef194a1443..60818ed1f6a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1584,10 +1584,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, unsigned short ext1_ee_len, ext2_ee_len, max_len; /* - * Make sure that either both extents are uninitialized, or - * both are _not_. + * Make sure that both extents are initialized. We don't merge + * uninitialized extents so that we can be sure that end_io code has + * the extent that was written properly split out and conversion to + * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) return 0; if (ext4_ext_is_uninitialized(ex1)) -- cgit v1.2.3-18-g5258 From ff95ec22cd7faa0d8b58dcc4207f21502df7b00b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 4 Mar 2013 00:41:05 -0500 Subject: ext4: add warning to ext4_convert_unwritten_extents_endio Splitting extents inside endio is a bad thing, but unfortunately it is still possible. In fact we are pretty close to the moment when all related issues will be fixed. Let's warn developer if it still the case. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 60818ed1f6a..265cb0e50c5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3387,8 +3387,19 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, "block %llu, max_blocks %u\n", inode->i_ino, (unsigned long long)ee_block, ee_len); - /* If extent is larger than requested then split is required */ + /* If extent is larger than requested it is a clear sign that we still + * have some extent state machine issues left. So extent_split is still + * required. + * TODO: Once all related issues will be fixed this situation should be + * illegal. + */ if (ee_block != map->m_lblk || ee_len > map->m_len) { +#ifdef EXT4_DEBUG + ext4_warning("Inode (%ld) finished: extent logical block %llu," + " len %u; IO logical block %llu, len %u\n", + inode->i_ino, (unsigned long long)ee_block, ee_len, + (unsigned long long)map->m_lblk, map->m_len); +#endif err = ext4_split_unwritten_extents(handle, inode, map, path, EXT4_GET_BLOCKS_CONVERT); if (err < 0) -- cgit v1.2.3-18-g5258 From de99fcce1da7933a90198b80a2e896754ea3bdc8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 4 Mar 2013 00:43:32 -0500 Subject: ext4: remove unnecessary wait for extent conversion in ext4_fallocate() Now that we don't merge uninitialized extents anymore, ext4_fallocate() is free to operate on the inode while there are still some extent conversions pending - it won't disturb them in any way. Reviewed-by: Zheng Liu Reviewed-by: Dmitry Monakhov Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 265cb0e50c5..25c86aaa38d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4392,8 +4392,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (len <= EXT_UNINIT_MAX_LEN << blkbits) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Prevent race condition between unwritten */ - ext4_flush_unwritten_io(inode); retry: while (ret >= 0 && ret < max_blocks) { map.m_lblk = map.m_lblk + ret; -- cgit v1.2.3-18-g5258 From cdee78433c138c2f2018a6884673739af2634787 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:08:52 -0400 Subject: ext4: fix wrong m_len value after unwritten extent conversion The ext4_ext_handle_uninitialized_extents() function was assuming the return value of ext4_ext_map_blocks() is equal to map->m_len. This incorrect assumption was harmless until we started use status tree as a extent cache because we need to update status tree according to 'm_len' value. Meanwhile this commit marks EXT4_MAP_MAPPED flag after unwritten extent conversion. It shouldn't cause a bug because we update status tree according to checking EXT4_MAP_UNWRITTEN flag. But it should be fixed. After applied this commit, the following error message from self-testing infrastructure disappears. ... kernel: ES len assertation failed for inode: 230 retval 1 != map->m_len 3 in ext4_map_blocks (allocation) ... Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 25c86aaa38d..110e85a1f82 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3650,6 +3650,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, path, map->m_len); } else err = ret; + map->m_flags |= EXT4_MAP_MAPPED; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; goto out2; } /* buffered IO case */ -- cgit v1.2.3-18-g5258 From adb2355104b2109e06ba5276485d187d023b2fd2 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:13:05 -0400 Subject: ext4: update extent status tree after an extent is zeroed out When we try to split an extent, this extent could be zeroed out and mark as initialized. But we don't know this in ext4_map_blocks because it only returns a length of allocated extent. Meanwhile we will mark this extent as uninitialized because we only check m_flags. This commit update extent status tree when we try to split an unwritten extent. We don't need to worry about the status of this extent because we always mark it as initialized. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov --- fs/ext4/extents.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 110e85a1f82..7e37018d175 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2925,7 +2925,7 @@ static int ext4_split_extent_at(handle_t *handle, { ext4_fsblk_t newblock; ext4_lblk_t ee_block; - struct ext4_extent *ex, newex, orig_ex; + struct ext4_extent *ex, newex, orig_ex, zero_ex; struct ext4_extent *ex2 = NULL; unsigned int ee_len, depth; int err = 0; @@ -2996,12 +2996,26 @@ static int ext4_split_extent_at(handle_t *handle, err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { - if (split_flag & EXT4_EXT_DATA_VALID1) + if (split_flag & EXT4_EXT_DATA_VALID1) { err = ext4_ext_zeroout(inode, ex2); - else + zero_ex.ee_block = ex2->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex2); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex2)); + } else { err = ext4_ext_zeroout(inode, ex); - } else + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(ex)); + } + } else { err = ext4_ext_zeroout(inode, &orig_ex); + zero_ex.ee_block = orig_ex.ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(&orig_ex); + ext4_ext_store_pblock(&zero_ex, + ext4_ext_pblock(&orig_ex)); + } if (err) goto fix_extent_len; @@ -3009,6 +3023,12 @@ static int ext4_split_extent_at(handle_t *handle, ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(handle, inode, path, ex); err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + goto fix_extent_len; + + /* update extent status tree */ + err = ext4_es_zeroout(inode, &zero_ex); + goto out; } else if (err) goto fix_extent_len; @@ -3150,6 +3170,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); allocated = ee_len - (map->m_lblk - ee_block); + zero_ex.ee_len = 0; trace_ext4_ext_convert_to_initialized_enter(inode, map, ex); @@ -3247,6 +3268,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = ext4_ext_zeroout(inode, ex); if (err) goto out; + zero_ex.ee_block = ex->ee_block; + zero_ex.ee_len = ext4_ext_get_actual_len(ex); + ext4_ext_store_pblock(&zero_ex, ext4_ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) @@ -3305,6 +3329,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = allocated; out: + /* If we have gotten a failure, don't zero out status tree */ + if (!err) + err = ext4_es_zeroout(inode, &zero_ex); return err ? err : allocated; } -- cgit v1.2.3-18-g5258 From 3a2256702e47f68f921dfad41b1764d05c572329 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sun, 10 Mar 2013 21:20:23 -0400 Subject: ext4: fix the wrong number of the allocated blocks in ext4_split_extent() This commit fixes a wrong return value of the number of the allocated blocks in ext4_split_extent. When the length of blocks we want to allocate is greater than the length of the current extent, we return a wrong number. Let's see what happens in the following case when we call ext4_split_extent(). map: [48, 72] ex: [32, 64, u] 'ex' will be split into two parts: ex1: [32, 47, u] ex2: [48, 64, w] 'map->m_len' is returned from this function, and the value is 24. But the real length is 16. So it should be fixed. Meanwhile in this commit we use right length of the allocated blocks when get_reserved_cluster_alloc in ext4_ext_handle_uninitialized_extents is called. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Dmitry Monakhov Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7e37018d175..69df02ff96a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3067,6 +3067,7 @@ static int ext4_split_extent(handle_t *handle, int err = 0; int uninitialized; int split_flag1, flags1; + int allocated = map->m_len; depth = ext_depth(inode); ex = path[depth].p_ext; @@ -3086,6 +3087,8 @@ static int ext4_split_extent(handle_t *handle, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; + } else { + allocated = ee_len - (map->m_lblk - ee_block); } /* * Update path is required because previous ext4_split_extent_at() may @@ -3115,7 +3118,7 @@ static int ext4_split_extent(handle_t *handle, ext4_ext_show_leaf(inode, path); out: - return err ? err : map->m_len; + return err ? err : allocated; } /* @@ -3730,6 +3733,7 @@ out: allocated - map->m_len); allocated = map->m_len; } + map->m_len = allocated; /* * If we have done fallocate with the offset that is already -- cgit v1.2.3-18-g5258 From 232ec8720d4e45405e37144c67053042c6b886d3 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sun, 10 Mar 2013 22:46:30 -0400 Subject: ext4: update reserved space after the 'correction' Currently in ext4_ext_map_blocks() in delayed allocation writeback we would update the reservation and after that check whether we claimed cluster outside of the range of the allocation and if so, we'll give the block back to the reservation pool. However this also means that if the number of reserved data block dropped to zero before the correction, we would release all the metadata reservation as well, however we might still need it because the we're not done with the delayed allocation and there might be more blocks to come. This will result in error messages such as: EXT4-fs warning (device sdb): ext4_da_update_reserve_space:361: ino 12, allocated 1 with only 0 reserved metadata blocks (releasing 1 blocks with reserved 1 data blocks) This will only happen on bigalloc file system and it can be easily reproduced using fiemap-tester from xfstests like this: ./src/fiemap-tester -m DHDHDHDHD -S -p0 /mnt/test/file Or using xfstests such as 225. Fix this by doing the correction first and updating the reservation after that so that we do not accidentally decrease i_reserved_data_blocks to zero. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 69df02ff96a..bd69e906bd9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4165,9 +4165,6 @@ got_allocated_blocks: } } else { BUG_ON(allocated_clusters < reserved_clusters); - /* We will claim quota for all newly allocated blocks.*/ - ext4_da_update_reserve_space(inode, allocated_clusters, - 1); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); int reservation = allocated_clusters - @@ -4218,6 +4215,15 @@ got_allocated_blocks: ei->i_reserved_data_blocks += reservation; spin_unlock(&ei->i_block_reservation_lock); } + /* + * We will claim quota for all newly allocated blocks. + * We're updating the reserved space *after* the + * correction above so we do not accidentally free + * all the metadata reservation because we might + * actually need it later on. + */ + ext4_da_update_reserve_space(inode, allocated_clusters, + 1); } } -- cgit v1.2.3-18-g5258 From 4f42f80a8f08d4c3f52c4267361241885d5dee3a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 12 Mar 2013 12:40:04 -0400 Subject: ext4: use s_extent_max_zeroout_kb value as number of kb Currently when converting extent to initialized, we have to decide whether to zeroout part/all of the uninitialized extent in order to avoid extent tree growing rapidly. The decision is made by comparing the size of the extent with the configurable value s_extent_max_zeroout_kb which is in kibibytes units. However when converting it to number of blocks we currently use it as it was in bytes. This is obviously bug and it will result in ext4 _never_ zeroout extents, but rather always split and convert parts to initialized while leaving the rest uninitialized in default setting. Fix this by using s_extent_max_zeroout_kb as kibibytes. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/extents.c') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bd69e906bd9..e2bb929bea9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3264,7 +3264,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, if (EXT4_EXT_MAY_ZEROOUT & split_flag) max_zeroout = sbi->s_extent_max_zeroout_kb >> - inode->i_sb->s_blocksize_bits; + (inode->i_sb->s_blocksize_bits - 10); /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { -- cgit v1.2.3-18-g5258