aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-10-08 06:36:39 +0900
committerLinus Torvalds <torvalds@linux-foundation.org>2012-10-08 06:36:39 +0900
commit6432f2128414edbea5fd4f6c4fa4c28d0e1c6151 (patch)
treed3c63c5f2f043ce52d98d8dfd3c9c0a7bc76ed95
parent1b033447bf847ba49c3816c564c9191c97456b36 (diff)
parentc278531d39f3158bfee93dc67da0b77e09776de2 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o: "The big new feature added this time is supporting online resizing using the meta_bg feature. This allows us to resize file systems which are greater than 16TB. In addition, the speed of online resizing has been improved in general. We also fix a number of races, some of which could lead to deadlocks, in ext4's Asynchronous I/O and online defrag support, thanks to good work by Dmitry Monakhov. There are also a large number of more minor bug fixes and cleanups from a number of other ext4 contributors, quite of few of which have submitted fixes for the first time." * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (69 commits) ext4: fix ext4_flush_completed_IO wait semantics ext4: fix mtime update in nodelalloc mode ext4: fix ext_remove_space for punch_hole case ext4: punch_hole should wait for DIO writers ext4: serialize truncate with owerwrite DIO workers ext4: endless truncate due to nonlocked dio readers ext4: serialize unlocked dio reads with truncate ext4: serialize dio nonlocked reads with defrag workers ext4: completed_io locking cleanup ext4: fix unwritten counter leakage ext4: give i_aiodio_unwritten a more appropriate name ext4: ext4_inode_info diet ext4: convert to use leXX_add_cpu() ext4: ext4_bread usage audit fs: reserve fallocate flag codepoint ext4: remove redundant offset check in mext_check_arguments() ext4: don't clear orphan list on ro mount with errors jbd2: fix assertion failure in commit code due to lacking transaction credits ext4: release donor reference when EXT4_IOC_MOVE_EXT ioctl fails ext4: enable FITRIM ioctl on bigalloc file system ...
-rw-r--r--Documentation/ABI/testing/sysfs-fs-ext413
-rw-r--r--Documentation/filesystems/ext4.txt10
-rw-r--r--fs/buffer.c13
-rw-r--r--fs/ext4/ext4.h49
-rw-r--r--fs/ext4/extents.c258
-rw-r--r--fs/ext4/file.c6
-rw-r--r--fs/ext4/fsync.c92
-rw-r--r--fs/ext4/ialloc.c9
-rw-r--r--fs/ext4/indirect.c18
-rw-r--r--fs/ext4/inode.c83
-rw-r--r--fs/ext4/ioctl.c22
-rw-r--r--fs/ext4/mballoc.c129
-rw-r--r--fs/ext4/mballoc.h5
-rw-r--r--fs/ext4/move_extent.c520
-rw-r--r--fs/ext4/namei.c105
-rw-r--r--fs/ext4/page-io.c176
-rw-r--r--fs/ext4/resize.c432
-rw-r--r--fs/ext4/super.c92
-rw-r--r--fs/fs-writeback.c1
-rw-r--r--fs/jbd2/commit.c40
-rw-r--r--fs/jbd2/journal.c5
-rw-r--r--fs/jbd2/recovery.c7
-rw-r--r--fs/jbd2/transaction.c65
-rw-r--r--fs/nilfs2/file.c1
-rw-r--r--include/linux/falloc.h1
-rw-r--r--include/trace/events/ext4.h242
26 files changed, 1498 insertions, 896 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
index f22ac0872ae..c631253cf85 100644
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -96,3 +96,16 @@ Contact: "Theodore Ts'o" <tytso@mit.edu>
Description:
The maximum number of megabytes the writeback code will
try to write out before move on to another inode.
+
+What: /sys/fs/ext4/<disk>/extent_max_zeroout_kb
+Date: August 2012
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The maximum number of kilobytes which will be zeroed
+ out in preference to creating a new uninitialized
+ extent when manipulating an inode's extent tree. Note
+ that using a larger value will increase the
+ variability of time necessary to complete a random
+ write operation (since a 4k random write might turn
+ into a much larger write due to the zeroout
+ operation).
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 1b7f9acbcbb..104322bf378 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -375,6 +375,16 @@ dioread_nolock locking. If the dioread_nolock option is specified
Because of the restrictions this options comprises
it is off by default (e.g. dioread_lock).
+max_dir_size_kb=n This limits the size of directories so that any
+ attempt to expand them beyond the specified
+ limit in kilobytes will cause an ENOSPC error.
+ This is useful in memory constrained
+ environments, where a very large directory can
+ cause severe performance problems or even
+ provoke the Out Of Memory killer. (For example,
+ if there is only 512mb memory available, a 176mb
+ directory may seriously cramp the system's style.)
+
i_version Enable 64-bit inode version support. This option is
off by default.
diff --git a/fs/buffer.c b/fs/buffer.c
index 58e2e7b7737..b5f044283ed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2312,12 +2312,6 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
loff_t size;
int ret;
- /*
- * Update file times before taking page lock. We may end up failing the
- * fault so this update may be superfluous but who really cares...
- */
- file_update_time(vma->vm_file);
-
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
@@ -2355,6 +2349,13 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
sb_start_pagefault(sb);
+
+ /*
+ * Update file times before taking page lock. We may end up failing the
+ * fault so this update may be superfluous but who really cares...
+ */
+ file_update_time(vma->vm_file);
+
ret = __block_page_mkwrite(vma, vmf, get_block);
sb_end_pagefault(sb);
return block_page_mkwrite_return(ret);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c3411d4ce2d..3ab2539b7b2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -186,7 +186,6 @@ struct mpage_da_data {
#define EXT4_IO_END_ERROR 0x0002
#define EXT4_IO_END_QUEUED 0x0004
#define EXT4_IO_END_DIRECT 0x0008
-#define EXT4_IO_END_IN_FSYNC 0x0010
struct ext4_io_page {
struct page *p_page;
@@ -912,9 +911,7 @@ struct ext4_inode_info {
struct list_head i_completed_io_list;
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
- /* current io_end structure for async DIO write*/
- ext4_io_end_t *cur_aio_dio;
- atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
+ atomic_t i_unwritten; /* Nr. of inflight conversions pending */
spinlock_t i_block_reservation_lock;
@@ -1233,6 +1230,7 @@ struct ext4_sb_info {
spinlock_t s_md_lock;
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
+ unsigned int s_group_info_size;
/* tunables */
unsigned long s_stripe;
@@ -1243,6 +1241,7 @@ struct ext4_sb_info {
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_writeback_mb_bump;
+ unsigned int s_max_dir_size_kb;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
@@ -1270,8 +1269,12 @@ struct ext4_sb_info {
unsigned long s_sectors_written_start;
u64 s_kbytes_written;
+ /* the size of zero-out chunk */
+ unsigned int s_extent_max_zeroout_kb;
+
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
+ ext4_group_t s_flex_groups_allocated;
/* workqueue for dio unwritten */
struct workqueue_struct *dio_unwritten_wq;
@@ -1328,10 +1331,20 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
{
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+ atomic_inc(&EXT4_I(inode)->i_unwritten);
}
}
+static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
+{
+ return inode->i_private;
+}
+
+static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
+{
+ inode->i_private = io;
+}
+
/*
* Inode dynamic state flags
*/
@@ -1345,6 +1358,8 @@ enum {
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
EXT4_STATE_NEWENTRY, /* File just added to dir */
EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
+ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
+ nolocking */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1932,7 +1947,7 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_completed_IO(struct inode *);
+extern int ext4_flush_unwritten_io(struct inode *);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1966,6 +1981,8 @@ extern void ext4_exit_mballoc(void);
extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t block,
unsigned long count, int flags);
+extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
+ ext4_group_t ngroups);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
@@ -2051,6 +2068,8 @@ extern void ext4_superblock_csum_set(struct super_block *sb,
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
extern void ext4_kvfree(void *ptr);
+extern int ext4_alloc_flex_bg_array(struct super_block *sb,
+ ext4_group_t ngroup);
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
const char *, ...);
@@ -2352,6 +2371,7 @@ extern const struct file_operations ext4_dir_operations;
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
+extern void ext4_unwritten_wait(struct inode *inode);
/* namei.c */
extern const struct inode_operations ext4_dir_inode_operations;
@@ -2400,11 +2420,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
+extern void ext4_add_complete_io(ext4_io_end_t *io_end);
extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern int ext4_end_io_nolock(ext4_io_end_t *io);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
@@ -2452,6 +2472,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+ ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+ smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+ smp_mb();
+ ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
/* For ioend & aio unwritten conversion wait queues */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index aabbb3f5368..1c94cca35ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1177,7 +1177,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
- neh->eh_depth = cpu_to_le16(le16_to_cpu(neh->eh_depth) + 1);
+ le16_add_cpu(&neh->eh_depth, 1);
ext4_mark_inode_dirty(handle, inode);
out:
brelse(bh);
@@ -1656,16 +1656,60 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
}
/*
+ * This function does a very simple check to see if we can collapse
+ * an extent tree with a single extent tree leaf block into the inode.
+ */
+static void ext4_ext_try_to_merge_up(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ size_t s;
+ unsigned max_root = ext4_ext_space_root(inode, 0);
+ ext4_fsblk_t blk;
+
+ if ((path[0].p_depth != 1) ||
+ (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
+ (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
+ return;
+
+ /*
+ * We need to modify the block allocation bitmap and the block
+ * group descriptor to release the extent tree block. If we
+ * can't get the journal credits, give up.
+ */
+ if (ext4_journal_extend(handle, 2))
+ return;
+
+ /*
+ * Copy the extent data up to the inode
+ */
+ blk = ext4_idx_pblock(path[0].p_idx);
+ s = le16_to_cpu(path[1].p_hdr->eh_entries) *
+ sizeof(struct ext4_extent_idx);
+ s += sizeof(struct ext4_extent_header);
+
+ memcpy(path[0].p_hdr, path[1].p_hdr, s);
+ path[0].p_depth = 0;
+ path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
+ (path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
+ path[0].p_hdr->eh_max = cpu_to_le16(max_root);
+
+ brelse(path[1].p_bh);
+ ext4_free_blocks(handle, inode, NULL, blk, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
+}
+
+/*
* This function tries to merge the @ex extent to neighbours in the tree.
* return 1 if merge left else 0.
*/
-static int ext4_ext_try_to_merge(struct inode *inode,
+static void ext4_ext_try_to_merge(handle_t *handle,
+ struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex) {
struct ext4_extent_header *eh;
unsigned int depth;
int merge_done = 0;
- int ret = 0;
depth = ext_depth(inode);
BUG_ON(path[depth].p_hdr == NULL);
@@ -1675,9 +1719,9 @@ static int ext4_ext_try_to_merge(struct inode *inode,
merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
if (!merge_done)
- ret = ext4_ext_try_to_merge_right(inode, path, ex);
+ (void) ext4_ext_try_to_merge_right(inode, path, ex);
- return ret;
+ ext4_ext_try_to_merge_up(handle, inode, path);
}
/*
@@ -1893,7 +1937,7 @@ has_space:
merge:
/* try to merge extents */
if (!(flag & EXT4_GET_BLOCKS_PRE_IO))
- ext4_ext_try_to_merge(inode, path, nearex);
+ ext4_ext_try_to_merge(handle, inode, path, nearex);
/* time to correct all indexes above */
@@ -1901,7 +1945,7 @@ merge:
if (err)
goto cleanup;
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
cleanup:
if (npath) {
@@ -2092,13 +2136,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
}
/*
- * ext4_ext_check_cache()
+ * ext4_ext_in_cache()
* Checks to see if the given block is in the cache.
* If it is, the cached extent is stored in the given
- * cache extent pointer. If the cached extent is a hole,
- * this routine should be used instead of
- * ext4_ext_in_cache if the calling function needs to
- * know the size of the hole.
+ * cache extent pointer.
*
* @inode: The files inode
* @block: The block to look for in the cache
@@ -2107,8 +2148,10 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
*
* Return 0 if cache is invalid; 1 if the cache is valid
*/
-static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_ext_cache *ex){
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+ struct ext4_extent *ex)
+{
struct ext4_ext_cache *cex;
struct ext4_sb_info *sbi;
int ret = 0;
@@ -2125,7 +2168,9 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
goto errout;
if (in_range(block, cex->ec_block, cex->ec_len)) {
- memcpy(ex, cex, sizeof(struct ext4_ext_cache));
+ ex->ee_block = cpu_to_le32(cex->ec_block);
+ ext4_ext_store_pblock(ex, cex->ec_start);
+ ex->ee_len = cpu_to_le16(cex->ec_len);
ext_debug("%u cached by %u:%u:%llu\n",
block,
cex->ec_block, cex->ec_len, cex->ec_start);
@@ -2138,37 +2183,6 @@ errout:
}
/*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex: Pointer where the cached extent will be stored
- * if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_extent *ex)
-{
- struct ext4_ext_cache cex;
- int ret = 0;
-
- if (ext4_ext_check_cache(inode, block, &cex)) {
- ex->ee_block = cpu_to_le32(cex.ec_block);
- ext4_ext_store_pblock(ex, cex.ec_start);
- ex->ee_len = cpu_to_le16(cex.ec_len);
- ret = 1;
- }
-
- return ret;
-}
-
-
-/*
* ext4_ext_rm_idx:
* removes index from the index block.
*/
@@ -2274,10 +2288,13 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned short ee_len = ext4_ext_get_actual_len(ex);
ext4_fsblk_t pblk;
- int flags = EXT4_FREE_BLOCKS_FORGET;
+ int flags = 0;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- flags |= EXT4_FREE_BLOCKS_METADATA;
+ flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+ else if (ext4_should_journal_data(inode))
+ flags |= EXT4_FREE_BLOCKS_FORGET;
+
/*
* For bigalloc file systems, we never free a partial cluster
* at the beginning of the extent. Instead, we make a note
@@ -2572,7 +2589,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct ext4_ext_path *path = NULL;
ext4_fsblk_t partial_cluster = 0;
handle_t *handle;
- int i = 0, err;
+ int i = 0, err = 0;
ext_debug("truncate since %u to %u\n", start, end);
@@ -2604,12 +2621,16 @@ again:
return PTR_ERR(path);
}
depth = ext_depth(inode);
+ /* Leaf not may not exist only if inode has no blocks at all */
ex = path[depth].p_ext;
if (!ex) {
- ext4_ext_drop_refs(path);
- kfree(path);
- path = NULL;
- goto cont;
+ if (depth) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_hdr == NULL",
+ depth);
+ err = -EIO;
+ }
+ goto out;
}
ee_block = le32_to_cpu(ex->ee_block);
@@ -2641,8 +2662,6 @@ again:
goto out;
}
}
-cont:
-
/*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
@@ -2924,9 +2943,9 @@ static int ext4_split_extent_at(handle_t *handle,
ext4_ext_mark_initialized(ex);
if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
- ext4_ext_try_to_merge(inode, path, ex);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
goto out;
}
@@ -2958,8 +2977,8 @@ static int ext4_split_extent_at(handle_t *handle,
goto fix_extent_len;
/* update the extent length and mark as initialized */
ex->ee_len = cpu_to_le16(ee_len);
- ext4_ext_try_to_merge(inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + depth);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
goto out;
} else if (err)
goto fix_extent_len;
@@ -3041,7 +3060,6 @@ out:
return err ? err : map->m_len;
}
-#define EXT4_EXT_ZERO_LEN 7
/*
* This function is called by ext4_ext_map_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
@@ -3067,13 +3085,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_map_blocks *map,
struct ext4_ext_path *path)
{
+ struct ext4_sb_info *sbi;
struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
struct ext4_extent zero_ex;
struct ext4_extent *ex;
ext4_lblk_t ee_block, eof_block;
unsigned int ee_len, depth;
- int allocated;
+ int allocated, max_zeroout = 0;
int err = 0;
int split_flag = 0;
@@ -3081,6 +3100,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
"block %llu, max_blocks %u\n", inode->i_ino,
(unsigned long long)map->m_lblk, map->m_len);
+ sbi = EXT4_SB(inode->i_sb);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
@@ -3180,9 +3200,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
- /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
- if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
- (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (EXT4_EXT_MAY_ZEROOUT & split_flag)
+ max_zeroout = sbi->s_extent_max_zeroout_kb >>
+ inode->i_sb->s_blocksize_bits;
+
+ /* If extent is less than s_max_zeroout_kb, zeroout directly */
+ if (max_zeroout && (ee_len <= max_zeroout)) {
err = ext4_ext_zeroout(inode, ex);
if (err)
goto out;
@@ -3191,8 +3214,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
if (err)
goto out;
ext4_ext_mark_initialized(ex);
- ext4_ext_try_to_merge(inode, path, ex);
- err = ext4_ext_dirty(handle, inode, path + depth);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
goto out;
}
@@ -3206,9 +3229,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
split_map.m_lblk = map->m_lblk;
split_map.m_len = map->m_len;
- if (allocated > map->m_len) {
- if (allocated <= EXT4_EXT_ZERO_LEN &&
- (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ if (max_zeroout && (allocated > map->m_len)) {
+ if (allocated <= max_zeroout) {
/* case 3 */
zero_ex.ee_block =
cpu_to_le32(map->m_lblk);
@@ -3220,9 +3242,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
goto out;
split_map.m_lblk = map->m_lblk;
split_map.m_len = allocated;
- } else if ((map->m_lblk - ee_block + map->m_len <
- EXT4_EXT_ZERO_LEN) &&
- (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ } else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
/* case 2 */
if (map->m_lblk != ee_block) {
zero_ex.ee_block = ex->ee_block;
@@ -3242,7 +3262,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
}
allocated = ext4_split_extent(handle, inode, path,
- &split_map, split_flag, 0);
+ &split_map, split_flag, 0);
if (allocated < 0)
err = allocated;
@@ -3256,7 +3276,7 @@ out:
* to an uninitialized extent.
*
* Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple /initialized uninitialized extents (up to three)
+ * extent into multiple initialized/uninitialized extents (up to three)
* There are three possibilities:
* a> There is no split required: Entire extent should be uninitialized
* b> Splits in two extents: Write is happening at either end of the extent
@@ -3333,10 +3353,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
/* note: ext4_ext_correct_indexes() isn't needed here because
* borders are not changed
*/
- ext4_ext_try_to_merge(inode, path, ex);
+ ext4_ext_try_to_merge(handle, inode, path, ex);
/* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
out:
ext4_ext_show_leaf(inode, path);
return err;
@@ -3600,7 +3620,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
{
int ret = 0;
int err = 0;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -3615,6 +3635,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
ret = ext4_split_unwritten_extents(handle, inode, map,
path, flags);
+ if (ret <= 0)
+ goto out;
/*
* Flag the inode(non aio case) or end_io struct (aio case)
* that this IO needs to conversion to written when IO is
@@ -3858,8 +3880,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
- ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
+ int set_unwritten = 0;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
@@ -4082,13 +4105,8 @@ got_allocated_blocks:
* For non asycn direct IO case, flag the inode state
* that we need to perform conversion when IO is done.
*/
- if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- if (io)
- ext4_set_io_unwritten_flag(inode, io);
- else
- ext4_set_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN);
- }
+ if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+ set_unwritten = 1;
if (ext4_should_dioread_nolock(inode))
map->m_flags |= EXT4_MAP_UNINIT;
}
@@ -4100,6 +4118,15 @@ got_allocated_blocks:
if (!err)
err = ext4_ext_insert_extent(handle, inode, path,
&newex, flags);
+
+ if (!err && set_unwritten) {
+ if (io)
+ ext4_set_io_unwritten_flag(inode, io);
+ else
+ ext4_set_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN);
+ }
+
if (err && free_on_err) {
int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
@@ -4241,7 +4268,7 @@ void ext4_ext_truncate(struct inode *inode)
* finish any pending end_io work so we won't run the risk of
* converting any truncated blocks to initialized later
*/
- ext4_flush_completed_IO(inode);
+ ext4_flush_unwritten_io(inode);
/*
* probably first extent we're gonna free will be last in block
@@ -4769,9 +4796,32 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
loff_t first_page_offset, last_page_offset;
int credits, err = 0;
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ offset, offset + length - 1);
+
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ /* It's not possible punch hole on append only file */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+ err = -EPERM;
+ goto out_mutex;
+ }
+ if (IS_SWAPFILE(inode)) {
+ err = -ETXTBSY;
+ goto out_mutex;
+ }
+
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
- return 0;
+ goto out_mutex;
/*
* If the hole extends beyond i_size, set the hole
@@ -4789,35 +4839,26 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
first_page_offset = first_page << PAGE_CACHE_SHIFT;
last_page_offset = last_page << PAGE_CACHE_SHIFT;
- /*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
- */
- if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- err = filemap_write_and_wait_range(mapping,
- offset, offset + length - 1);
-
- if (err)
- return err;
- }
-
/* Now release the pages */
if (last_page_offset > first_page_offset) {
truncate_pagecache_range(inode, first_page_offset,
last_page_offset - 1);
}
- /* finish any pending end_io work */
- ext4_flush_completed_IO(inode);
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ err = ext4_flush_unwritten_io(inode);
+ if (err)
+ goto out_dio;
+ inode_dio_wait(inode);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_dio;
+ }
- err = ext4_orphan_add(handle, inode);
- if (err)
- goto out;
/*
* Now we need to zero out the non-page-aligned data in the
@@ -4903,10 +4944,13 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
up_write(&EXT4_I(inode)->i_data_sem);
out:
- ext4_orphan_del(handle, inode);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
return err;
}
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3b0e3bdaabf..ca6f07afe60 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,11 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-static void ext4_aiodio_wait(struct inode *inode)
+void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
- wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
+ wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}
/*
@@ -116,7 +116,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
"performance will be poor.",
inode->i_ino, current->comm);
mutex_lock(ext4_aio_mutex(inode));
- ext4_aiodio_wait(inode);
+ ext4_unwritten_wait(inode);
}
BUG_ON(iocb->ki_pos != pos);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2a1dcea4f12..be1d89f385b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -34,87 +34,6 @@
#include <trace/events/ext4.h>
-static void dump_completed_IO(struct inode * inode)
-{
-#ifdef EXT4FS_DEBUG
- struct list_head *cur, *before, *after;
- ext4_io_end_t *io, *io0, *io1;
- unsigned long flags;
-
- if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
- ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
- return;
- }
-
- ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
- spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
- list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
- cur = &io->list;
- before = cur->prev;
- io0 = container_of(before, ext4_io_end_t, list);
- after = cur->next;
- io1 = container_of(after, ext4_io_end_t, list);
-
- ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
- io, inode->i_ino, io0, io1);
- }