From df981d03eeff7971ac7e6ff37000bfa702327ef1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 09:48:17 -0400 Subject: ext4: add max_dir_size_kb mount option Very large directories can cause significant performance problems, or perhaps even invoke the OOM killer, if the process is running in a highly constrained memory environment (whether it is VM's with a small amount of memory or in a small memory cgroup). So it is useful, in cloud server/data center environments, to be able to set a filesystem-wide cap on the maximum size of a directory, to ensure that directories never get larger than a sane size. We do this via a new mount option, max_dir_size_kb. If there is an attempt to grow the directory larger than max_dir_size_kb, the system call will return ENOSPC instead. Google-Bug-Id: 6863013 Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 59849890403..5a97e590692 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1230,6 +1230,7 @@ enum { Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, + Opt_max_dir_size_kb, }; static const match_table_t tokens = { @@ -1303,6 +1304,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, + {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1483,6 +1485,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, + {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_err, 0, 0} }; @@ -1598,6 +1601,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, if (!args->from) arg = EXT4_DEF_LI_WAIT_MULT; sbi->s_li_wait_mult = arg; + } else if (token == Opt_max_dir_size_kb) { + sbi->s_max_dir_size_kb = arg; } else if (token == Opt_stripe) { sbi->s_stripe = arg; } else if (m->flags & MOPT_DATAJ) { @@ -1829,6 +1834,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (nodefs || (test_opt(sb, INIT_INODE_TABLE) && (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); + if (nodefs || sbi->s_max_dir_size_kb) + SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); ext4_show_quota_options(seq, sb); return 0; -- cgit v1.2.3-18-g5258 From 67a5da564f97f31c4054d358e00b34d7ee570da5 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Fri, 17 Aug 2012 09:54:17 -0400 Subject: ext4: make the zero-out chunk size tunable Currently in ext4 the length of zero-out chunk is set to 7 file system blocks. But if an inode has uninitailized extents from using fallocate to preallocate space, and the workload issues many random writes, this can cause a fragmented extent tree that will unnecessarily grow the extent tree. So create a new sysfs tunable, extent_max_zeroout_kb, which controls the maximum size where blocks will be zeroed out instead of creating a new uninitialized extent. The default of this has been sent to 32kb. CC: Zach Brown CC: Andreas Dilger Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5a97e590692..0423e2e7f61 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2541,6 +2541,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); +EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); static struct attribute *ext4_attrs[] = { @@ -2556,6 +2557,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), NULL, }; @@ -3756,6 +3758,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 128; + sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode -- cgit v1.2.3-18-g5258 From 0e376b1e3ccedee49cb8cc6b652fbc1e7c15eeef Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 10:04:17 -0400 Subject: ext4: return an error if kset_create_and_add fails in ext4_init_fs() In the very unlikely case that kset_create_and_add() fails when the ext4.ko module is being loaded (or during kernel startup) set err so that it's clear that the module load failed. https://bugzilla.kernel.org/show_bug.cgi?id=27912 Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0423e2e7f61..3ab798d00f0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5285,8 +5285,10 @@ static int __init ext4_init_fs(void) if (err) goto out6; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); - if (!ext4_kset) + if (!ext4_kset) { + err = -ENOMEM; goto out5; + } ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = ext4_init_feat_adverts(); -- cgit v1.2.3-18-g5258 From 07724f98978ad39386a3996a4e1b6780489a4dda Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 17 Aug 2012 19:08:42 -0400 Subject: ext4: drop lock_super()/unlock_super() We don't need lock_super()/unlock_super() any more, since the places where it is used, we are protected by the s_umount r/w semaphore. Signed-off-by: "Theodore Ts'o" Cc: Marco Stornelli --- fs/ext4/super.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ab798d00f0..bae4124e147 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -861,7 +861,6 @@ static void ext4_put_super(struct super_block *sb) flush_workqueue(sbi->dio_unwritten_wq); destroy_workqueue(sbi->dio_unwritten_wq); - lock_super(sb); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -928,7 +927,6 @@ static void ext4_put_super(struct super_block *sb) * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. */ - unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); if (sbi->s_chksum_driver) @@ -4535,11 +4533,9 @@ static int ext4_unfreeze(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return 0; - lock_super(sb); /* Reset the needs_recovery flag before the fs is unlocked. */ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); - unlock_super(sb); return 0; } @@ -4575,7 +4571,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) char *orig_data = kstrdup(data, GFP_KERNEL); /* Store the original options */ - lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_mount_opt2 = sbi->s_mount_opt2; @@ -4717,7 +4712,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal == NULL) ext4_commit_super(sb, 1); - unlock_super(sb); #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) @@ -4730,10 +4724,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { err = ext4_enable_quotas(sb); - if (err) { - lock_super(sb); + if (err) goto restore_opts; - } } } #endif @@ -4760,7 +4752,6 @@ restore_opts: sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif - unlock_super(sb); kfree(orig_data); return err; } -- cgit v1.2.3-18-g5258 From caecd0af8fe0635e8e6900399b951433af35bf52 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Sat, 18 Aug 2012 22:29:18 -0400 Subject: ext4: replace plain integer with NULL in super.c Fixes the following sparse warning: fs/ext4/super.c:1672:45: warning: Using plain integer as NULL pointer Signed-off-by: Sachin Kamat Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bae4124e147..b875ff55586 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1673,7 +1673,7 @@ static int parse_options(char *options, struct super_block *sb, * Initialize args struct so we know whether arg was * found; some options take optional arguments. */ - args[0].to = args[0].from = 0; + args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); if (handle_mount_opt(sb, p, token, args, journal_devnum, journal_ioprio, is_remount) < 0) -- cgit v1.2.3-18-g5258 From 117fff10d7f140e12dd43df20d3f9fda80577460 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 5 Sep 2012 01:29:50 -0400 Subject: ext4: grow the s_flex_groups array as needed when resizing Previously, we allocated the s_flex_groups array to the maximum size that the file system could be resized. There was two problems with this approach. First, it wasted memory in the common case where the file system was not resized. Secondly, once we start allowing online resizing using the meta_bg scheme, there is no maximum size that the file system can be resized. So instead, we need to grow the s_flex_groups at inline resize time. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 48 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b875ff55586..b8de488889d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1925,15 +1925,45 @@ done: return res; } +int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct flex_groups *new_groups; + int size; + + if (!sbi->s_log_groups_per_flex) + return 0; + + size = ext4_flex_group(sbi, ngroup - 1) + 1; + if (size <= sbi->s_flex_groups_allocated) + return 0; + + size = roundup_pow_of_two(size * sizeof(struct flex_groups)); + new_groups = ext4_kvzalloc(size, GFP_KERNEL); + if (!new_groups) { + ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", + size / (int) sizeof(struct flex_groups)); + return -ENOMEM; + } + + if (sbi->s_flex_groups) { + memcpy(new_groups, sbi->s_flex_groups, + (sbi->s_flex_groups_allocated * + sizeof(struct flex_groups))); + ext4_kvfree(sbi->s_flex_groups); + } + sbi->s_flex_groups = new_groups; + sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); + return 0; +} + static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; - ext4_group_t flex_group_count; ext4_group_t flex_group; unsigned int groups_per_flex = 0; - size_t size; - int i; + int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { @@ -1942,17 +1972,9 @@ static int ext4_fill_flex_info(struct super_block *sb) } groups_per_flex = 1 << sbi->s_log_groups_per_flex; - /* We allocate both existing and potentially added groups */ - flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + - ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << - EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; - size = flex_group_count * sizeof(struct flex_groups); - sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); - if (sbi->s_flex_groups == NULL) { - ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", - flex_group_count); + err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); + if (err) goto failed; - } for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); -- cgit v1.2.3-18-g5258 From 5e7bbef19c8385895cb21c41a88bd937902e6316 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 13 Sep 2012 12:11:40 -0400 Subject: ext4: advertise the fact that the kernel supports meta_bg resizing Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b8de488889d..eb7722ab771 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2585,10 +2585,12 @@ static struct attribute *ext4_attrs[] = { /* Features this copy of ext4 supports */ EXT4_INFO_ATTR(lazy_itable_init); EXT4_INFO_ATTR(batched_discard); +EXT4_INFO_ATTR(meta_bg_resize); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), + ATTR_LIST(meta_bg_resize), NULL, }; -- cgit v1.2.3-18-g5258 From bc0b75f77a944b482293972eb8fd5c88c576eb46 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 17 Sep 2012 22:54:36 -0400 Subject: ext4: do not enable delalloc by default for ext2 Signed-off-by: Brian Foster Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eb7722ab771..e6784b3276b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3411,7 +3411,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * enable delayed allocation by default * Use -o nodelalloc to turn it off */ - if (!IS_EXT3_SB(sb) && + if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); -- cgit v1.2.3-18-g5258 From 50df9fd55e4271e89a7adf3b1172083dd0ca199d Mon Sep 17 00:00:00 2001 From: Herton Ronaldo Krzesinski Date: Sun, 23 Sep 2012 22:49:12 -0400 Subject: ext4: fix crash when accessing /proc/mounts concurrently The crash was caused by a variable being erronously declared static in token2str(). In addition to /proc/mounts, the problem can also be easily replicated by accessing /proc/fs/ext4//options in parallel: $ cat /proc/fs/ext4//options > options.txt ... and then running the following command in two different terminals: $ while diff /proc/fs/ext4//options options.txt; do true; done This is also the cause of the following a crash while running xfstests #234, as reported in the following bug reports: https://bugs.launchpad.net/bugs/1053019 https://bugzilla.kernel.org/show_bug.cgi?id=47731 Signed-off-by: Herton Ronaldo Krzesinski Signed-off-by: "Theodore Ts'o" Cc: Brad Figg Cc: stable@vger.kernel.org --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e6784b3276b..7bef0a4bc51 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1749,7 +1749,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq, static const char *token2str(int token) { - static const struct match_token *t; + const struct match_token *t; for (t = tokens; t->token != Opt_err; t++) if (t->token == token && !strchr(t->pattern, '=')) -- cgit v1.2.3-18-g5258 From c25f9bc6143f4cb4dc31d2ad7a6fe4e4005fc414 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 26 Sep 2012 23:30:12 -0400 Subject: ext4: don't clear orphan list on ro mount with errors If the file system contains errors and it is being mounted read-only, don't clear the orphan list. We should minimize changes to the file system if it is mounted read-only. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7bef0a4bc51..a53a23aef5e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2177,10 +2177,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, } if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - if (es->s_last_orphan) + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { jbd_debug(1, "Errors on filesystem, " "clearing orphan list.\n"); - es->s_last_orphan = 0; + es->s_last_orphan = 0; + } jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); return; } -- cgit v1.2.3-18-g5258 From ba39ebb61401cfe0ccd58dd0cd4da88465528c0a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 27 Sep 2012 09:37:53 -0400 Subject: ext4: convert to use leXX_add_cpu() Convert cpu_to_leXX(leXX_to_cpu(E1) + E2) to use leXX_add_cpu(). dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a53a23aef5e..ae456321c5b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -431,7 +431,7 @@ static void __save_error_info(struct super_block *sb, const char *func, */ if (!es->s_error_count) mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); - es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); + le32_add_cpu(&es->s_error_count, 1); } static void save_error_info(struct super_block *sb, const char *func, -- cgit v1.2.3-18-g5258 From f45ee3a1ea438af96e4fd2c0b16d195e67ef235f Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 28 Sep 2012 23:21:09 -0400 Subject: ext4: ext4_inode_info diet Generic inode has unused i_private pointer which may be used as cur_aio_dio storage. TODO: If cur_aio_dio will be passed as an argument to get_block_t this allow to have concurent AIO_DIO requests. Reviewed-by: Zheng Liu Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ae456321c5b..e05267ab1f8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -965,7 +965,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_completed_io_list); spin_lock_init(&ei->i_completed_io_lock); - ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); -- cgit v1.2.3-18-g5258 From e27f41e1b789e60e7d8cc9c81fd93ca49ef31f13 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 28 Sep 2012 23:24:52 -0400 Subject: ext4: give i_aiodio_unwritten a more appropriate name AIO/DIO prefix is wrong because it account unwritten extents which also may be scheduled from buffered write endio Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4/super.c') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e05267ab1f8..982f6fc22c8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -968,7 +968,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_ioend_count, 0); - atomic_set(&ei->i_aiodio_unwritten, 0); + atomic_set(&ei->i_unwritten, 0); return &ei->vfs_inode; } -- cgit v1.2.3-18-g5258