From 74d553aad7926ed05e05d9d5cff516a7b31375fc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 12:39:17 -0400 Subject: ext4: collapse handling of data=ordered and data=writeback codepaths The only difference between how we handle data=ordered and data=writeback is a single call to ext4_jbd2_file_inode(). Eliminate code duplication by factoring out redundant the code paths. Signed-off-by: "Theodore Ts'o" Reviewed-by: Lukas Czerner --- include/trace/events/ext4.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 4ee47100385..58459b78556 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -257,15 +257,7 @@ DECLARE_EVENT_CLASS(ext4__write_end, __entry->pos, __entry->len, __entry->copied) ); -DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, - - TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, - unsigned int copied), - - TP_ARGS(inode, pos, len, copied) -); - -DEFINE_EVENT(ext4__write_end, ext4_writeback_write_end, +DEFINE_EVENT(ext4__write_end, ext4_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), -- cgit v1.2.3-18-g5258 From d76a3a77113db020d9bb1e894822869410450bd9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 22:02:52 -0400 Subject: ext4/jbd2: don't wait (forever) for stale tid caused by wraparound In the case where an inode has a very stale transaction id (tid) in i_datasync_tid or i_sync_tid, it's possible that after a very large (2**31) number of transactions, that the tid number space might wrap, causing tid_geq()'s calculations to fail. Commit deeeaf13 "jbd2: fix fsync() tid wraparound bug", later modified by commit e7b04ac0 "jbd2: don't wake kjournald unnecessarily", attempted to fix this problem, but it only avoided kjournald spinning forever by fixing the logic in jbd2_log_start_commit(). Unfortunately, in the codepaths in fs/ext4/fsync.c and fs/ext4/inode.c that might call jbd2_log_start_commit() with a stale tid, those functions will subsequently call jbd2_log_wait_commit() with the same stale tid, and then wait for a very long time. To fix this, we replace the calls to jbd2_log_start_commit() and jbd2_log_wait_commit() with a call to a new function, jbd2_complete_transaction(), which will correctly handle stale tid's. As a bonus, jbd2_complete_transaction() will avoid locking j_state_lock for writing unless a commit needs to be started. This should have a small (but probably not measurable) improvement for ext4's scalability. Signed-off-by: "Theodore Ts'o" Reported-by: Ben Hutchings Reported-by: George Barnett Cc: stable@vger.kernel.org --- include/linux/jbd2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 50e5a5e6a71..f0289754b46 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1200,6 +1200,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); +int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); -- cgit v1.2.3-18-g5258 From 794446c6946513c684d448205fbd76fa35f38b72 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 3 Apr 2013 22:06:52 -0400 Subject: jbd2: fix race between jbd2_journal_remove_checkpoint and ->j_commit_callback The following race is possible: [kjournald2] other_task jbd2_journal_commit_transaction() j_state = T_FINISHED; spin_unlock(&journal->j_list_lock); ->jbd2_journal_remove_checkpoint() ->jbd2_journal_free_transaction(); ->kmem_cache_free(transaction) ->j_commit_callback(journal, transaction); -> USE_AFTER_FREE WARNING: at lib/list_debug.c:62 __list_del_entry+0x1c0/0x250() Hardware name: list_del corruption. prev->next should be ffff88019a4ec198, but was 6b6b6b6b6b6b6b6b Modules linked in: cpufreq_ondemand acpi_cpufreq freq_table mperf coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode sg xhci_hcd button sd_mod crc_t10dif aesni_intel ablk_helper cryptd lrw aes_x86_64 xts gf128mul ahci libahci pata_acpi ata_generic dm_mirror dm_region_hash dm_log dm_mod Pid: 16400, comm: jbd2/dm-1-8 Tainted: G W 3.8.0-rc3+ #107 Call Trace: [] warn_slowpath_common+0xad/0xf0 [] warn_slowpath_fmt+0x46/0x50 [] ? ext4_journal_commit_callback+0x99/0xc0 [] __list_del_entry+0x1c0/0x250 [] ext4_journal_commit_callback+0x6f/0xc0 [] jbd2_journal_commit_transaction+0x23a6/0x2570 [] ? try_to_del_timer_sync+0x82/0xa0 [] ? del_timer_sync+0x91/0x1e0 [] kjournald2+0x19f/0x6a0 [] ? wake_up_bit+0x40/0x40 [] ? bit_spin_lock+0x80/0x80 [] kthread+0x10e/0x120 [] ? __init_kthread_worker+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? __init_kthread_worker+0x70/0x70 In order to demonstrace this issue one should mount ext4 with mount -o discard option on SSD disk. This makes callback longer and race window becomes wider. In order to fix this we should mark transaction as finished only after callbacks have completed Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- include/linux/jbd2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f0289754b46..f9fe88957b7 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -480,6 +480,7 @@ struct transaction_s T_COMMIT, T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, + T_COMMIT_CALLBACK, T_FINISHED } t_state; -- cgit v1.2.3-18-g5258 From d6a771056b32146da1280f7872f6936b0c7770ea Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 9 Apr 2013 23:59:55 -0400 Subject: ext4: fix miscellaneous big endian warnings None of these result in any bug, but they makes sparse complain. Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 58459b78556..d0e686402df 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1948,7 +1948,7 @@ TRACE_EVENT(ext4_remove_blocks, __entry->to = to; __entry->partial = partial_cluster; __entry->ee_pblk = ext4_ext_pblock(ex); - __entry->ee_lblk = cpu_to_le32(ex->ee_block); + __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_len = ext4_ext_get_actual_len(ex); ), @@ -2052,7 +2052,7 @@ TRACE_EVENT(ext4_ext_remove_space, TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, int depth, - ext4_lblk_t partial, unsigned short eh_entries), + ext4_lblk_t partial, __le16 eh_entries), TP_ARGS(inode, start, depth, partial, eh_entries), @@ -2071,7 +2071,7 @@ TRACE_EVENT(ext4_ext_remove_space_done, __entry->start = start; __entry->depth = depth; __entry->partial = partial; - __entry->eh_entries = eh_entries; + __entry->eh_entries = le16_to_cpu(eh_entries); ), TP_printk("dev %d,%d ino %lu since %u depth %d partial %u " -- cgit v1.2.3-18-g5258 From ae4647fb7654676fc44a97e86eb35f9f06b99f66 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 12 Apr 2013 00:03:42 -0400 Subject: jbd2: reduce journal_head size Remove unused t_cow_tid field (ext4 copy-on-write support doesn't seem to be happening) and change b_modified and b_jlist to bitfields thus saving 8 bytes in the structure. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Reviewed-by: Zheng Liu --- include/linux/journal-head.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h index c18b46f8aee..13a3da25ff0 100644 --- a/include/linux/journal-head.h +++ b/include/linux/journal-head.h @@ -31,21 +31,14 @@ struct journal_head { /* * Journalling list for this buffer [jbd_lock_bh_state()] */ - unsigned b_jlist; + unsigned b_jlist:4; /* * This flag signals the buffer has been modified by * the currently running transaction * [jbd_lock_bh_state()] */ - unsigned b_modified; - - /* - * This feild tracks the last transaction id in which this buffer - * has been cowed - * [jbd_lock_bh_state()] - */ - tid_t b_cow_tid; + unsigned b_modified:1; /* * Copy of the buffer data frozen for writing to the log. -- cgit v1.2.3-18-g5258 From 28daf4fae8693d4a285123494899fe01950cba50 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Fri, 19 Apr 2013 17:49:23 -0400 Subject: jbd2: use kmem_cache_zalloc instead of kmem_cache_alloc/memset The jbd2_alloc_handle() function is only called by new_handle(). So this commit uses kmem_cache_zalloc() instead of kmem_cache_alloc()/memset(). Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f9fe88957b7..6e051f472ed 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1145,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache; static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags) { - return kmem_cache_alloc(jbd2_handle_cache, gfp_flags); + return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags); } static inline void jbd2_free_handle(handle_t *handle) -- cgit v1.2.3-18-g5258 From 877f962c5edacfef60ab21cfed6d8d54ce25b8a6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 20 Apr 2013 19:58:37 -0400 Subject: buffer: add BH_Prio and BH_Meta flags Add buffer_head flags so that buffer cache writebacks can be marked with the the appropriate request flags, so that metadata blocks can be marked appropriately in blktrace. Signed-off-by: "Theodore Ts'o" --- include/linux/buffer_head.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5afc4f94d11..33c0f8103fe 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -34,6 +34,8 @@ enum bh_state_bits { BH_Write_EIO, /* I/O error on write */ BH_Unwritten, /* Buffer is allocated on disk but not written */ BH_Quiet, /* Buffer Error Prinks to be quiet */ + BH_Meta, /* Buffer contains metadata */ + BH_Prio, /* Buffer should be submitted with REQ_PRIO */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities @@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) BUFFER_FNS(Write_EIO, write_io_error) BUFFER_FNS(Unwritten, unwritten) +BUFFER_FNS(Meta, meta) +BUFFER_FNS(Prio, prio) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) -- cgit v1.2.3-18-g5258 From f783f091e49ce4896e6b026af82d76e0537c6089 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 21 Apr 2013 16:47:54 -0400 Subject: jbd2: trace when lock_buffer in do_get_write_access takes a long time While investigating interactivity problems it was clear that processes sometimes stall for long periods of times if an attempt is made to lock a buffer which is undergoing writeback. It would stall in a trace looking something like [] __lock_buffer+0x2e/0x30 [] do_get_write_access+0x43f/0x4b0 [] jbd2_journal_get_write_access+0x2b/0x50 [] __ext4_journal_get_write_access+0x39/0x80 [] ext4_reserve_inode_write+0x78/0xa0 [] ext4_mark_inode_dirty+0x49/0x220 [] ext4_dirty_inode+0x41/0x60 [] __mark_inode_dirty+0x4e/0x2d0 [] update_time+0x79/0xc0 [] file_update_time+0x98/0x100 [] __generic_file_aio_write+0x17c/0x3b0 [] generic_file_aio_write+0x7a/0xf0 [] ext4_file_write+0x83/0xd0 [] do_sync_write+0xa3/0xe0 [] vfs_write+0xae/0x180 [] sys_write+0x4d/0x90 [] system_call_fastpath+0x1a/0x1f [] 0xffffffffffffffff Signed-off-by: Mel Gorman Signed-off-by: "Theodore Ts'o" --- include/trace/events/jbd2.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 070df49e4a1..c1d1f3eb242 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -358,6 +358,27 @@ TRACE_EVENT(jbd2_write_superblock, MINOR(__entry->dev), __entry->write_op) ); +TRACE_EVENT(jbd2_lock_buffer_stall, + + TP_PROTO(dev_t dev, unsigned long stall_ms), + + TP_ARGS(dev, stall_ms), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field(unsigned long, stall_ms ) + ), + + TP_fast_assign( + __entry->dev = dev; + __entry->stall_ms = stall_ms; + ), + + TP_printk("dev %d,%d stall_ms %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->stall_ms) +); + #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ -- cgit v1.2.3-18-g5258