diff options
author | Paul Mackerras <paulus@samba.org> | 2008-01-31 11:25:51 +1100 |
---|---|---|
committer | Paul Mackerras <paulus@samba.org> | 2008-01-31 11:25:51 +1100 |
commit | bd45ac0c5daae35e7c71138172e63df5cf644cf6 (patch) | |
tree | 5eb5a599bf6a9d7a8a34e802db932aa9e9555de4 /fs/jbd2 | |
parent | 4eece4ccf997c0e6d8fdad3d842e37b16b8d705f (diff) | |
parent | 5bdeae46be6dfe9efa44a548bd622af325f4bdb4 (diff) |
Merge branch 'linux-2.6'
Diffstat (limited to 'fs/jbd2')
-rw-r--r-- | fs/jbd2/checkpoint.c | 25 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 257 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 368 | ||||
-rw-r--r-- | fs/jbd2/recovery.c | 151 | ||||
-rw-r--r-- | fs/jbd2/revoke.c | 6 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 34 |
6 files changed, 759 insertions, 82 deletions
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 3fccde7ba00..6914598022c 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -232,7 +232,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count) + struct buffer_head **bhs, int *batch_count, + transaction_t *transaction) { struct buffer_head *bh = jh2bh(jh); int ret = 0; @@ -250,6 +251,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, transaction_t *t = jh->b_transaction; tid_t tid = t->t_tid; + transaction->t_chp_stats.cs_forced_to_close++; spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); jbd2_log_start_commit(journal, tid); @@ -279,6 +281,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, bhs[*batch_count] = bh; __buffer_relink_io(jh); jbd_unlock_bh_state(bh); + transaction->t_chp_stats.cs_written++; (*batch_count)++; if (*batch_count == NR_BATCH) { spin_unlock(&journal->j_list_lock); @@ -322,6 +325,8 @@ int jbd2_log_do_checkpoint(journal_t *journal) if (!journal->j_checkpoint_transactions) goto out; transaction = journal->j_checkpoint_transactions; + if (transaction->t_chp_stats.cs_chp_time == 0) + transaction->t_chp_stats.cs_chp_time = jiffies; this_tid = transaction->t_tid; restart: /* @@ -346,8 +351,10 @@ restart: retry = 1; break; } - retry = __process_buffer(journal, jh, bhs,&batch_count); - if (!retry && lock_need_resched(&journal->j_list_lock)){ + retry = __process_buffer(journal, jh, bhs, &batch_count, + transaction); + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { spin_unlock(&journal->j_list_lock); retry = 1; break; @@ -602,15 +609,15 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) /* * There is one special case to worry about: if we have just pulled the - * buffer off a committing transaction's forget list, then even if the - * checkpoint list is empty, the transaction obviously cannot be - * dropped! + * buffer off a running or committing transaction's checkpoing list, + * then even if the checkpoint list is empty, the transaction obviously + * cannot be dropped! * - * The locking here around j_committing_transaction is a bit sleazy. + * The locking here around t_state is a bit sleazy. * See the comment at the end of jbd2_journal_commit_transaction(). */ - if (transaction == journal->j_committing_transaction) { - JBUFFER_TRACE(jh, "belongs to committing transaction"); + if (transaction->t_state != T_FINISHED) { + JBUFFER_TRACE(jh, "belongs to running/committing transaction"); goto out; } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6986f334c64..4f302d27927 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -20,6 +20,8 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/pagemap.h> +#include <linux/jiffies.h> +#include <linux/crc32.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -92,19 +94,23 @@ static int inverted_lock(journal_t *journal, struct buffer_head *bh) return 1; } -/* Done it all: now write the commit record. We should have +/* + * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort * mode we can now just skip the rest of the journal write * entirely. * * Returns 1 if the journal needs to be aborted or 0 on success */ -static int journal_write_commit_record(journal_t *journal, - transaction_t *commit_transaction) +static int journal_submit_commit_record(journal_t *journal, + transaction_t *commit_transaction, + struct buffer_head **cbh, + __u32 crc32_sum) { struct journal_head *descriptor; + struct commit_header *tmp; struct buffer_head *bh; - int i, ret; + int ret; int barrier_done = 0; if (is_journal_aborted(journal)) @@ -116,21 +122,33 @@ static int journal_write_commit_record(journal_t *journal, bh = jh2bh(descriptor); - /* AKPM: buglet - add `i' to tmp! */ - for (i = 0; i < bh->b_size; i += 512) { - journal_header_t *tmp = (journal_header_t*)bh->b_data; - tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); - tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp = (struct commit_header *)bh->b_data; + tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); + tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); + tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + tmp->h_chksum_type = JBD2_CRC32_CHKSUM; + tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; + tmp->h_chksum[0] = cpu_to_be32(crc32_sum); } - JBUFFER_TRACE(descriptor, "write commit block"); + JBUFFER_TRACE(descriptor, "submit commit block"); + lock_buffer(bh); + set_buffer_dirty(bh); - if (journal->j_flags & JBD2_BARRIER) { + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + + if (journal->j_flags & JBD2_BARRIER && + !JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { set_buffer_ordered(bh); barrier_done = 1; } - ret = sync_dirty_buffer(bh); + ret = submit_bh(WRITE, bh); + /* is it possible for another commit to fail at roughly * the same time as this one? If so, we don't want to * trust the barrier flag in the super, but instead want @@ -151,14 +169,72 @@ static int journal_write_commit_record(journal_t *journal, clear_buffer_ordered(bh); set_buffer_uptodate(bh); set_buffer_dirty(bh); - ret = sync_dirty_buffer(bh); + ret = submit_bh(WRITE, bh); } - put_bh(bh); /* One for getblk() */ - jbd2_journal_put_journal_head(descriptor); + *cbh = bh; + return ret; +} + +/* + * This function along with journal_submit_commit_record + * allows to write the commit record asynchronously. + */ +static int journal_wait_on_commit_record(struct buffer_head *bh) +{ + int ret = 0; + + clear_buffer_dirty(bh); + wait_on_buffer(bh); + + if (unlikely(!buffer_uptodate(bh))) + ret = -EIO; + put_bh(bh); /* One for getblk() */ + jbd2_journal_put_journal_head(bh2jh(bh)); - return (ret == -EIO); + return ret; } +/* + * Wait for all submitted IO to complete. + */ +static int journal_wait_on_locked_list(journal_t *journal, + transaction_t *commit_transaction) +{ + int ret = 0; + struct journal_head *jh; + + while (commit_transaction->t_locked_list) { + struct buffer_head *bh; + + jh = commit_transaction->t_locked_list->b_tprev; + bh = jh2bh(jh); + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + ret = -EIO; + spin_lock(&journal->j_list_lock); + } + if (!inverted_lock(journal, bh)) { + put_bh(bh); + spin_lock(&journal->j_list_lock); + continue; + } + if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { + __jbd2_journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + jbd2_journal_remove_journal_head(bh); + put_bh(bh); + } else { + jbd_unlock_bh_state(bh); + } + put_bh(bh); + cond_resched_lock(&journal->j_list_lock); + } + return ret; + } + static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) { int i; @@ -265,7 +341,7 @@ write_out_data: put_bh(bh); } - if (lock_need_resched(&journal->j_list_lock)) { + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { spin_unlock(&journal->j_list_lock); goto write_out_data; } @@ -274,7 +350,21 @@ write_out_data: journal_do_submit_data(wbuf, bufs); } -static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, +static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) +{ + struct page *page = bh->b_page; + char *addr; + __u32 checksum; + + addr = kmap_atomic(page, KM_USER0); + checksum = crc32_be(crc32_sum, + (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); + kunmap_atomic(addr, KM_USER0); + + return checksum; +} + +static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, unsigned long long block) { tag->t_blocknr = cpu_to_be32(block & (u32)~0); @@ -290,6 +380,7 @@ static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag, */ void jbd2_journal_commit_transaction(journal_t *journal) { + struct transaction_stats_s stats; transaction_t *commit_transaction; struct journal_head *jh, *new_jh, *descriptor; struct buffer_head **wbuf = journal->j_wbuf; @@ -305,6 +396,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) int tag_flag; int i; int tag_bytes = journal_tag_bytes(journal); + struct buffer_head *cbh = NULL; /* For transactional checksums */ + __u32 crc32_sum = ~0; /* * First job: lock down the current transaction and wait for @@ -337,6 +430,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + stats.u.run.rs_wait = commit_transaction->t_max_wait; + stats.u.run.rs_locked = jiffies; + stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, + stats.u.run.rs_locked); + spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -407,6 +505,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_switch_revoke_table(journal); + stats.u.run.rs_flushing = jiffies; + stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, + stats.u.run.rs_flushing); + commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; @@ -440,38 +542,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) journal_submit_data_buffers(journal, commit_transaction); /* - * Wait for all previously submitted IO to complete. + * Wait for all previously submitted IO to complete if commit + * record is to be written synchronously. */ spin_lock(&journal->j_list_lock); - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + err = journal_wait_on_locked_list(journal, + commit_transaction); - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; - spin_lock(&journal->j_list_lock); - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } - put_bh(bh); - cond_resched_lock(&journal->j_list_lock); - } spin_unlock(&journal->j_list_lock); if (err) @@ -498,6 +577,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ commit_transaction->t_state = T_COMMIT; + stats.u.run.rs_logging = jiffies; + stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, + stats.u.run.rs_logging); + stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.u.run.rs_blocks_logged = 0; + descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -639,6 +724,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; + /* + * Compute checksum. + */ + if (JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + crc32_sum = + jbd2_checksum_data(crc32_sum, bh); + } + lock_buffer(bh); clear_buffer_dirty(bh); set_buffer_uptodate(bh); @@ -646,6 +740,7 @@ start_journal_io: submit_bh(WRITE, bh); } cond_resched(); + stats.u.run.rs_blocks_logged += bufs; /* Force a new descriptor to be generated next time round the loop. */ @@ -654,6 +749,23 @@ start_journal_io: } } + /* Done it all: now write the commit record asynchronously. */ + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + + spin_lock(&journal->j_list_lock); + err = journal_wait_on_locked_list(journal, + commit_transaction); + spin_unlock(&journal->j_list_lock); + if (err) + __jbd2_journal_abort_hard(journal); + } + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -753,8 +865,14 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 6\n"); - if (journal_write_commit_record(journal, commit_transaction)) - err = -EIO; + if (!JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, + &cbh, crc32_sum); + if (err) + __jbd2_journal_abort_hard(journal); + } + err = journal_wait_on_commit_record(cbh); if (err) jbd2_journal_abort(journal, err); @@ -816,6 +934,7 @@ restart_loop: cp_transaction = jh->b_cp_transaction; if (cp_transaction) { JBUFFER_TRACE(jh, "remove from old cp transaction"); + cp_transaction->t_chp_stats.cs_dropped++; __jbd2_journal_remove_checkpoint(jh); } @@ -867,10 +986,10 @@ restart_loop: } spin_unlock(&journal->j_list_lock); /* - * This is a bit sleazy. We borrow j_list_lock to protect - * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint. - * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but - * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __jbd2_journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); @@ -890,6 +1009,36 @@ restart_loop: J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_start = jiffies; + stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, + commit_transaction->t_start); + + /* + * File the transaction for history + */ + stats.ts_type = JBD2_STATS_RUN; + stats.ts_tid = commit_transaction->t_tid; + stats.u.run.rs_handle_count = commit_transaction->t_handle_count; + spin_lock(&journal->j_history_lock); + memcpy(journal->j_history + journal->j_history_cur, &stats, + sizeof(stats)); + if (++journal->j_history_cur == journal->j_history_max) + journal->j_history_cur = 0; + + /* + * Calculate overall stats + */ + journal->j_stats.ts_tid++; + journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; + journal->j_stats.u.run.rs_running += stats.u.run.rs_running; + journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; + journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; + journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; + journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; + journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; + journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); + commit_transaction->t_state = T_FINISHED; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 6ddc5531587..96ba846992e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -36,6 +36,7 @@ #include <linux/poison.h> #include <linux/proc_fs.h> #include <linux/debugfs.h> +#include <linux/seq_file.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -640,6 +641,312 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return jbd2_journal_add_journal_head(bh); } +struct jbd2_stats_proc_session { + journal_t *journal; + struct transaction_stats_s *stats; + int start; + int max; +}; + +static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s, + struct transaction_stats_s *ts, + int first) +{ + if (ts == s->stats + s->max) + ts = s->stats; + if (!first && ts == s->stats + s->start) + return NULL; + while (ts->ts_type == 0) { + ts++; + if (ts == s->stats + s->max) + ts = s->stats; + if (ts == s->stats + s->start) + return NULL; + } + return ts; + +} + +static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos) +{ + struct jbd2_stats_proc_session *s = seq->private; + struct transaction_stats_s *ts; + int l = *pos; + + if (l == 0) + return SEQ_START_TOKEN; + ts = jbd2_history_skip_empty(s, s->stats + s->start, 1); + if (!ts) + return NULL; + l--; + while (l) { + ts = jbd2_history_skip_empty(s, ++ts, 0); + if (!ts) + break; + l--; + } + return ts; +} + +static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct jbd2_stats_proc_session *s = seq->private; + struct transaction_stats_s *ts = v; + + ++*pos; + if (v == SEQ_START_TOKEN) + return jbd2_history_skip_empty(s, s->stats + s->start, 1); + else + return jbd2_history_skip_empty(s, ++ts, 0); +} + +static int jbd2_seq_history_show(struct seq_file *seq, void *v) +{ + struct transaction_stats_s *ts = v; + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s " + "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid", + "wait", "run", "lock", "flush", "log", "hndls", + "block", "inlog", "ctime", "write", "drop", + "close"); + return 0; + } + if (ts->ts_type == JBD2_STATS_RUN) + seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u " + "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid, + jiffies_to_msecs(ts->u.run.rs_wait), + jiffies_to_msecs(ts->u.run.rs_running), + jiffies_to_msecs(ts->u.run.rs_locked), + jiffies_to_msecs(ts->u.run.rs_flushing), + jiffies_to_msecs(ts->u.run.rs_logging), + ts->u.run.rs_handle_count, + ts->u.run.rs_blocks, + ts->u.run.rs_blocks_logged); + else if (ts->ts_type == JBD2_STATS_CHECKPOINT) + seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n", + "C", ts->ts_tid, " ", + jiffies_to_msecs(ts->u.chp.cs_chp_time), + ts->u.chp.cs_written, ts->u.chp.cs_dropped, + ts->u.chp.cs_forced_to_close); + else + J_ASSERT(0); + return 0; +} + +static void jbd2_seq_history_stop(struct seq_file *seq, void *v) +{ +} + +static struct seq_operations jbd2_seq_history_ops = { + .start = jbd2_seq_history_start, + .next = jbd2_seq_history_next, + .stop = jbd2_seq_history_stop, + .show = jbd2_seq_history_show, +}; + +static int jbd2_seq_history_open(struct inode *inode, struct file *file) +{ + journal_t *journal = PDE(inode)->data; + struct jbd2_stats_proc_session *s; + int rc, size; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + size = sizeof(struct transaction_stats_s) * journal->j_history_max; + s->stats = kmalloc(size, GFP_KERNEL); + if (s->stats == NULL) { + kfree(s); + return -ENOMEM; + } + spin_lock(&journal->j_history_lock); + memcpy(s->stats, journal->j_history, size); + s->max = journal->j_history_max; + s->start = journal->j_history_cur % s->max; + spin_unlock(&journal->j_history_lock); + + rc = seq_open(file, &jbd2_seq_history_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = s; + } else { + kfree(s->stats); + kfree(s); + } + return rc; + +} + +static int jbd2_seq_history_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct jbd2_stats_proc_session *s = seq->private; + + kfree(s->stats); + kfree(s); + return seq_release(inode, file); +} + +static struct file_operations jbd2_seq_history_fops = { + .owner = THIS_MODULE, + .open = jbd2_seq_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = jbd2_seq_history_release, +}; + +static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int jbd2_seq_info_show(struct seq_file *seq, void *v) +{ + struct jbd2_stats_proc_session *s = seq->private; + + if (v != SEQ_START_TOKEN) + return 0; + seq_printf(seq, "%lu transaction, each upto %u blocks\n", + s->stats->ts_tid, + s->journal->j_max_transaction_buffers); + if (s->stats->ts_tid == 0) + return 0; + seq_printf(seq, "average: \n %ums waiting for transaction\n", + jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); + seq_printf(seq, " %ums running transaction\n", + jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); + seq_printf(seq, " %ums transaction was being locked\n", + jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); + seq_printf(seq, " %ums flushing data (in ordered mode)\n", + jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); + seq_printf(seq, " %ums logging transaction\n", + jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); + seq_printf(seq, " %lu handles per transaction\n", + s->stats->u.run.rs_handle_count / s->stats->ts_tid); + seq_printf(seq, " %lu blocks per transaction\n", + s->stats->u.run.rs_blocks / s->stats->ts_tid); + seq_printf(seq, " %lu logged blocks per transaction\n", + s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); + return 0; +} + +static void jbd2_seq_info_stop(struct seq_file *seq, void *v) +{ +} + +static struct seq_operations jbd2_seq_info_ops = { + .start = jbd2_seq_info_start, + .next = jbd2_seq_info_next, + .stop = jbd2_seq_info_stop, + .show = jbd2_seq_info_show, +}; + +static int jbd2_seq_info_open(struct inode *inode, struct file *file) +{ + journal_t *journal = PDE(inode)->data; + struct jbd2_stats_proc_session *s; + int rc, size; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (s == NULL) + return -ENOMEM; + size = sizeof(struct transaction_stats_s); + s->stats = kmalloc(size, GFP_KERNEL); + if (s->stats == NULL) { + kfree(s); + return -ENOMEM; + } + spin_lock(&journal->j_history_lock); + memcpy(s->stats, &journal->j_stats, size); + s->journal = journal; + spin_unlock(&journal->j_history_lock); + + rc = seq_open(file, &jbd2_seq_info_ops); + if (rc == 0) { + struct seq_file *m = file->private_data; + m->private = s; + } else { + kfree(s->stats); + kfree(s); + } + return rc; + +} + +static int jbd2_seq_info_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct jbd2_stats_proc_session *s = seq->private; + kfree(s->stats); + kfree(s); + return seq_release(inode, file); +} + +static struct file_operations jbd2_seq_info_fops = { + .owner = THIS_MODULE, + .open = jbd2_seq_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = jbd2_seq_info_release, +}; + +static struct proc_dir_entry *proc_jbd2_stats; + +static void jbd2_stats_proc_init(journal_t *journal) +{ + char name[BDEVNAME_SIZE]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); + journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats); + if (journal->j_proc_entry) { + struct proc_dir_entry *p; + p = create_proc_entry("history", S_IRUGO, + journal->j_proc_entry); + if (p) { + p->proc_fops = &jbd2_seq_history_fops; + p->data = journal; + p = create_proc_entry("info", S_IRUGO, + journal->j_proc_entry); + if (p) { + p->proc_fops = &jbd2_seq_info_fops; + p->data = journal; + } + } + } +} + +static void jbd2_stats_proc_exit(journal_t *journal) +{ + char name[BDEVNAME_SIZE]; + + snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); + remove_proc_entry("info", journal->j_proc_entry); + remove_proc_entry("history", journal->j_proc_entry); + remove_proc_entry(name, proc_jbd2_stats); +} + +static void journal_init_stats(journal_t *journal) +{ + int size; + + if (!proc_jbd2_stats) + return; + + journal->j_history_max = 100; + size = sizeof(struct transaction_stats_s) * journal->j_history_max; + journal->j_history = kzalloc(size, GFP_KERNEL); + if (!journal->j_history) { + journal->j_history_max = 0; + return; + } + spin_lock_init(&journal->j_history_lock); +} + /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -681,6 +988,9 @@ static journal_t * journal_init_common (void) kfree(journal); goto fail; } + + journal_init_stats(journal); + return journal; fail: return NULL; @@ -735,6 +1045,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_maxlen = len; + jbd2_stats_proc_init(journal); bh = __getblk(journal->j_dev, start, journal->j_blocksize); J_ASSERT(bh != NULL); @@ -773,6 +1084,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; journal->j_blocksize = inode->i_sb->s_blocksize; + jbd2_stats_proc_init(journal); /* journal descriptor can store up to n blocks -bzzz */ n = journal->j_blocksize / sizeof(journal_block_tag_t); @@ -1153,6 +1465,8 @@ void jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); } + if (journal->j_proc_entry) + jbd2_stats_proc_exit(journal); if (journal->j_inode) iput(journal->j_inode); if (journal->j_revoke) @@ -1264,6 +1578,32 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, return 1; } +/* + * jbd2_journal_clear_features () - Clear a given journal feature in the + * superblock + * @journal: Journal to act on. + * @compat: bitmask of compatible features + * @ro: bitmask of features that force read-only mount + * @incompat: bitmask of incompatible features + * + * Clear a given journal feature as present on the + * superblock. + */ +void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) +{ + journal_superblock_t *sb; + + jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat &= ~cpu_to_be32(compat); + sb->s_feature_ro_compat &= ~cpu_to_be32(ro); + sb->s_feature_incompat &= ~cpu_to_be32(incompat); +} +EXPORT_SYMBOL(jbd2_journal_clear_features); /** * int jbd2_journal_update_format () - Update on-disk journal structure. @@ -1633,7 +1973,7 @@ static int journal_init_jbd2_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - 0, /* flags */ + SLAB_TEMPORARY, /* flags */ NULL); /* ctor */ retval = 0; if (jbd2_journal_head_cache == 0) { @@ -1900,6 +2240,28 @@ static void __exit jbd2_remove_debugfs_entry(void) #endif +#ifdef CONFIG_PROC_FS + +#define JBD2_STATS_PROC_NAME "fs/jbd2" + +static void __init jbd2_create_jbd_stats_proc_entry(void) +{ + proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL); +} + +static void __exit jbd2_remove_jbd_stats_proc_entry(void) +{ + if (proc_jbd2_stats) + remove_proc_entry(JBD2_STATS_PROC_NAME, NULL); +} + +#else + +#define jbd2_create_jbd_stats_proc_entry() do {} while (0) +#define jbd2_remove_jbd_stats_proc_entry() do {} while (0) + +#endif + struct kmem_cache *jbd2_handle_cache; static int __init journal_init_handle_cache(void) @@ -1907,7 +2269,7 @@ static int __init journal_init_handle_cache(void) jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle", sizeof(handle_t), 0, /* offset */ - 0, /* flags */ + SLAB_TEMPORARY, /* flags */ NULL); /* ctor */ if (jbd2_handle_cache == NULL) { printk(KERN_EMERG "JBD: failed to create handle cache\n"); @@ -1955,6 +2317,7 @@ static int __init journal_init(void) if (ret != 0) jbd2_journal_destroy_caches(); jbd2_create_debugfs_entry(); + jbd2_create_jbd_stats_proc_entry(); return ret; } @@ -1966,6 +2329,7 @@ static void __exit journal_exit(void) printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); #endif jbd2_remove_debugfs_entry(); + jbd2_remove_jbd_stats_proc_entry(); jbd2_journal_destroy_caches(); } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index d0ce627539e..921680663fa 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -21,6 +21,7 @@ #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/crc32.h> #endif /* @@ -316,6 +317,37 @@ static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag return block; } +/* + * calc_chksums calculates the checksums for the blocks described in the + * descriptor block. + */ +static int calc_chksums(journal_t *journal, struct buffer_head *bh, + unsigned long *next_log_block, __u32 *crc32_sum) +{ + int i, num_blks, err; + unsigned long io_block; + struct buffer_head *obh; + + num_blks = count_tags(journal, bh); + /* Calculate checksum of the descriptor block. */ + *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); + + for (i = 0; i < num_blks; i++) { + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + printk(KERN_ERR "JBD: IO error %d recovering block " + "%lu in log\n", err, io_block); + return 1; + } else { + *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, + obh->b_size); + } + } + return 0; +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -328,6 +360,7 @@ static int do_one_pass(journal_t *journal, unsigned int sequence; int blocktype; int tag_bytes = journal_tag_bytes(journal); + __u32 crc32_sum = ~0; /* Transactional Checksums */ /* Precompute the maximum metadata descriptors in a descriptor block */ int MAX_BLOCKS_PER_DESC; @@ -419,12 +452,26 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: /* If it is a valid descriptor block, replay it - * in pass REPLAY; otherwise, just skip over the - * blocks it describes. */ + * in pass REPLAY; if journal_checksums enabled, then + * calculate checksums in PASS_SCAN, otherwise, + * just skip over the blocks it describes. */ if (pass != PASS_REPLAY) { + if (pass == PASS_SCAN && + JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM) && + !info->end_transaction) { + if (calc_chksums(journal, bh, + &next_log_block, + &crc32_sum)) { + put_bh(bh); + break; + } + put_bh(bh); + continue; + } next_log_block += count_tags(journal, bh); wrap(journal, next_log_block); - brelse(bh); + put_bh(bh); continue; } @@ -516,9 +563,96 @@ static int do_one_pass(journal_t *journal, continue; case JBD2_COMMIT_BLOCK: - /* Found an expected commit block: not much to - * do other than move on to the next sequence + /* How to differentiate between interrupted commit + * and journal corruption ? + * + * {nth transaction} + * Checksum Verification Failed + * | + * ____________________ + * | | + * async_commit sync_commit + * | | + * | GO TO NEXT "Journal Corruption" + * | TRANSACTION + * | + * {(n+1)th transanction} + * | + * _______|______________ + * | | + * Commit block found Commit block not found + * | | + * "Journal Corruption" | + * _____________|_________ + * | | + * nth trans corrupt OR nth trans + * and (n+1)th interrupted interrupted + * before commit block + * could reach the disk. + * (Cannot find the difference in above + * mentioned conditions. Hence assume + * "Interrupted Commit".) + */ + + /* Found an expected commit block: if checksums + * are present verify them in PASS_SCAN; else not + * much to do other than move on to the next sequence * number. */ + if (pass == PASS_SCAN && + JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_COMPAT_CHECKSUM)) { + int chksum_err, chksum_seen; + struct commit_header *cbh = + (struct commit_header *)bh->b_data; + unsigned found_chksum = + be32_to_cpu(cbh->h_chksum[0]); + + chksum_err = chksum_seen = 0; + + if (info->end_transaction) { + printk(KERN_ERR "JBD: Transaction %u " + "found to be corrupt.\n", + next_commit_ID - 1); + brelse(bh); + break; + } + + if (crc32_sum == found_chksum && + cbh->h_chksum_type == JBD2_CRC32_CHKSUM && + cbh->h_chksum_size == + JBD2_CRC32_CHKSUM_SIZE) + chksum_seen = 1; + else if (!(cbh->h_chksum_type == 0 && + cbh->h_chksum_size == 0 && + found_chksum == 0 && + !chksum_seen)) + /* + * If fs is mounted using an old kernel and then + * kernel with journal_chksum is used then we + * get a situation where the journal flag has + * checksum flag set but checksums are not + * present i.e chksum = 0, in the individual + * commit blocks. + * Hence to avoid checksum failures, in this + * situation, this extra check is added. + */ + chksum_err = 1; + + if (chksum_err) { + info->end_transaction = next_commit_ID; + + if (!JBD2_HAS_COMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ + printk(KERN_ERR + "JBD: Transaction %u " + "found to be corrupt.\n", + next_commit_ID); + brelse(bh); + break; + } + } + crc32_sum = ~0; + } brelse(bh); next_commit_ID++; continue; @@ -554,9 +688,10 @@ static int do_one_pass(journal_t *journal, * transaction marks the end of the valid log. */ - if (pass == PASS_SCAN) - info->end_transaction = next_commit_ID; - else { + if (pass == PASS_SCAN) { + if (!info->end_transaction) + info->end_transaction = next_commit_ID; + } else { /* It's really bad news if different passes end up at * different places (but possible due to IO errors). */ if (info->end_transaction != next_commit_ID) { diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 3595fd432d5..df36f42e19e 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -171,13 +171,15 @@ int __init jbd2_journal_init_revoke_caches(void) { jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", sizeof(struct jbd2_revoke_record_s), - 0, SLAB_HWCACHE_ALIGN, NULL); + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); if (jbd2_revoke_record_cache == 0) return -ENOMEM; jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", sizeof(struct jbd2_revoke_table_s), - 0, 0, NULL); + 0, SLAB_TEMPORARY, NULL); if (jbd2_revoke_table_cache == 0) { kmem_cache_destroy(jbd2_revoke_record_cache); jbd2_revoke_record_cache = NULL; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b1fcf2b3dca..b9b0b6f899b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -54,11 +54,13 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) spin_lock_init(&transaction->t_handle_lock); /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = transaction->t_expires; + journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); journal->j_running_transaction = transaction; + transaction->t_max_wait = 0; + transaction->t_start = jiffies; return transaction; } @@ -85,6 +87,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle) int nblocks = handle->h_buffer_credits; transaction_t *new_transaction = NULL; int ret = 0; + unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", @@ -217,6 +220,12 @@ repeat_locked: /* OK, account for the buffers that this operation expects to * use and add the handle to the running transaction. */ + if (time_after(transaction->t_start, ts)) { + ts = jbd2_time_diff(ts, transaction->t_start); + if (ts > transaction->t_max_wait) + transaction->t_max_wait = ts; + } + handle->h_transaction = transaction; transaction->t_outstanding_credits += nblocks; transaction->t_updates++; @@ -232,6 +241,8 @@ out: return ret; } +static struct lock_class_key jbd2_handle_key; + /* Allocate a new handle. This should probably be in a slab... */ static handle_t *new_handle(int nblocks) { @@ -242,6 +253,9 @@ static handle_t *new_handle(int nblocks) handle->h_buffer_credits = nblocks; handle->h_ref = 1; + lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle", + &jbd2_handle_key, 0); + return handle; } @@ -284,7 +298,11 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) jbd2_free_handle(handle); current->journal_info = NULL; handle = ERR_PTR(err); + goto out; } + + lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_); +out: return handle; } @@ -1164,7 +1182,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } /* That test should have eliminated the following case: */ - J_ASSERT_JH(jh, jh->b_frozen_data == 0); + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); JBUFFER_TRACE(jh, "file as BJ_Metadata"); spin_lock(&journal->j_list_lock); @@ -1410,6 +1428,8 @@ int jbd2_journal_stop(handle_t *handle) spin_unlock(&journal->j_state_lock); } + lock_release(&handle->h_lockdep_map, 1, _THIS_IP_); + jbd2_free_handle(handle); return err; } @@ -1512,7 +1532,7 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); if (jh->b_jlist != BJ_None) - J_ASSERT_JH(jh, transaction != 0); + J_ASSERT_JH(jh, transaction != NULL); switch (jh->b_jlist) { case BJ_None: @@ -1581,11 +1601,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (buffer_locked(bh) || buffer_dirty(bh)) goto out; - if (jh->b_next_transaction != 0) + if (jh->b_next_transaction != NULL) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { + if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { /* A written-back ordered data buffer */ JBUFFER_TRACE(jh, "release data"); @@ -1593,7 +1613,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_remove_journal_head(bh); __brelse(bh); } - } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { + } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); @@ -1953,7 +1973,7 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); J_ASSERT_JH(jh, jh->b_transaction == transaction || - jh->b_transaction == 0); + jh->b_transaction == NULL); if (jh->b_transaction && jh->b_jlist == jlist) return; |