diff options
Diffstat (limited to 'fs/jbd')
| -rw-r--r-- | fs/jbd/checkpoint.c | 77 | ||||
| -rw-r--r-- | fs/jbd/commit.c | 229 | ||||
| -rw-r--r-- | fs/jbd/journal.c | 510 | ||||
| -rw-r--r-- | fs/jbd/recovery.c | 39 | ||||
| -rw-r--r-- | fs/jbd/revoke.c | 66 | ||||
| -rw-r--r-- | fs/jbd/transaction.c | 380 |
6 files changed, 782 insertions, 519 deletions
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 61f32f3868c..08c03044abd 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -22,6 +22,8 @@ #include <linux/jbd.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/blkdev.h> +#include <trace/events/jbd.h> /* * Unlink a buffer from a transaction checkpoint list. @@ -95,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); } else { @@ -220,8 +226,8 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + get_bh(bh); if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -240,7 +246,6 @@ restart: */ released = __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } @@ -253,8 +258,13 @@ static void __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + struct blk_plug plug; + + blk_start_plug(&plug); + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); - ll_rw_block(SWRITE, *batch_count, bhs); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); @@ -281,7 +291,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, int ret = 0; if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); + get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -302,12 +312,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; + get_bh(bh); J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); __brelse(bh); } else { /* @@ -356,6 +366,7 @@ int log_do_checkpoint(journal_t *journal) * journal straight away. */ result = cleanup_journal_tail(journal); + trace_jbd_checkpoint(journal, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; @@ -442,8 +453,6 @@ out: * * Return <0 on error, 0 on success, 1 if there was nothing to clean up. * - * Called with the journal lock held. - * * This is the only part of the journaling code which really needs to be * aware of transaction aborts. Checkpointing involves writing to the * main filesystem area rather than to the journal, so it can proceed @@ -456,18 +465,19 @@ int cleanup_journal_tail(journal_t *journal) { transaction_t * transaction; tid_t first_tid; - unsigned long blocknr, freed; + unsigned int blocknr, freed; if (is_journal_aborted(journal)) return 1; - /* OK, work out the oldest transaction remaining in the log, and + /* + * OK, work out the oldest transaction remaining in the log, and * the log block it starts at. * * If the log is now empty, we need to work out which is the * next transaction ID we will write, and where it will - * start. */ - + * start. + */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; @@ -493,7 +503,24 @@ int cleanup_journal_tail(journal_t *journal) spin_unlock(&journal->j_state_lock); return 1; } + spin_unlock(&journal->j_state_lock); + + /* + * We need to make sure that any blocks that were recently written out + * --- perhaps by log_do_checkpoint() --- are flushed out before we + * drop the transactions from the journal. Similarly we need to be sure + * superblock makes it to disk before next transaction starts reusing + * freed space (otherwise we could replay some blocks of the new + * transaction thinking they belong to the old one). So we use + * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially + * with an appropriately sized journal, but we need this to guarantee + * correctness. Fortunately cleanup_journal_tail() doesn't get called + * all that often. + */ + journal_update_sb_log_tail(journal, first_tid, blocknr, + WRITE_FLUSH_FUA); + spin_lock(&journal->j_state_lock); /* OK, update the superblock to recover the freed space. * Physical blocks come first: have we wrapped beyond the end of * the log? */ @@ -501,17 +528,16 @@ int cleanup_journal_tail(journal_t *journal) if (blocknr < journal->j_tail) freed = freed + journal->j_last - journal->j_first; + trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); jbd_debug(1, - "Cleaning journal tail from %d to %d (offset %lu), " - "freeing %lu\n", + "Cleaning journal tail from %d to %d (offset %u), " + "freeing %u\n", journal->j_tail_sequence, first_tid, blocknr, freed); journal->j_free += freed; journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); - if (!(journal->j_flags & JFS_ABORT)) - journal_update_superblock(journal, 1); return 0; } @@ -521,11 +547,11 @@ int cleanup_journal_tail(journal_t *journal) /* * journal_clean_one_cp_list * - * Find all the written-back checkpoint buffers in the given list and release them. + * Find all the written-back checkpoint buffers in the given list and release + * them. * - * Called with the journal locked. * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) + * Returns number of buffers reaped (for debug) */ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) @@ -630,8 +656,8 @@ out: * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. * - * This function is called with the journal locked. * This function is called with j_list_lock held. * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ @@ -650,13 +676,14 @@ int __journal_remove_checkpoint(struct journal_head *jh) } journal = transaction->t_journal; + JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; + journal_put_journal_head(jh); if (transaction->t_checkpoint_list != NULL || transaction->t_checkpoint_io_list != NULL) goto out; - JBUFFER_TRACE(jh, "transaction has no more buffers"); /* * There is one special case to worry about: if we have just pulled the @@ -667,10 +694,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) * The locking here around t_state is a bit sleazy. * See the comment at the end of journal_commit_transaction(). */ - if (transaction->t_state != T_FINISHED) { - JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + if (transaction->t_state != T_FINISHED) goto out; - } /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ @@ -682,7 +707,6 @@ int __journal_remove_checkpoint(struct journal_head *jh) wake_up(&journal->j_wait_logspace); ret = 1; out: - JBUFFER_TRACE(jh, "exit"); return ret; } @@ -701,6 +725,8 @@ void __journal_insert_checkpoint(struct journal_head *jh, J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + /* Get reference for checkpointing transaction */ + journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { @@ -750,6 +776,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); + trace_jbd_drop_transaction(journal, transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); kfree(transaction); } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 618e21c0b7a..bb217dcb41a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -17,10 +17,11 @@ #include <linux/fs.h> #include <linux/jbd.h> #include <linux/errno.h> -#include <linux/slab.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/bio.h> +#include <linux/blkdev.h> +#include <trace/events/jbd.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -85,7 +86,12 @@ nope: static void release_data_buffer(struct buffer_head *bh) { if (buffer_freed(bh)) { + WARN_ON_ONCE(buffer_dirty(bh)); clear_buffer_freed(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; release_buffer_page(bh); } else put_bh(bh); @@ -120,7 +126,6 @@ static int journal_write_commit_record(journal_t *journal, struct buffer_head *bh; journal_header_t *header; int ret; - int barrier_done = 0; if (is_journal_aborted(journal)) return 0; @@ -138,34 +143,12 @@ static int journal_write_commit_record(journal_t *journal, JBUFFER_TRACE(descriptor, "write commit block"); set_buffer_dirty(bh); - if (journal->j_flags & JFS_BARRIER) { - set_buffer_ordered(bh); - barrier_done = 1; - } - ret = sync_dirty_buffer(bh); - if (barrier_done) - clear_buffer_ordered(bh); - /* is it possible for another commit to fail at roughly - * the same time as this one? If so, we don't want to - * trust the barrier flag in the super, but instead want - * to remember if we sent a barrier request - */ - if (ret == -EOPNOTSUPP && barrier_done) { - char b[BDEVNAME_SIZE]; - printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", - bdevname(journal->j_dev, b)); - spin_lock(&journal->j_state_lock); - journal->j_flags &= ~JFS_BARRIER; - spin_unlock(&journal->j_state_lock); - - /* And try again, without the barrier */ - set_buffer_uptodate(bh); - set_buffer_dirty(bh); + if (journal->j_flags & JFS_BARRIER) + ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA); + else ret = sync_dirty_buffer(bh); - } + put_bh(bh); /* One for getblk() */ journal_put_journal_head(descriptor); @@ -179,8 +162,17 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, for (i = 0; i < bufs; i++) { wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(write_op, wbuf[i]); + /* + * Here we write back pagecache data that may be mmaped. Since + * we cannot afford to clean the page and set PageWriteback + * here due to lock ordering (page lock ranks above transaction + * start), the data can change while IO is in flight. Tell the + * block layer it should bounce the bio pages if stable data + * during write is required. + * + * We use up our safety reference in submit_bh(). + */ + _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE); } } @@ -227,6 +219,8 @@ write_out_data: if (!trylock_buffer(bh)) { BUFFER_TRACE(bh, "needs blocking lock"); spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); /* Write out all data to prevent deadlocks */ journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; @@ -259,6 +253,8 @@ write_out_data: jbd_unlock_bh_state(bh); if (bufs == journal->j_wbufsize) { spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); bufs = 0; goto write_out_data; @@ -276,10 +272,6 @@ write_out_data: jbd_unlock_bh_state(bh); if (locked) unlock_buffer(bh); - journal_remove_journal_head(bh); - /* One for our safety reference, other for - * journal_remove_journal_head() */ - put_bh(bh); release_data_buffer(bh); } @@ -289,6 +281,7 @@ write_out_data: } } spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, commit_transaction); journal_do_submit_data(wbuf, bufs, write_op); return err; @@ -308,7 +301,7 @@ void journal_commit_transaction(journal_t *journal) int bufs; int flags; int err; - unsigned long blocknr; + unsigned int blocknr; ktime_t start_time; u64 commit_time; char *tagp = NULL; @@ -318,6 +311,7 @@ void journal_commit_transaction(journal_t *journal) int first_tag = 0; int tag_flag; int i; + struct blk_plug plug; int write_op = WRITE; /* @@ -325,16 +319,19 @@ void journal_commit_transaction(journal_t *journal) * all outstanding updates to complete. */ -#ifdef COMMIT_STATS - spin_lock(&journal->j_list_lock); - summarise_journal_usage(journal); - spin_unlock(&journal->j_list_lock); -#endif - /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); - journal_update_superblock(journal, 1); + mutex_lock(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + journal_update_sb_log_tail(journal, journal->j_tail_sequence, + journal->j_tail, WRITE_SYNC); + mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); } @@ -343,21 +340,16 @@ void journal_commit_transaction(journal_t *journal) J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; - J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_RUNNING); commit_transaction->t_state = T_LOCKED; - /* - * Use plugged writes here, since we want to submit several before - * we unplug the device. We don't do explicit unplugging in here, - * instead we rely on sync_buffer() doing the unplug for us. - */ - if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC_PLUG; + trace_jbd_commit_locking(journal, commit_transaction); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -392,7 +384,7 @@ void journal_commit_transaction(journal_t *journal) * we do not require it to remember exactly which old buffers it * has reserved. This is consistent with the existing behaviour * that multiple journal_get_write_access() calls to the same - * buffer are perfectly permissable. + * buffer are perfectly permissible. */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; @@ -424,10 +416,17 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 1\n"); /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + journal_clear_buffer_revoked_flags(journal); + + /* * Switch to a new revoke table. */ journal_switch_revoke_table(journal); + trace_jbd_commit_flushing(journal, commit_transaction); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; @@ -438,12 +437,17 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 2\n"); + if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid)) + write_op = WRITE_SYNC; + /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ + blk_start_plug(&plug); err = journal_submit_data_buffers(journal, commit_transaction, write_op); + blk_finish_plug(&plug); /* * Wait for all previously submitted IO to complete. @@ -480,14 +484,9 @@ void journal_commit_transaction(journal_t *journal) } if (buffer_jbd(bh) && bh2jh(bh) == jh && jh->b_transaction == commit_transaction && - jh->b_jlist == BJ_Locked) { + jh->b_jlist == BJ_Locked) __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } + jbd_unlock_bh_state(bh); release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } @@ -504,6 +503,8 @@ void journal_commit_transaction(journal_t *journal) err = 0; } + blk_start_plug(&plug); + journal_write_revoke_records(journal, commit_transaction, write_op); /* @@ -525,6 +526,7 @@ void journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_COMMIT; spin_unlock(&journal->j_state_lock); + trace_jbd_commit_logging(journal, commit_transaction); J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); @@ -611,13 +613,13 @@ void journal_commit_transaction(journal_t *journal) /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get rid of the BJ_IO/BJ_Shadow pairing of buffers. */ - atomic_inc(&jh2bh(jh)->b_count); + get_bh(jh2bh(jh)); /* Make a temporary IO buffer with which to write it out (this will requeue both the metadata buffer and the temporary IO buffer). new_bh goes on BJ_IO*/ - set_bit(BH_JWrite, &jh2bh(jh)->b_state); + set_buffer_jwrite(jh2bh(jh)); /* * akpm: journal_write_metadata_buffer() sets * new_bh->b_transaction to commit_transaction. @@ -627,7 +629,7 @@ void journal_commit_transaction(journal_t *journal) JBUFFER_TRACE(jh, "ph3: write metadata"); flags = journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); - set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); + set_buffer_jwrite(jh2bh(new_jh)); wbuf[bufs++] = jh2bh(new_jh); /* Record the new block's tag in the current descriptor @@ -674,7 +676,17 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(write_op, bh); + /* + * In data=journal mode, here we can end up + * writing pagecache data that might be + * mmapped. Since we can't afford to clean the + * page and set PageWriteback (see the comment + * near the other use of _submit_bh()), the + * data can change while the write is in + * flight. Tell the block layer to bounce the + * bio pages if stable pages are required. + */ + _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE); } cond_resched(); @@ -685,6 +697,8 @@ start_journal_io: } } + blk_finish_plug(&plug); + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -737,7 +751,7 @@ wait_for_iobuf: shadowed buffer */ jh = commit_transaction->t_shadow_list->b_tprev; bh = jh2bh(jh); - clear_bit(BH_JWrite, &bh->b_state); + clear_buffer_jwrite(bh); J_ASSERT_BH(bh, buffer_jbddirty(bh)); /* The metadata is now released for reuse, but we need @@ -746,8 +760,13 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* Wake up any transactions which were waiting for this - IO to complete */ + /* + * Wake up any transactions which were waiting for this + * IO to complete. The barrier must be here so that changes + * by journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); @@ -787,6 +806,12 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 6\n"); + /* All metadata is written, now write commit record and do cleanup */ + spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_RECORD; + spin_unlock(&journal->j_state_lock); + if (journal_write_commit_record(journal, commit_transaction)) err = -EIO; @@ -816,10 +841,16 @@ restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); @@ -859,17 +890,35 @@ restart_loop: * there's no point in keeping a checkpoint record for * it. */ - /* A buffer which has been freed while still being - * journaled by a previous transaction may end up still - * being dirty here, but we want to avoid writing back - * that buffer in the future now that the last use has - * been committed. That's not only a performance gain, - * it also stops aliasing problems if the buffer is left - * behind for writeback and gets reallocated for another - * use in a different page. */ + /* + * A buffer which has been freed while still being journaled by + * a previous transaction. + */ if (buffer_freed(bh)) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); + /* + * If the running transaction is the one containing + * "add to orphan" operation (b_next_transaction != + * NULL), we have to wait for that transaction to + * commit before we can really get rid of the buffer. + * So just clear b_modified to not confuse transaction + * credit accounting and refile the buffer to + * BJ_Forget of the running transaction. If the just + * committed transaction contains "add to orphan" + * operation, we can completely invalidate the buffer + * now. We are rather throughout in that since the + * buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial + * page. + */ + jh->b_modified = 0; + if (!jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } } if (buffer_jbddirty(bh)) { @@ -877,28 +926,27 @@ restart_loop: __journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); - JBUFFER_TRACE(jh, "refile for checkpoint writeback"); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* The buffer on BJ_Forget list and not jbddirty means + /* + * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget - * list. */ - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __journal_refile_buffer(jh); - if (!jh->b_transaction) { - jbd_unlock_bh_state(bh); - /* needs a brelse */ - journal_remove_journal_head(bh); - release_buffer_page(bh); - } else - jbd_unlock_bh_state(bh); + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; } + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); + else + __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); @@ -924,7 +972,7 @@ restart_loop: jbd_debug(3, "JBD: commit phase 8\n"); - J_ASSERT(commit_transaction->t_state == T_COMMIT); + J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); commit_transaction->t_state = T_FINISHED; J_ASSERT(commit_transaction == journal->j_committing_transaction); @@ -965,6 +1013,7 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + trace_jbd_end_commit(journal, commit_transaction); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 737f7246a4b..06fe11e0abf 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -36,6 +36,10 @@ #include <linux/poison.h> #include <linux/proc_fs.h> #include <linux/debugfs.h> +#include <linux/ratelimit.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/jbd.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -73,6 +77,7 @@ EXPORT_SYMBOL(journal_errno); EXPORT_SYMBOL(journal_ack_err); EXPORT_SYMBOL(journal_clear_err); EXPORT_SYMBOL(log_wait_commit); +EXPORT_SYMBOL(log_start_commit); EXPORT_SYMBOL(journal_start_commit); EXPORT_SYMBOL(journal_force_commit_nested); EXPORT_SYMBOL(journal_wipe); @@ -83,6 +88,25 @@ EXPORT_SYMBOL(journal_force_commit); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); +static const char *journal_dev_name(journal_t *journal, char *buffer); + +#ifdef CONFIG_JBD_DEBUG +void __jbd_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (level > journal_enable_debug) + return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + va_end(args); +} +EXPORT_SYMBOL(__jbd_debug); +#endif /* * Helper function used to manage commit timeouts @@ -123,6 +147,8 @@ static int kjournald(void *arg) setup_timer(&journal->j_commit_timer, commit_timeout, (unsigned long)current); + set_freezable(); + /* Record that the journal thread is running */ journal->j_task = current; wake_up(&journal->j_wait_done_commit); @@ -160,7 +186,7 @@ loop: */ jbd_debug(1, "Now suspending kjournald\n"); spin_unlock(&journal->j_state_lock); - refrigerator(); + try_to_freeze(); spin_lock(&journal->j_state_lock); } else { /* @@ -276,7 +302,7 @@ static void journal_kill_thread(journal_t *journal) int journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct journal_head **jh_out, - unsigned long blocknr) + unsigned int blocknr) { int need_copy_out = 0; int done_copy_out = 0; @@ -287,6 +313,7 @@ int journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; /* * The buffer really shouldn't be locked: only the current committing @@ -300,6 +327,9 @@ int journal_write_metadata_buffer(transaction_t *transaction, J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ /* * If a new transaction has already done a buffer copy-out, then @@ -316,7 +346,7 @@ repeat: new_offset = offset_in_page(jh2bh(jh_in)->b_data); } - mapped_data = kmap_atomic(new_page, KM_USER0); + mapped_data = kmap_atomic(new_page); /* * Check for escaping */ @@ -325,7 +355,7 @@ repeat: need_copy_out = 1; do_escape = 1; } - kunmap_atomic(mapped_data, KM_USER0); + kunmap_atomic(mapped_data); /* * Do we need to do a data copy? @@ -342,9 +372,9 @@ repeat: } jh_in->b_frozen_data = tmp; - mapped_data = kmap_atomic(new_page, KM_USER0); + mapped_data = kmap_atomic(new_page); memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); - kunmap_atomic(mapped_data, KM_USER0); + kunmap_atomic(mapped_data); new_page = virt_to_page(tmp); new_offset = offset_in_page(tmp); @@ -356,19 +386,11 @@ repeat: * copying, we can finally do so. */ if (do_escape) { - mapped_data = kmap_atomic(new_page, KM_USER0); + mapped_data = kmap_atomic(new_page); *((unsigned int *)(mapped_data + new_offset)) = 0; - kunmap_atomic(mapped_data, KM_USER0); + kunmap_atomic(mapped_data); } - /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); - atomic_set(&new_bh->b_count, 1); - jbd_unlock_bh_state(bh_in); - - new_jh = journal_add_journal_head(new_bh); /* This sleeps */ - set_bh_page(new_bh, new_page, new_offset); new_jh->b_transaction = NULL; new_bh->b_size = jh2bh(jh_in)->b_size; @@ -385,7 +407,11 @@ repeat: * copying is moved to the transaction's shadow queue. */ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); - journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + JBUFFER_TRACE(new_jh, "file as BJ_IO"); journal_file_buffer(new_jh, transaction, BJ_IO); @@ -432,11 +458,15 @@ int __log_space_left(journal_t *journal) int __log_start_commit(journal_t *journal, tid_t target) { /* - * Are we already doing a recent enough commit? + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. */ - if (!tid_geq(journal->j_commit_request, target)) { + if (journal->j_commit_request != target && + journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { /* - * We want a new commit: OK, mark the request and wakup the + * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. */ @@ -446,7 +476,14 @@ int __log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); return 0; } @@ -514,8 +551,8 @@ int journal_start_commit(journal_t *journal, tid_t *ptid) ret = 1; } else if (journal->j_committing_transaction) { /* - * If ext3_write_super() recently started a commit, then we - * have to wait for completion of that transaction + * If commit has been started, then we have to wait for + * completion of that transaction. */ if (ptid) *ptid = journal->j_committing_transaction->t_tid; @@ -536,13 +573,25 @@ int log_wait_commit(journal_t *journal, tid_t tid) #ifdef CONFIG_JBD_DEBUG spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } spin_unlock(&journal->j_state_lock); #endif spin_lock(&journal->j_state_lock); + /* + * Not running or committing trans? Must be already committed. This + * saves us from waiting for a *long* time when tid overflows. + */ + if (!((journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) || + (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid))) + goto out_unlock; + + if (!tid_geq(journal->j_commit_waited, tid)) + journal->j_commit_waited = tid; while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); @@ -552,22 +601,53 @@ int log_wait_commit(journal_t *journal, tid_t tid) !tid_gt(tid, journal->j_commit_sequence)); spin_lock(&journal->j_state_lock); } +out_unlock: spin_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } /* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JFS_BARRIER)) + return 0; + spin_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + /* + * Transaction is being committed and we already proceeded to + * writing commit record? + */ + commit_trans = journal->j_committing_transaction; + if (commit_trans && commit_trans->t_tid == tid && + commit_trans->t_state >= T_COMMIT_RECORD) + goto out; + ret = 1; +out: + spin_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(journal_trans_will_send_data_barrier); + +/* * Log buffer allocation routines: */ -int journal_next_log_block(journal_t *journal, unsigned long *retp) +int journal_next_log_block(journal_t *journal, unsigned int *retp) { - unsigned long blocknr; + unsigned int blocknr; spin_lock(&journal->j_state_lock); J_ASSERT(journal->j_free > 1); @@ -588,11 +668,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp) * this is a no-op. If needed, we can use j_blk_offset - everything is * ready. */ -int journal_bmap(journal_t *journal, unsigned long blocknr, - unsigned long *retp) +int journal_bmap(journal_t *journal, unsigned int blocknr, + unsigned int *retp) { int err = 0; - unsigned long ret; + unsigned int ret; if (journal->j_inode) { ret = bmap(journal->j_inode, blocknr); @@ -602,7 +682,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, char b[BDEVNAME_SIZE]; printk(KERN_ALERT "%s: journal block not found " - "at offset %lu on %s\n", + "at offset %u on %s\n", __func__, blocknr, bdevname(journal->j_dev, b)); @@ -628,7 +708,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, struct journal_head *journal_get_descriptor_buffer(journal_t *journal) { struct buffer_head *bh; - unsigned long blocknr; + unsigned int blocknr; int err; err = journal_next_log_block(journal, &blocknr); @@ -671,7 +751,6 @@ static journal_t * journal_init_common (void) init_waitqueue_head(&journal->j_wait_checkpoint); init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); - mutex_init(&journal->j_barrier); mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); @@ -733,7 +812,7 @@ journal_t * journal_init_dev(struct block_device *bdev, journal->j_wbufsize = n; journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", __func__); goto out_err; } @@ -754,6 +833,7 @@ journal_t * journal_init_dev(struct block_device *bdev, return journal; out_err: + kfree(journal->j_wbuf); kfree(journal); return NULL; } @@ -772,7 +852,7 @@ journal_t * journal_init_inode (struct inode *inode) journal_t *journal = journal_init_common(); int err; int n; - unsigned long blocknr; + unsigned int blocknr; if (!journal) return NULL; @@ -793,7 +873,7 @@ journal_t * journal_init_inode (struct inode *inode) journal->j_wbufsize = n; journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", __func__); goto out_err; } @@ -801,7 +881,7 @@ journal_t * journal_init_inode (struct inode *inode) err = journal_bmap(journal, 0, &blocknr); /* If that failed, give up */ if (err) { - printk(KERN_ERR "%s: Cannnot locate journal superblock\n", + printk(KERN_ERR "%s: Cannot locate journal superblock\n", __func__); goto out_err; } @@ -818,6 +898,7 @@ journal_t * journal_init_inode (struct inode *inode) return journal; out_err: + kfree(journal->j_wbuf); kfree(journal); return NULL; } @@ -844,10 +925,16 @@ static void journal_fail_superblock (journal_t *journal) static int journal_reset(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; - unsigned long first, last; + unsigned int first, last; first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); + if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } journal->j_first = first; journal->j_last = last; @@ -862,8 +949,33 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; - /* Add the dynamic fields and write it to disk. */ - journal_update_superblock(journal, 1); + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + journal->j_flags |= JFS_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); + } return journal_start_thread(journal); } @@ -877,7 +989,7 @@ static int journal_reset(journal_t *journal) **/ int journal_create(journal_t *journal) { - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; journal_superblock_t *sb; int i, err; @@ -907,6 +1019,8 @@ int journal_create(journal_t *journal) if (err) return err; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (unlikely(!bh)) + return -ENOMEM; lock_buffer(bh); memset (bh->b_data, 0, journal->j_blocksize); BUFFER_TRACE(bh, "marking dirty"); @@ -938,62 +1052,131 @@ int journal_create(journal_t *journal) return journal_reset(journal); } +static void journal_write_superblock(journal_t *journal, int write_op) +{ + struct buffer_head *bh = journal->j_sb_buffer; + int ret; + + trace_journal_write_superblock(journal, write_op); + if (!(journal->j_flags & JFS_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "JBD: previous I/O error detected " + "for journal superblock update for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: Error %d detected " + "when updating journal superblock for %s.\n", + ret, journal_dev_name(journal, b)); + } +} + /** - * void journal_update_superblock() - Update journal sb on disk. + * journal_update_sb_log_tail() - Update log tail in journal sb on disk. * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. */ -void journal_update_superblock(journal_t *journal, int wait) +void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned int tail_block, int write_op) { journal_superblock_t *sb = journal->j_superblock; - struct buffer_head *bh = journal->j_sb_buffer; - /* - * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0) and there are no outstanding transactions - * in the filesystem, then we can safely defer the superblock update - * until the next commit by setting JFS_FLUSHED. This avoids - * attempting a write to a potential-readonly device. - */ - if (sb->s_start == 0 && journal->j_tail_sequence == - journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %ld, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, - journal->j_errno); - goto out; - } + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", + tail_block, tail_tid); + + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + + journal_write_superblock(journal, write_op); + /* Log is no longer empty */ spin_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, journal->j_errno); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + spin_lock(&journal->j_state_lock); + /* Is it already empty? */ + if (sb->s_start == 0) { + spin_unlock(&journal->j_state_lock); + return; + } + jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - sb->s_errno = cpu_to_be32(journal->j_errno); + sb->s_start = cpu_to_be32(0); spin_unlock(&journal->j_state_lock); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - if (wait) - sync_dirty_buffer(bh); - else - ll_rw_block(SWRITE, 1, &bh); + journal_write_superblock(journal, WRITE_FUA); -out: - /* If we have just flushed the log (by marking s_start==0), then - * any future commit will have to be careful to update the - * superblock again to re-record the true start of the log. */ + spin_lock(&journal->j_state_lock); + /* Log is empty */ + journal->j_flags |= JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +static void journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; spin_lock(&journal->j_state_lock); - if (sb->s_start) - journal->j_flags &= ~JFS_FLUSHED; - else - journal->j_flags |= JFS_FLUSHED; + jbd_debug(1, "JBD: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal, WRITE_SYNC); } /* @@ -1049,6 +1232,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + return 0; out: @@ -1146,6 +1337,7 @@ int journal_destroy(journal_t *journal) { int err = 0; + /* Wait for the commit thread to wake up and die. */ journal_kill_thread(journal); @@ -1155,6 +1347,8 @@ int journal_destroy(journal_t *journal) /* Force any old transactions to disk */ + /* We cannot race with anybody but must keep assertions happy */ + mutex_lock(&journal->j_checkpoint_mutex); /* Totally anal locking here... */ spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { @@ -1170,16 +1364,14 @@ int journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - /* We can now mark the journal as empty. */ - journal->j_tail = 0; journal->j_tail_sequence = ++journal->j_transaction_sequence; - journal_update_superblock(journal, 1); - } else { + mark_journal_empty(journal); + } else err = -EIO; - } brelse(journal->j_sb_buffer); } + mutex_unlock(&journal->j_checkpoint_mutex); if (journal->j_inode) iput(journal->j_inode); @@ -1237,13 +1429,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat, int journal_check_available_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { - journal_superblock_t *sb; - if (!compat && !ro && !incompat) return 1; - sb = journal->j_superblock; - /* We can support any known requested features iff the * superblock is in version 2. Otherwise we fail to support any * extended sb features. */ @@ -1363,7 +1551,6 @@ int journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned long old_tail; spin_lock(&journal->j_state_lock); @@ -1398,6 +1585,7 @@ int journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; + mutex_lock(&journal->j_checkpoint_mutex); cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1405,14 +1593,9 @@ int journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_state_lock); - old_tail = journal->j_tail; - journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); - journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); - journal->j_tail = old_tail; - J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); @@ -1437,7 +1620,6 @@ int journal_flush(journal_t *journal) int journal_wipe(journal_t *journal, int write) { - journal_superblock_t *sb; int err = 0; J_ASSERT (!(journal->j_flags & JFS_LOADED)); @@ -1446,8 +1628,6 @@ int journal_wipe(journal_t *journal, int write) if (err) return err; - sb = journal->j_superblock; - if (!journal->j_tail) goto no_recovery; @@ -1455,8 +1635,12 @@ int journal_wipe(journal_t *journal, int write) write ? "Clearing" : "Ignoring"); err = journal_skip_recovery(journal); - if (write) - journal_update_superblock(journal, 1); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } no_recovery: return err; @@ -1524,7 +1708,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) __journal_abort_hard(journal); if (errno) - journal_update_superblock(journal, 1); + journal_update_sb_errno(journal); } /** @@ -1682,22 +1866,19 @@ static void journal_destroy_journal_head_cache(void) static struct journal_head *journal_alloc_journal_head(void) { struct journal_head *ret; - static unsigned long last_warning; #ifdef CONFIG_JBD_DEBUG atomic_inc(&nr_journal_heads); #endif - ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); if (ret == NULL) { jbd_debug(1, "out of memory for journal_head\n"); - if (time_after(jiffies, last_warning + 5*HZ)) { - printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __func__); - last_warning = jiffies; - } + printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", + __func__); + while (ret == NULL) { yield(); - ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); } } return ret; @@ -1726,10 +1907,9 @@ static void journal_free_journal_head(struct journal_head *jh) * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * - * A journal_head may be detached from its buffer_head when the journal_head's - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. - * Various places in JBD call journal_remove_journal_head() to indicate that the - * journal_head can be dropped if needed. + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the @@ -1742,17 +1922,16 @@ static void journal_free_journal_head(struct journal_head *jh) * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = journal_add_journal_head(bh); * ... - * jh->b_transaction = xxx; - * journal_put_journal_head(jh); - * - * Now, the journal_head's b_jcount is zero, but it is safe from being released - * because it has a non-zero b_transaction. + * (Get another reference for transaction) + * journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * journal_put_journal_head(jh); */ /* * Give a buffer_head a journal_head. * - * Doesn't need the journal lock. * May sleep. */ struct journal_head *journal_add_journal_head(struct buffer_head *bh) @@ -1761,10 +1940,8 @@ struct journal_head *journal_add_journal_head(struct buffer_head *bh) struct journal_head *new_jh = NULL; repeat: - if (!buffer_jbd(bh)) { + if (!buffer_jbd(bh)) new_jh = journal_alloc_journal_head(); - memset(new_jh, 0, sizeof(*new_jh)); - } jbd_lock_bh_journal_head(bh); if (buffer_jbd(bh)) { @@ -1816,61 +1993,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh) struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_jcount >= 0); - - get_bh(bh); - if (jh->b_jcount == 0) { - if (jh->b_transaction == NULL && - jh->b_next_transaction == NULL && - jh->b_cp_transaction == NULL) { - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing " - "b_frozen_data\n", - __func__); - jbd_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing " - "b_committed_data\n", - __func__); - jbd_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - __brelse(bh); - journal_free_journal_head(jh); - } else { - BUFFER_TRACE(bh, "journal_head was locked"); - } + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd_free(jh->b_frozen_data, bh->b_size); } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); } /* - * journal_remove_journal_head(): if the buffer isn't attached to a transaction - * and has a zero b_jcount then remove and release its journal_head. If we did - * see that the buffer is not used by any transaction we also "logically" - * decrement ->b_count. - * - * We in fact take an additional increment on ->b_count as a convenience, - * because the caller usually wants to do additional things with the bh - * after calling here. - * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some - * time. Once the caller has run __brelse(), the buffer is eligible for - * reaping by try_to_free_buffers(). - */ -void journal_remove_journal_head(struct buffer_head *bh) -{ - jbd_lock_bh_journal_head(bh); - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then try to + * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. */ void journal_put_journal_head(struct journal_head *jh) @@ -1880,11 +2025,12 @@ void journal_put_journal_head(struct journal_head *jh) jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; - if (!jh->b_jcount && !jh->b_transaction) { + if (!jh->b_jcount) { __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); __brelse(bh); - } - jbd_unlock_bh_journal_head(bh); + } else + jbd_unlock_bh_journal_head(bh); } /* @@ -1902,7 +2048,7 @@ static void __init jbd_create_debugfs_entry(void) { jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); if (jbd_debugfs_dir) - jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO, + jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, jbd_debugfs_dir, &journal_enable_debug); } @@ -1988,7 +2134,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); #endif jbd_remove_debugfs_entry(); journal_destroy_caches(); diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index db5e982c5dd..a748fe21465 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -20,7 +20,7 @@ #include <linux/fs.h> #include <linux/jbd.h> #include <linux/errno.h> -#include <linux/slab.h> +#include <linux/blkdev.h> #endif /* @@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start) { int err; unsigned int max, nbufs, next; - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; struct buffer_head * bufs[MAXBUF]; @@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, unsigned int offset) { int err; - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; *bhp = NULL; @@ -264,6 +264,12 @@ int journal_recover(journal_t *journal) err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; + /* Flush disk caches to get replayed data on the permanent storage */ + if (journal->j_flags & JFS_BARRIER) { + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (!err) + err = err2; + } return err; } @@ -284,12 +290,9 @@ int journal_recover(journal_t *journal) int journal_skip_recovery(journal_t *journal) { int err; - journal_superblock_t * sb; - struct recovery_info info; memset (&info, 0, sizeof(info)); - sb = journal->j_superblock; err = do_one_pass(journal, &info, PASS_SCAN); @@ -298,11 +301,12 @@ int journal_skip_recovery(journal_t *journal) ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD_DEBUG - int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); -#endif + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); +#endif journal->j_transaction_sequence = ++info.end_transaction; } @@ -314,7 +318,7 @@ static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { unsigned int first_commit_ID, next_commit_ID; - unsigned long next_log_block; + unsigned int next_log_block; int err, success = 0; journal_superblock_t * sb; journal_header_t * tmp; @@ -322,11 +326,6 @@ static int do_one_pass(journal_t *journal, unsigned int sequence; int blocktype; - /* Precompute the maximum metadata descriptors in a descriptor block */ - int MAX_BLOCKS_PER_DESC; - MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) - / sizeof(journal_block_tag_t)); - /* * First thing is to establish what we expect to find in the log * (in terms of transaction IDs), and where (in terms of log @@ -367,14 +366,14 @@ static int do_one_pass(journal_t *journal, if (tid_geq(next_commit_ID, info->end_transaction)) break; - jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", next_commit_ID, next_log_block, journal->j_last); /* Skip over each chunk of the transaction looking * either the next descriptor block or the final commit * record. */ - jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + jbd_debug(3, "JBD: checking block %u\n", next_log_block); err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -429,7 +428,7 @@ static int do_one_pass(journal_t *journal, tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) <= journal->j_blocksize) { - unsigned long io_block; + unsigned int io_block; tag = (journal_block_tag_t *) tagp; flags = be32_to_cpu(tag->t_flags); @@ -443,10 +442,10 @@ static int do_one_pass(journal_t *journal, success = err; printk (KERN_ERR "JBD: IO error %d recovering " - "block %ld in log\n", + "block %u in log\n", err, io_block); } else { - unsigned long blocknr; + unsigned int blocknr; J_ASSERT(obh != NULL); blocknr = be32_to_cpu(tag->t_blocknr); @@ -581,7 +580,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, max = be32_to_cpu(header->r_count); while (offset < max) { - unsigned long blocknr; + unsigned int blocknr; int err; blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index da6cd9bdaab..8898bbd2b61 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -47,6 +47,10 @@ * overwriting the new data. We don't even need to clear the revoke * bit here. * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up @@ -71,7 +75,7 @@ * switching hash tables under them. For operations on the lists of entries in * the hash table j_revoke_lock is used. * - * Finally, also replay code uses the hash tables but at this moment noone else + * Finally, also replay code uses the hash tables but at this moment no one else * can touch them (filesystem isn't mounted yet) and hence no locking is * needed. */ @@ -101,7 +105,7 @@ struct jbd_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ - unsigned long blocknr; + unsigned int blocknr; }; @@ -126,7 +130,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int); /* Utility functions to maintain the revoke table */ /* Borrowed from buffer.c: this is a tried and tested block hash function */ -static inline int hash(journal_t *journal, unsigned long block) +static inline int hash(journal_t *journal, unsigned int block) { struct jbd_revoke_table_s *table = journal->j_revoke; int hash_shift = table->hash_shift; @@ -136,7 +140,7 @@ static inline int hash(journal_t *journal, unsigned long block) (block << (hash_shift - 12))) & (table->hash_size - 1); } -static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, +static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, tid_t seq) { struct list_head *hash_list; @@ -166,7 +170,7 @@ oom: /* Find a revoke record in the journal's hash table. */ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, - unsigned long blocknr) + unsigned int blocknr) { struct list_head *hash_list; struct jbd_revoke_record_s *record; @@ -227,19 +231,15 @@ record_cache_failure: static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) { - int shift = 0; - int tmp = hash_size; + int i; struct jbd_revoke_table_s *table; table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); if (!table) goto out; - while((tmp >>= 1UL) != 0UL) - shift++; - table->hash_size = hash_size; - table->hash_shift = shift; + table->hash_shift = ilog2(hash_size); table->hash_table = kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { @@ -248,8 +248,8 @@ static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) goto out; } - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&table->hash_table[tmp]); + for (i = 0; i < hash_size; i++) + INIT_LIST_HEAD(&table->hash_table[i]); out: return table; @@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal) * by one. */ -int journal_revoke(handle_t *handle, unsigned long blocknr, +int journal_revoke(handle_t *handle, unsigned int blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; @@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, } } - jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); + jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); @@ -479,6 +479,36 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) return did_revoke; } +/* + * journal_clear_revoked_flags clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffer in the next + * transaction which is going to be started. + */ +void journal_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz @@ -617,7 +647,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); + write_dirty_buffer(bh, write_op); } #endif @@ -644,7 +674,7 @@ static void flush_descriptor(journal_t *journal, */ int journal_set_revoke(journal_t *journal, - unsigned long blocknr, + unsigned int blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; @@ -668,7 +698,7 @@ int journal_set_revoke(journal_t *journal, */ int journal_test_revoke(journal_t *journal, - unsigned long blocknr, + unsigned int blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index ed886e6db39..1695ba8334a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction) spin_lock_init(&transaction->t_handle_lock); /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); + journal->j_commit_timer.expires = + round_jiffies_up(transaction->t_expires); add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); @@ -206,7 +207,7 @@ repeat_locked: * the committing transaction. Really, we only need to give it * committing_transaction->t_outstanding_credits plus "enough" for * the log control blocks. - * Also, this test is inconsitent with the matching one in + * Also, this test is inconsistent with the matching one in * journal_extend(). */ if (__log_space_left(journal) < jbd_space_needed(journal)) { @@ -228,6 +229,8 @@ repeat_locked: __log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); out: if (unlikely(new_transaction)) /* It's usually NULL */ kfree(new_transaction); @@ -242,7 +245,6 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; @@ -263,7 +265,8 @@ static handle_t *new_handle(int nblocks) * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or NULL on failure + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. */ handle_t *journal_start(journal_t *journal, int nblocks) { @@ -290,12 +293,7 @@ handle_t *journal_start(journal_t *journal, int nblocks) jbd_free_handle(handle); current->journal_info = NULL; handle = ERR_PTR(err); - goto out; } - - lock_map_acquire(&handle->h_lockdep_map); - -out: return handle; } @@ -416,6 +414,7 @@ int journal_restart(handle_t *handle, int nblocks) __log_start_commit(journal, transaction->t_tid); spin_unlock(&journal->j_state_lock); + lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; ret = start_this_handle(journal, handle); return ret; @@ -426,17 +425,34 @@ int journal_restart(handle_t *handle, int nblocks) * void journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. * - * This locks out any further updates from being started, and blocks - * until all existing updates have completed, returning only once the - * journal is in a quiescent state with no updates running. + * This locks out any further updates from being started, and blocks until all + * existing updates have completed, returning only once the journal is in a + * quiescent state with no updates running. * - * The journal lock should not be held on entry. + * We do not use simple mutex for synchronization as there are syscalls which + * want to return with filesystem locked and that trips up lockdep. Also + * hibernate needs to lock filesystem but locked mutex then blocks hibernation. + * Since locking filesystem is rare operation, we use simple counter and + * waitqueue for locking. */ void journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); +wait: + /* Wait for previous locked operation to finish */ + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + spin_lock(&journal->j_state_lock); + /* + * Check reliably under the lock whether we are the ones winning the race + * and locking the journal + */ + if (journal->j_barrier_count > 0) { + spin_unlock(&journal->j_state_lock); + goto wait; + } ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -460,14 +476,6 @@ void journal_lock_updates(journal_t *journal) spin_lock(&journal->j_state_lock); } spin_unlock(&journal->j_state_lock); - - /* - * We have now established a barrier against other normal updates, but - * we also need to barrier against other journal_lock_updates() calls - * to make sure that we serialise special journal-locked operations - * too. - */ - mutex_lock(&journal->j_barrier); } /** @@ -475,48 +483,26 @@ void journal_lock_updates(journal_t *journal) * @journal: Journal to release the barrier on. * * Release a transaction barrier obtained with journal_lock_updates(). - * - * Should be called without the journal lock held. */ void journal_unlock_updates (journal_t *journal) { J_ASSERT(journal->j_barrier_count != 0); - mutex_unlock(&journal->j_barrier); spin_lock(&journal->j_state_lock); --journal->j_barrier_count; spin_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_transaction_locked); } -/* - * Report any unexpected dirty buffers which turn up. Normally those - * indicate an error, but they can occur if the user is running (say) - * tune2fs to modify the live filesystem, so we need the option of - * continuing as gracefully as possible. # - * - * The caller should already hold the journal lock and - * j_list_lock spinlock: most callers will need those anyway - * in order to probe the buffer's journaling state safely. - */ -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) +static void warn_dirty_buffer(struct buffer_head *bh) { - int jlist; - - /* If this buffer is one which might reasonably be dirty - * --- ie. data, or not part of this journal --- then - * we're OK to leave it alone, but otherwise we need to - * move the dirty bit to the journal's own internal - * JBDDirty bit. */ - jlist = jh->b_jlist; - - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - struct buffer_head *bh = jh2bh(jh); + char b[BDEVNAME_SIZE]; - if (test_clear_buffer_dirty(bh)) - set_buffer_jbddirty(bh); - } + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* @@ -546,7 +532,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, transaction = handle->h_transaction; journal = transaction->t_journal; - jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: @@ -583,14 +569,16 @@ repeat: if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + warn_dirty_buffer(bh); } /* * In any case we need to clean the dirty flag and we must * do it under the buffer lock to be sure we don't race * with running write-out. */ - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); - jbd_unexpected_dirty_buffer(jh); + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); } unlock_buffer(bh); @@ -687,7 +675,7 @@ repeat: jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); @@ -713,7 +701,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __journal_file_buffer(jh, transaction, BJ_Reserved); @@ -729,10 +716,10 @@ done: J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), "Possible IO failure.\n"); page = jh2bh(jh)->b_page; - offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; - source = kmap_atomic(page, KM_USER0); + offset = offset_in_page(jh2bh(jh)->b_data); + source = kmap_atomic(page); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); - kunmap_atomic(source, KM_USER0); + kunmap_atomic(source); } jbd_unlock_bh_state(bh); @@ -826,7 +813,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { - jh->b_transaction = transaction; + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); /* first access by this transaction */ jh->b_modified = 0; @@ -852,8 +847,8 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); journal_cancel_revoke(handle, jh); - journal_put_journal_head(jh); out: + journal_put_journal_head(jh); return err; } @@ -903,7 +898,7 @@ repeat: if (!jh->b_committed_data) { committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", + printk(KERN_ERR "%s: No memory for committed data\n", __func__); err = -ENOMEM; goto out; @@ -1077,8 +1072,9 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) ret = -EIO; goto no_journal; } - - if (jh->b_transaction != NULL) { + /* We might have slept so buffer could be refiled now */ + if (jh->b_transaction != NULL && + jh->b_transaction != handle->h_transaction) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); /* It still points to the committing @@ -1099,8 +1095,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { JBUFFER_TRACE(jh, "not on correct data list: unfile"); J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; JBUFFER_TRACE(jh, "file as data"); __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); @@ -1264,7 +1258,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) goto not_jbd; } - /* keep track of wether or not this transaction modified us */ + /* keep track of whether or not this transaction modified us */ was_modified = jh->b_modified; /* @@ -1308,8 +1302,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) __journal_file_buffer(jh, transaction, BJ_Forget); } else { __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1401,11 +1393,11 @@ int journal_stop(handle_t *handle) * by 30x or more... * * We try and optimize the sleep time against what the underlying disk - * can do, instead of having a static sleep time. This is usefull for + * can do, instead of having a static sleep time. This is useful for * the case where our storage is so fast that it is more optimal to go * ahead and force a flush and wait for the transaction to be committed * than it is to wait for an arbitrary amount of time for new writers to - * join the transaction. We acheive this by measuring how long it takes + * join the transaction. We achieve this by measuring how long it takes * to commit a transaction, and compare it with how long this * transaction has been running, and if run time < commit time then we * sleep for the delta and commit. This greatly helps super fast disks @@ -1440,8 +1432,6 @@ int journal_stop(handle_t *handle) } } - if (handle->h_sync) - transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); @@ -1630,19 +1620,32 @@ static void __journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ void __journal_unfile_buffer(struct journal_head *jh) { __journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + journal_put_journal_head(jh); } void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1669,16 +1672,12 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) /* A written-back ordered data buffer */ JBUFFER_TRACE(jh, "release data"); __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __journal_remove_checkpoint(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1686,35 +1685,6 @@ out: return; } -/* - * journal_try_to_free_buffers() could race with journal_commit_transaction() - * The latter might still hold the a count on buffers when inspecting - * them on t_syncdata_list or t_locked_list. - * - * journal_try_to_free_buffers() will call this function to - * wait for the current transaction to finish syncing data buffers, before - * tryinf to free that buffer. - * - * Called with journal->j_state_lock held. - */ -static void journal_wait_for_transaction_sync_data(journal_t *journal) -{ - transaction_t *transaction = NULL; - tid_t tid; - - spin_lock(&journal->j_state_lock); - transaction = journal->j_committing_transaction; - - if (!transaction) { - spin_unlock(&journal->j_state_lock); - return; - } - - tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); - log_wait_commit(journal, tid); -} - /** * int journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation @@ -1770,7 +1740,7 @@ int journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * journal_remove_journal_head() and journal_put_journal_head(). + * journal_put_journal_head(). */ jh = journal_grab_journal_head(bh); if (!jh) @@ -1786,25 +1756,6 @@ int journal_try_to_free_buffers(journal_t *journal, ret = try_to_free_buffers(page); - /* - * There are a number of places where journal_try_to_free_buffers() - * could race with journal_commit_transaction(), the later still - * holds the reference to the buffers to free while processing them. - * try_to_free_buffers() failed to free those buffers. Some of the - * caller of releasepage() request page buffers to be dropped, otherwise - * treat the fail-to-free as errors (such as generic_file_direct_IO()) - * - * So, if the caller of try_to_release_page() wants the synchronous - * behaviour(i.e make sure buffers are dropped upon return), - * let's wait for the current transaction to finish flush of - * dirty data buffers, then try to free those buffers again, - * with the journal locked. - */ - if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { - journal_wait_for_transaction_sync_data(journal); - ret = try_to_free_buffers(page); - } - busy: return ret; } @@ -1826,17 +1777,20 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_temp_unlink_buffer(jh); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); __journal_file_buffer(jh, transaction, BJ_Forget); - clear_buffer_jbddirty(bh); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - journal_remove_journal_head(bh); - __brelse(bh); + __journal_unfile_buffer(jh); } return may_free; } @@ -1888,15 +1842,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * We're outside-transaction here. Either or both of j_running_transaction * and j_committing_transaction may be NULL. */ -static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, + int partial_page) { transaction_t *transaction; struct journal_head *jh; int may_free = 1; - int ret; BUFFER_TRACE(bh, "entry"); +retry: /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are @@ -1914,6 +1869,29 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!jh) goto zap_buffer_no_jh; + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * block can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + * + * Also we have to clear buffer_mapped flag of a truncated buffer + * because the buffer_head may be attached to the page straddling + * i_size (can happen only when blocksize < pagesize) and thus the + * buffer_head can be reused when the file is extended again. So we end + * up keeping around invalidated buffers attached to transactions' + * BJ_Forget list just to stop checkpointing code from cleaning up + * the transaction this buffer was modified in. + */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it @@ -1939,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * committed, the buffer won't be needed any * longer. */ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_running_transaction); - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* There is no currently-running transaction. So the * orphan record which we wrote for this file must have @@ -1953,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * the committing transaction, if it exists. */ if (journal->j_committing_transaction) { JBUFFER_TRACE(jh, "give to committing trans"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_committing_transaction); - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* The orphan record's transaction has * committed. We can cleanse this buffer */ @@ -1979,16 +1949,31 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer; } /* - * If it is committing, we simply cannot touch it. We - * can remove it's next_transaction pointer from the - * running transaction if that is set, but nothing - * else. */ - set_buffer_freed(bh); - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == - journal->j_running_transaction); - jh->b_next_transaction = NULL; + * The buffer is committing, we simply cannot touch + * it. If the page is straddling i_size we have to wait + * for commit and try again. + */ + if (partial_page) { + tid_t tid = journal->j_committing_transaction->t_tid; + + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + unlock_buffer(bh); + log_wait_commit(journal, tid); + lock_buffer(bh); + goto retry; } + /* + * OK, buffer won't be reachable after truncate. We just set + * j_next_transaction to the running transaction (if there is + * one) and mark buffer as freed so that commit code knows it + * should clear dirty bits when it is done with the buffer. + */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -2007,6 +1992,14 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } zap_buffer: + /* + * This is tricky. Although the buffer is truncated, it may be reused + * if blocksize < pagesize and it is attached to the page straddling + * EOF. Since the buffer might have been added to BJ_Forget list of the + * running transaction, journal_get_write_access() won't clear + * b_modified and credit accounting gets confused. So clear b_modified + * here. */ + jh->b_modified = 0; journal_put_journal_head(jh); zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); @@ -2026,16 +2019,20 @@ zap_buffer_unlocked: * void journal_invalidatepage() - invalidate a journal page * @journal: journal to use for flush * @page: page to flush - * @offset: length of page to invalidate. + * @offset: offset of the range to invalidate + * @length: length of the range to invalidate * - * Reap page buffers containing data after offset in page. + * Reap page buffers containing data in specified range in page. */ void journal_invalidatepage(journal_t *journal, struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; unsigned int curr_off = 0; + int partial_page = (offset || length < PAGE_CACHE_SIZE); int may_free = 1; if (!PageLocked(page)) @@ -2043,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal, if (!page_has_buffers(page)) return; + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be * cautious in our locking. */ @@ -2052,10 +2051,14 @@ void journal_invalidatepage(journal_t *journal, unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; + if (next_off > stop) + return; + if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh); + may_free &= journal_unmap_buffer(journal, bh, + partial_page); unlock_buffer(bh); } curr_off = next_off; @@ -2063,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal, } while (bh != head); - if (!offset) { + if (!partial_page) { if (may_free && try_to_free_buffers(page)) J_ASSERT(!page_has_buffers(page)); } @@ -2089,12 +2092,17 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction && jh->b_jlist == jlist) return; - /* The following list of buffer states needs to be consistent - * with __jbd_unexpected_dirty_buffer()'s handling of dirty - * state. */ - if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; @@ -2102,6 +2110,8 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __journal_temp_unlink_buffer(jh); + else + journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2159,13 +2169,14 @@ void journal_file_buffer(struct journal_head *jh, * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __journal_refile_buffer(struct journal_head *jh) { - int was_dirty; + int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); @@ -2185,10 +2196,20 @@ void __journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __journal_file_buffer() must not take a + * new one. + */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; - __journal_file_buffer(jh, jh->b_transaction, - jh->b_modified ? BJ_Metadata : BJ_Reserved); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) @@ -2196,30 +2217,21 @@ void __journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __journal_refile_buffer() with necessary locking added. We take our bh + * reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. */ void journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } |
