diff options
Diffstat (limited to 'fs/jbd')
| -rw-r--r-- | fs/jbd/Kconfig | 30 | ||||
| -rw-r--r-- | fs/jbd/checkpoint.c | 589 | ||||
| -rw-r--r-- | fs/jbd/commit.c | 545 | ||||
| -rw-r--r-- | fs/jbd/journal.c | 833 | ||||
| -rw-r--r-- | fs/jbd/recovery.c | 117 | ||||
| -rw-r--r-- | fs/jbd/revoke.c | 347 | ||||
| -rw-r--r-- | fs/jbd/transaction.c | 662 |
7 files changed, 1904 insertions, 1219 deletions
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig new file mode 100644 index 00000000000..4e28beeed15 --- /dev/null +++ b/fs/jbd/Kconfig @@ -0,0 +1,30 @@ +config JBD + tristate + help + This is a generic journalling layer for block devices. It is + currently used by the ext3 file system, but it could also be + used to add journal support to other file systems or block + devices such as RAID or LVM. + + If you are using the ext3 file system, you need to say Y here. + If you are not using ext3 then you will probably want to say N. + + To compile this device as a module, choose M here: the module will be + called jbd. If you are compiling ext3 into the kernel, you + cannot compile this code as a module. + +config JBD_DEBUG + bool "JBD (ext3) debugging support" + depends on JBD && DEBUG_FS + help + If you are using the ext3 journaled file system (or potentially any + other file system/device using JBD), this option allows you to + enable debugging output while the system is running, in order to + help track down any problems you are having. By default the + debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a + number between 1 and 5, the higher the number, the more debugging + output is generated. To turn debugging off again, do + "echo 0 > /sys/kernel/debug/jbd/jbd-debug". diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 014a51fd00d..08c03044abd 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -1,6 +1,6 @@ /* - * linux/fs/checkpoint.c - * + * linux/fs/jbd/checkpoint.c + * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1999 Red Hat Software --- All Rights Reserved @@ -9,8 +9,8 @@ * the terms of the GNU General Public License, version 2, or at your * option, any later version, incorporated herein by reference. * - * Checkpoint routines for the generic filesystem journaling code. - * Part of the ext2fs journaling system. + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. * * Checkpointing is the process of ensuring that a section of the log is * committed fully to disk, so that that portion of the log can be @@ -22,31 +22,71 @@ #include <linux/jbd.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/blkdev.h> +#include <trace/events/jbd.h> /* - * Unlink a buffer from a transaction. + * Unlink a buffer from a transaction checkpoint list. * * Called with j_list_lock held. */ - -static inline void __buffer_unlink(struct journal_head *jh) +static inline void __buffer_unlink_first(struct journal_head *jh) { - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - jh->b_cp_transaction = NULL; + transaction_t *transaction = jh->b_cp_transaction; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) + if (transaction->t_checkpoint_list == jh) { transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; } /* * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. + * * Requires j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ @@ -55,14 +95,18 @@ static int __try_to_free_cp_buf(struct journal_head *jh) int ret = 0; struct buffer_head *bh = jh2bh(jh); - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); - __journal_remove_checkpoint(jh); + ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); - ret = 1; } else { jbd_unlock_bh_state(bh); } @@ -77,7 +121,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh) */ void __log_wait_for_space(journal_t *journal) { - int nblocks; + int nblocks, space_left; assert_spin_locked(&journal->j_state_lock); nblocks = jbd_space_needed(journal); @@ -85,20 +129,52 @@ void __log_wait_for_space(journal_t *journal) if (journal->j_flags & JFS_ABORT) return; spin_unlock(&journal->j_state_lock); - down(&journal->j_checkpoint_sem); + mutex_lock(&journal->j_checkpoint_mutex); /* * Test again, another process may have checkpointed while we - * were waiting for the checkpoint lock + * were waiting for the checkpoint lock. If there are no + * transactions ready to be checkpointed, try to recover + * journal space by calling cleanup_journal_tail(), and if + * that doesn't work, by waiting for the currently committing + * transaction to complete. If there is absolutely no way + * to make progress, this is either a BUG or corrupted + * filesystem, so abort the journal and leave a stack + * trace for forensic evidence. */ spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); - if (__log_space_left(journal) < nblocks) { + space_left = __log_space_left(journal); + if (space_left < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + tid_t tid = 0; + + if (journal->j_committing_transaction) + tid = journal->j_committing_transaction->t_tid; + spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); - log_do_checkpoint(journal); + if (chkpt) { + log_do_checkpoint(journal); + } else if (cleanup_journal_tail(journal) == 0) { + /* We were able to recover space; yay! */ + ; + } else if (tid) { + log_wait_commit(journal, tid); + } else { + printk(KERN_ERR "%s: needed %d blocks and " + "only had %d space available\n", + __func__, nblocks, space_left); + printk(KERN_ERR "%s: no way to get more " + "journal space\n", __func__); + WARN_ON(1); + journal_abort(journal, 0); + } spin_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); } - up(&journal->j_checkpoint_sem); + mutex_unlock(&journal->j_checkpoint_mutex); } } @@ -108,6 +184,7 @@ void __log_wait_for_space(journal_t *journal) * jbd_unlock_bh_state(). */ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) + __releases(journal->j_list_lock) { get_bh(bh); spin_unlock(&journal->j_list_lock); @@ -117,83 +194,62 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) } /* - * Clean up a transaction's checkpoint list. + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. Note that we take the buffers in the opposite ordering + * from the one in which they were submitted for IO. * - * We wait for any pending IO to complete and make sure any clean - * buffers are removed from the transaction. - * - * Return 1 if we performed any actions which might have destroyed the - * checkpoint. (journal_remove_checkpoint() deletes the transaction when - * the last checkpoint buffer is cleansed) + * Return 0 on success, and return <0 if some buffers have failed + * to be written out. * * Called with j_list_lock held. */ -static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) +static int __wait_cp_io(journal_t *journal, transaction_t *transaction) { - struct journal_head *jh, *next_jh, *last_jh; + struct journal_head *jh; struct buffer_head *bh; + tid_t this_tid; + int released = 0; int ret = 0; - assert_spin_locked(&journal->j_list_lock); - jh = transaction->t_checkpoint_list; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - next_jh = jh; - do { - jh = next_jh; + this_tid = transaction->t_tid; +restart: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return ret; + while (!released && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); + goto restart; + } + get_bh(bh); if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); - goto out_return_1; - } - - /* - * This is foul - */ - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - goto out_return_1; - } - - if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; - - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - goto out_return_1; + spin_lock(&journal->j_list_lock); + goto restart; } + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; /* - * AKPM: I think the buffer_jbddirty test is redundant - it - * shouldn't have NULL b_transaction? + * Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list */ - next_jh = jh->b_cpnext; - if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - __brelse(bh); - ret = 1; - } else { - jbd_unlock_bh_state(bh); - } - } while (jh != last_jh); + released = __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + __brelse(bh); + } return ret; -out_return_1: - spin_lock(&journal->j_list_lock); - return 1; } #define NR_BATCH 64 @@ -202,10 +258,13 @@ static void __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + struct blk_plug plug; + + blk_start_plug(&plug); + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(bhs[i], WRITE_SYNC); + blk_finish_plug(&plug); - spin_unlock(&journal->j_list_lock); - ll_rw_block(SWRITE, *batch_count, bhs); - spin_lock(&journal->j_list_lock); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); @@ -219,160 +278,171 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Try to flush one buffer from the checkpoint list to disk. * * Return 1 if something happened which requires us to abort the current - * scan of the checkpoint list. + * scan of the checkpoint list. Return <0 if the buffer has failed to + * be written out. * - * Called with j_list_lock held. + * Called with j_list_lock held and drops it if 1 is returned * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ -static int __flush_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count, - int *drop_count) +static int __process_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count) { struct buffer_head *bh = jh2bh(jh); int ret = 0; - if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { - J_ASSERT_JH(jh, jh->b_transaction == NULL); + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + ret = 1; + } else if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + ret = 1; + } else if (!buffer_dirty(bh)) { + ret = 1; + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + get_bh(bh); + J_ASSERT_JH(jh, !buffer_jbddirty(bh)); + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + __brelse(bh); + } else { /* * Important: we are about to write the buffer, and * possibly block, while still holding the journal lock. * We cannot afford to let the transaction logic start * messing around with this buffer before we write it to - * disk, as that would break recoverability. + * disk, as that would break recoverability. */ BUFFER_TRACE(bh, "queue"); get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); bhs[*batch_count] = bh; + __buffer_relink_io(jh); jbd_unlock_bh_state(bh); (*batch_count)++; if (*batch_count == NR_BATCH) { + spin_unlock(&journal->j_list_lock); __flush_batch(journal, bhs, batch_count); ret = 1; } - } else { - int last_buffer = 0; - if (jh->b_cpnext == jh) { - /* We may be about to drop the transaction. Tell the - * caller that the lists have changed. - */ - last_buffer = 1; - } - if (__try_to_free_cp_buf(jh)) { - (*drop_count)++; - ret = last_buffer; - } } return ret; } /* - * Perform an actual checkpoint. We don't write out only enough to - * satisfy the current blocked requests: rather we submit a reasonably - * sized chunk of the outstanding data to disk at once for - * efficiency. __log_wait_for_space() will retry if we didn't free enough. - * - * However, we _do_ take into account the amount requested so that once - * the IO has been queued, we can return as soon as enough of it has - * completed to disk. + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. * * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. */ int log_do_checkpoint(journal_t *journal) { + transaction_t *transaction; + tid_t this_tid; int result; - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; jbd_debug(1, "Start checkpoint\n"); - /* + /* * First thing: if there are any transactions in the log which * don't need checkpointing, just eliminate them from the - * journal straight away. + * journal straight away. */ result = cleanup_journal_tail(journal); + trace_jbd_checkpoint(journal, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; /* - * OK, we need to start writing disk blocks. Try to free up a - * quarter of the log in a single checkpoint if we can. + * OK, we need to start writing disk blocks. Take one transaction + * and write it. */ + result = 0; + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; +restart: /* - * AKPM: check this code. I had a feeling a while back that it - * degenerates into a busy loop at unmount time. + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). */ - spin_lock(&journal->j_list_lock); - while (journal->j_checkpoint_transactions) { - transaction_t *transaction; - struct journal_head *jh, *last_jh, *next_jh; - int drop_count = 0; - int cleanup_ret, retry = 0; - tid_t this_tid; - - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; - jh = transaction->t_checkpoint_list; - last_jh = jh->b_cpprev; - next_jh = jh; - do { + if (journal->j_checkpoint_transactions == transaction && + transaction->t_tid == this_tid) { + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; + struct journal_head *jh; + int retry = 0, err; + + while (!retry && transaction->t_checkpoint_list) { struct buffer_head *bh; - jh = next_jh; - next_jh = jh->b_cpnext; + jh = transaction->t_checkpoint_list; bh = jh2bh(jh); if (!jbd_trylock_bh_state(bh)) { jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); retry = 1; break; } - retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); - if (cond_resched_lock(&journal->j_list_lock)) { + retry = __process_buffer(journal, jh, bhs,&batch_count); + if (retry < 0 && !result) + result = retry; + if (!retry && (need_resched() || + spin_needbreak(&journal->j_list_lock))) { + spin_unlock(&journal->j_list_lock); retry = 1; break; } - } while (jh != last_jh && !retry); + } if (batch_count) { + if (!retry) { + spin_unlock(&journal->j_list_lock); + retry = 1; + } __flush_batch(journal, bhs, &batch_count); - retry = 1; } + if (retry) { + spin_lock(&journal->j_list_lock); + goto restart; + } /* - * If someone cleaned up this transaction while we slept, we're - * done - */ - if (journal->j_checkpoint_transactions != transaction) - break; - if (retry) - continue; - /* - * Maybe it's a new transaction, but it fell at the same - * address - */ - if (transaction->t_tid != this_tid) - continue; - /* - * We have walked the whole transaction list without - * finding anything to write to disk. We had better be - * able to make some progress or we are in trouble. + * Now we have cleaned up the first transaction's checkpoint + * list. Let's clean up the second one */ - cleanup_ret = __cleanup_transaction(journal, transaction); - J_ASSERT(drop_count != 0 || cleanup_ret != 0); - if (journal->j_checkpoint_transactions != transaction) - break; + err = __wait_cp_io(journal, transaction); + if (!result) + result = err; } +out: spin_unlock(&journal->j_list_lock); - result = cleanup_journal_tail(journal); if (result < 0) - return result; + journal_abort(journal, result); + else + result = cleanup_journal_tail(journal); - return 0; + return (result < 0) ? result : 0; } /* @@ -380,31 +450,34 @@ int log_do_checkpoint(journal_t *journal) * we have already got rid of any since the last update of the log tail * in the journal superblock. If so, we can instantly roll the * superblock forward to remove those transactions from the log. - * + * * Return <0 on error, 0 on success, 1 if there was nothing to clean up. - * - * Called with the journal lock held. * * This is the only part of the journaling code which really needs to be * aware of transaction aborts. Checkpointing involves writing to the * main filesystem area rather than to the journal, so it can proceed - * even in abort state, but we must not update the journal superblock if - * we have an abort error outstanding. + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. */ int cleanup_journal_tail(journal_t *journal) { transaction_t * transaction; tid_t first_tid; - unsigned long blocknr, freed; + unsigned int blocknr, freed; + + if (is_journal_aborted(journal)) + return 1; - /* OK, work out the oldest transaction remaining in the log, and - * the log block it starts at. - * + /* + * OK, work out the oldest transaction remaining in the log, and + * the log block it starts at. + * * If the log is now empty, we need to work out which is the * next transaction ID we will write, and where it will - * start. */ - + * start. + */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; @@ -430,7 +503,24 @@ int cleanup_journal_tail(journal_t *journal) spin_unlock(&journal->j_state_lock); return 1; } + spin_unlock(&journal->j_state_lock); + /* + * We need to make sure that any blocks that were recently written out + * --- perhaps by log_do_checkpoint() --- are flushed out before we + * drop the transactions from the journal. Similarly we need to be sure + * superblock makes it to disk before next transaction starts reusing + * freed space (otherwise we could replay some blocks of the new + * transaction thinking they belong to the old one). So we use + * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially + * with an appropriately sized journal, but we need this to guarantee + * correctness. Fortunately cleanup_journal_tail() doesn't get called + * all that often. + */ + journal_update_sb_log_tail(journal, first_tid, blocknr, + WRITE_FLUSH_FUA); + + spin_lock(&journal->j_state_lock); /* OK, update the superblock to recover the freed space. * Physical blocks come first: have we wrapped beyond the end of * the log? */ @@ -438,17 +528,16 @@ int cleanup_journal_tail(journal_t *journal) if (blocknr < journal->j_tail) freed = freed + journal->j_last - journal->j_first; + trace_jbd_cleanup_journal_tail(journal, first_tid, blocknr, freed); jbd_debug(1, - "Cleaning journal tail from %d to %d (offset %lu), " - "freeing %lu\n", + "Cleaning journal tail from %d to %d (offset %u), " + "freeing %u\n", journal->j_tail_sequence, first_tid, blocknr, freed); journal->j_free += freed; journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); - if (!(journal->j_flags & JFS_ABORT)) - journal_update_superblock(journal, 1); return 0; } @@ -456,58 +545,104 @@ int cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ /* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and release + * them. + * + * Called with j_list_lock held. + * Returns number of buffers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret, freed = 0; + + *released = 0; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) { + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; + } + } + } + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + +/* * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. * * Called with the journal locked. * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) + * Returns number of buffers reaped (for debug) */ int __journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; int ret = 0; + int released; transaction = journal->j_checkpoint_transactions; - if (transaction == 0) + if (!transaction) goto out; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { - struct journal_head *jh; - transaction = next_transaction; next_transaction = transaction->t_cpnext; - jh = transaction->t_checkpoint_list; - if (jh) { - struct journal_head *last_jh = jh->b_cpprev; - struct journal_head *next_jh = jh; - - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranknig */ - if (jbd_trylock_bh_state(jh2bh(jh))) - ret += __try_to_free_cp_buf(jh); - /* - * This function only frees up some memory - * if possible so we dont have an obligation - * to finish processing. Bail out if preemption - * requested: - */ - if (need_resched()) - goto out; - } while (jh != last_jh); - } + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list, &released); + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + goto out; + if (released) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list, &released); + if (need_resched()) + goto out; } while (transaction != last_transaction); out: return ret; } -/* +/* * journal_remove_checkpoint: called after a buffer has been committed * to disk (either by being write-back flushed to disk, or being * committed to the log). @@ -516,18 +651,22 @@ out: * buffer updates committed in that transaction have safely been stored * elsewhere on disk. To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint - * list until they have been rewritten, at which point this function is + * lists until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's - * checkpoint list. + * checkpoint lists. + * + * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. * - * This function is called with the journal locked. * This function is called with j_list_lock held. + * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ -void __journal_remove_checkpoint(struct journal_head *jh) +int __journal_remove_checkpoint(struct journal_head *jh) { transaction_t *transaction; journal_t *journal; + int ret = 0; JBUFFER_TRACE(jh, "entry"); @@ -537,25 +676,26 @@ void __journal_remove_checkpoint(struct journal_head *jh) } journal = transaction->t_journal; + JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); + jh->b_cp_transaction = NULL; + journal_put_journal_head(jh); - if (transaction->t_checkpoint_list != NULL) + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) goto out; - JBUFFER_TRACE(jh, "transaction has no more buffers"); /* * There is one special case to worry about: if we have just pulled the - * buffer off a committing transaction's forget list, then even if the - * checkpoint list is empty, the transaction obviously cannot be - * dropped! + * buffer off a running or committing transaction's checkpoing list, + * then even if the checkpoint list is empty, the transaction obviously + * cannot be dropped! * - * The locking here around j_committing_transaction is a bit sleazy. + * The locking here around t_state is a bit sleazy. * See the comment at the end of journal_commit_transaction(). */ - if (transaction == journal->j_committing_transaction) { - JBUFFER_TRACE(jh, "belongs to committing transaction"); + if (transaction->t_state != T_FINISHED) goto out; - } /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ @@ -565,8 +705,9 @@ void __journal_remove_checkpoint(struct journal_head *jh) /* Just in case anybody was waiting for more transactions to be checkpointed... */ wake_up(&journal->j_wait_logspace); + ret = 1; out: - JBUFFER_TRACE(jh, "exit"); + return ret; } /* @@ -577,13 +718,15 @@ out: * Called with the journal locked. * Called with j_list_lock held. */ -void __journal_insert_checkpoint(struct journal_head *jh, +void __journal_insert_checkpoint(struct journal_head *jh, transaction_t *transaction) { JBUFFER_TRACE(jh, "entry"); J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + /* Get reference for checkpointing transaction */ + journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { @@ -599,7 +742,7 @@ void __journal_insert_checkpoint(struct journal_head *jh, /* * We've finished with this transaction structure: adios... - * + * * The transaction must have no links except for the checkpoint by this * point. * @@ -628,10 +771,12 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(transaction->t_updates == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); + trace_jbd_drop_transaction(journal, transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); kfree(transaction); } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 2a3e310f79e..bb217dcb41a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -1,5 +1,5 @@ /* - * linux/fs/commit.c + * linux/fs/jbd/commit.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 * @@ -17,10 +17,11 @@ #include <linux/fs.h> #include <linux/jbd.h> #include <linux/errno.h> -#include <linux/slab.h> #include <linux/mm.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <trace/events/jbd.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -37,7 +38,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) /* * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * not successfully freed, because they are attached to a committing transaction. * After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -46,8 +47,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) * So here, we have a buffer which has just come off the forget list. Look to * see if we can strip all buffers from the backing page. * - * Called under lock_journal(), and possibly under journal_datalist_lock. The - * caller provided us with a ref against the buffer, and we drop that here. + * Called under journal->j_list_lock. The caller provided us with a ref + * against the buffer, and we drop that here. */ static void release_buffer_page(struct buffer_head *bh) { @@ -64,7 +65,7 @@ static void release_buffer_page(struct buffer_head *bh) goto nope; /* OK, it's a truncated page */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto nope; page_cache_get(page); @@ -79,6 +80,24 @@ nope: } /* + * Decrement reference counter for data buffer. If it has been marked + * 'BH_Freed', release it and the page to which it belongs if possible. + */ +static void release_data_buffer(struct buffer_head *bh) +{ + if (buffer_freed(bh)) { + WARN_ON_ONCE(buffer_dirty(bh)); + clear_buffer_freed(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + release_buffer_page(bh); + } else + put_bh(bh); +} + +/* * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is * held. For ranking reasons we must trylock. If we lose, schedule away and * return 0. j_list_lock is dropped in this case. @@ -105,8 +124,8 @@ static int journal_write_commit_record(journal_t *journal, { struct journal_head *descriptor; struct buffer_head *bh; - int i, ret; - int barrier_done = 0; + journal_header_t *header; + int ret; if (is_journal_aborted(journal)) return 0; @@ -117,49 +136,157 @@ static int journal_write_commit_record(journal_t *journal, bh = jh2bh(descriptor); - /* AKPM: buglet - add `i' to tmp! */ - for (i = 0; i < bh->b_size; i += 512) { - journal_header_t *tmp = (journal_header_t*)bh->b_data; - tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); - tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); - tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); - } + header = (journal_header_t *)(bh->b_data); + header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); + header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK); + header->h_sequence = cpu_to_be32(commit_transaction->t_tid); JBUFFER_TRACE(descriptor, "write commit block"); set_buffer_dirty(bh); - if (journal->j_flags & JFS_BARRIER) { - set_buffer_ordered(bh); - barrier_done = 1; - } - ret = sync_dirty_buffer(bh); - /* is it possible for another commit to fail at roughly - * the same time as this one? If so, we don't want to - * trust the barrier flag in the super, but instead want - * to remember if we sent a barrier request - */ - if (ret == -EOPNOTSUPP && barrier_done) { - char b[BDEVNAME_SIZE]; - - printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", - bdevname(journal->j_dev, b)); - spin_lock(&journal->j_state_lock); - journal->j_flags &= ~JFS_BARRIER; - spin_unlock(&journal->j_state_lock); - /* And try again, without the barrier */ - clear_buffer_ordered(bh); - set_buffer_uptodate(bh); - set_buffer_dirty(bh); + if (journal->j_flags & JFS_BARRIER) + ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA); + else ret = sync_dirty_buffer(bh); - } + put_bh(bh); /* One for getblk() */ journal_put_journal_head(descriptor); return (ret == -EIO); } +static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, + int write_op) +{ + int i; + + for (i = 0; i < bufs; i++) { + wbuf[i]->b_end_io = end_buffer_write_sync; + /* + * Here we write back pagecache data that may be mmaped. Since + * we cannot afford to clean the page and set PageWriteback + * here due to lock ordering (page lock ranks above transaction + * start), the data can change while IO is in flight. Tell the + * block layer it should bounce the bio pages if stable data + * during write is required. + * + * We use up our safety reference in submit_bh(). + */ + _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE); + } +} + +/* + * Submit all the data buffers to disk + */ +static int journal_submit_data_buffers(journal_t *journal, + transaction_t *commit_transaction, + int write_op) +{ + struct journal_head *jh; + struct buffer_head *bh; + int locked; + int bufs = 0; + struct buffer_head **wbuf = journal->j_wbuf; + int err = 0; + + /* + * Whenever we unlock the journal and sleep, things can get added + * onto ->t_sync_datalist, so we have to keep looping back to + * write_out_data until we *know* that the list is empty. + * + * Cleanup any flushed data buffers from the data list. Even in + * abort mode, we want to flush this out as soon as possible. + */ +write_out_data: + cond_resched(); + spin_lock(&journal->j_list_lock); + + while (commit_transaction->t_sync_datalist) { + jh = commit_transaction->t_sync_datalist; + bh = jh2bh(jh); + locked = 0; + + /* Get reference just to make sure buffer does not disappear + * when we are forced to drop various locks */ + get_bh(bh); + /* If the buffer is dirty, we need to submit IO and hence + * we need the buffer lock. We try to lock the buffer without + * blocking. If we fail, we need to drop j_list_lock and do + * blocking lock_buffer(). + */ + if (buffer_dirty(bh)) { + if (!trylock_buffer(bh)) { + BUFFER_TRACE(bh, "needs blocking lock"); + spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); + /* Write out all data to prevent deadlocks */ + journal_do_submit_data(wbuf, bufs, write_op); + bufs = 0; + lock_buffer(bh); + spin_lock(&journal->j_list_lock); + } + locked = 1; + } + /* We have to get bh_state lock. Again out of order, sigh. */ + if (!inverted_lock(journal, bh)) { + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + } + /* Someone already cleaned up the buffer? */ + if (!buffer_jbd(bh) || bh2jh(bh) != jh + || jh->b_transaction != commit_transaction + || jh->b_jlist != BJ_SyncData) { + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + BUFFER_TRACE(bh, "already cleaned up"); + release_data_buffer(bh); + continue; + } + if (locked && test_clear_buffer_dirty(bh)) { + BUFFER_TRACE(bh, "needs writeout, adding to array"); + wbuf[bufs++] = bh; + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + if (bufs == journal->j_wbufsize) { + spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, + commit_transaction); + journal_do_submit_data(wbuf, bufs, write_op); + bufs = 0; + goto write_out_data; + } + } else if (!locked && buffer_locked(bh)) { + __journal_file_buffer(jh, commit_transaction, + BJ_Locked); + jbd_unlock_bh_state(bh); + put_bh(bh); + } else { + BUFFER_TRACE(bh, "writeout complete: unfile"); + if (unlikely(!buffer_uptodate(bh))) + err = -EIO; + __journal_unfile_buffer(jh); + jbd_unlock_bh_state(bh); + if (locked) + unlock_buffer(bh); + release_data_buffer(bh); + } + + if (need_resched() || spin_needbreak(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); + goto write_out_data; + } + } + spin_unlock(&journal->j_list_lock); + trace_jbd_do_submit_data(journal, commit_transaction); + journal_do_submit_data(wbuf, bufs, write_op); + + return err; +} + /* * journal_commit_transaction * @@ -174,7 +301,9 @@ void journal_commit_transaction(journal_t *journal) int bufs; int flags; int err; - unsigned long blocknr; + unsigned int blocknr; + ktime_t start_time; + u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; @@ -182,22 +311,27 @@ void journal_commit_transaction(journal_t *journal) int first_tag = 0; int tag_flag; int i; + struct blk_plug plug; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for * all outstanding updates to complete. */ -#ifdef COMMIT_STATS - spin_lock(&journal->j_list_lock); - summarise_journal_usage(journal); - spin_unlock(&journal->j_list_lock); -#endif - /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); - journal_update_superblock(journal, 1); + mutex_lock(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. + * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + journal_update_sb_log_tail(journal, journal->j_tail_sequence, + journal->j_tail, WRITE_SYNC); + mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); } @@ -206,14 +340,16 @@ void journal_commit_transaction(journal_t *journal) J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; - J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_RUNNING); commit_transaction->t_state = T_LOCKED; + trace_jbd_commit_locking(journal, commit_transaction); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); @@ -248,7 +384,7 @@ void journal_commit_transaction(journal_t *journal) * we do not require it to remember exactly which old buffers it * has reserved. This is consistent with the existing behaviour * that multiple journal_get_write_access() calls to the same - * buffer are perfectly permissable. + * buffer are perfectly permissible. */ while (commit_transaction->t_reserved_list) { jh = commit_transaction->t_reserved_list; @@ -261,10 +397,8 @@ void journal_commit_transaction(journal_t *journal) struct buffer_head *bh = jh2bh(jh); jbd_lock_bh_state(bh); - if (jh->b_committed_data) { - kfree(jh->b_committed_data); - jh->b_committed_data = NULL; - } + jbd_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; jbd_unlock_bh_state(bh); } journal_refile_buffer(journal, jh); @@ -282,113 +416,43 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 1\n"); /* + * Clear revoked flag to reflect there is no revoked buffers + * in the next transaction which is going to be started. + */ + journal_clear_buffer_revoked_flags(journal); + + /* * Switch to a new revoke table. */ journal_switch_revoke_table(journal); + trace_jbd_commit_flushing(journal, commit_transaction); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; + start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); - /* - * First, drop modified flag: all accesses to the buffers - * will be tracked for a new trasaction only -bzzz - */ - spin_lock(&journal->j_list_lock); - if (commit_transaction->t_buffers) { - new_jh = jh = commit_transaction->t_buffers->b_tnext; - do { - J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || - new_jh->b_modified == 0); - new_jh->b_modified = 0; - new_jh = new_jh->b_tnext; - } while (new_jh != jh); - } - spin_unlock(&journal->j_list_lock); + if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid)) + write_op = WRITE_SYNC; /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - - err = 0; - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - */ - bufs = 0; - /* - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); - spin_lock(&journal->j_list_lock); - - while (commit_transaction->t_sync_datalist) { - struct buffer_head *bh; - - jh = commit_transaction->t_sync_datalist; - commit_transaction->t_sync_datalist = jh->b_tnext; - bh = jh2bh(jh); - if (buffer_locked(bh)) { - BUFFER_TRACE(bh, "locked"); - if (!inverted_lock(journal, bh)) - goto write_out_data; - __journal_temp_unlink_buffer(jh); - __journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } else { - if (buffer_dirty(bh)) { - BUFFER_TRACE(bh, "start journal writeout"); - get_bh(bh); - wbuf[bufs++] = bh; - if (bufs == journal->j_wbufsize) { - jbd_debug(2, "submit %d writes\n", - bufs); - spin_unlock(&journal->j_list_lock); - ll_rw_block(SWRITE, bufs, wbuf); - journal_brelse_array(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - if (!inverted_lock(journal, bh)) - goto write_out_data; - __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - if (lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } - } - } - - if (bufs) { - spin_unlock(&journal->j_list_lock); - ll_rw_block(SWRITE, bufs, wbuf); - journal_brelse_array(wbuf, bufs); - spin_lock(&journal->j_list_lock); - } + blk_start_plug(&plug); + err = journal_submit_data_buffers(journal, commit_transaction, + write_op); + blk_finish_plug(&plug); /* * Wait for all previously submitted IO to complete. */ + spin_lock(&journal->j_list_lock); while (commit_transaction->t_locked_list) { struct buffer_head *bh; @@ -398,34 +462,50 @@ write_out_data: if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - err = -EIO; spin_lock(&journal->j_list_lock); } + if (unlikely(!buffer_uptodate(bh))) { + if (!trylock_page(bh->b_page)) { + spin_unlock(&journal->j_list_lock); + lock_page(bh->b_page); + spin_lock(&journal->j_list_lock); + } + if (bh->b_page->mapping) + set_bit(AS_EIO, &bh->b_page->mapping->flags); + + unlock_page(bh->b_page); + SetPageError(bh->b_page); + err = -EIO; + } if (!inverted_lock(journal, bh)) { put_bh(bh); spin_lock(&journal->j_list_lock); continue; } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { + if (buffer_jbd(bh) && bh2jh(bh) == jh && + jh->b_transaction == commit_transaction && + jh->b_jlist == BJ_Locked) __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } - put_bh(bh); + jbd_unlock_bh_state(bh); + release_data_buffer(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); - if (err) - __journal_abort_hard(journal); + if (err) { + char b[BDEVNAME_SIZE]; + + printk(KERN_WARNING + "JBD: Detected IO errors while flushing file data " + "on %s\n", bdevname(journal->j_fs_dev, b)); + if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR) + journal_abort(journal, err); + err = 0; + } - journal_write_revoke_records(journal, commit_transaction); + blk_start_plug(&plug); - jbd_debug(3, "JBD: commit phase 2\n"); + journal_write_revoke_records(journal, commit_transaction, write_op); /* * If we found any dirty or locked buffers, then we should have @@ -442,7 +522,13 @@ write_out_data: * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ + spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; + spin_unlock(&journal->j_state_lock); + + trace_jbd_commit_logging(journal, commit_transaction); + J_ASSERT(commit_transaction->t_nr_buffers <= + commit_transaction->t_outstanding_credits); descriptor = NULL; bufs = 0; @@ -453,9 +539,10 @@ write_out_data: jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and - release it for background writing. */ + release it. */ if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up @@ -479,7 +566,7 @@ write_out_data: descriptor = journal_get_descriptor_buffer(journal); if (!descriptor) { - __journal_abort_hard(journal); + journal_abort(journal, -EIO); continue; } @@ -512,7 +599,7 @@ write_out_data: and repeat this loop: we'll fall into the refile-on-abort condition above. */ if (err) { - __journal_abort_hard(journal); + journal_abort(journal, err); continue; } @@ -526,13 +613,13 @@ write_out_data: /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get rid of the BJ_IO/BJ_Shadow pairing of buffers. */ - atomic_inc(&jh2bh(jh)->b_count); + get_bh(jh2bh(jh)); /* Make a temporary IO buffer with which to write it out (this will requeue both the metadata buffer and the temporary IO buffer). new_bh goes on BJ_IO*/ - set_bit(BH_JWrite, &jh2bh(jh)->b_state); + set_buffer_jwrite(jh2bh(jh)); /* * akpm: journal_write_metadata_buffer() sets * new_bh->b_transaction to commit_transaction. @@ -542,7 +629,7 @@ write_out_data: JBUFFER_TRACE(jh, "ph3: write metadata"); flags = journal_write_metadata_buffer(commit_transaction, jh, &new_jh, blocknr); - set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); + set_buffer_jwrite(jh2bh(new_jh)); wbuf[bufs++] = jh2bh(new_jh); /* Record the new block's tag in the current descriptor @@ -589,7 +676,17 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE, bh); + /* + * In data=journal mode, here we can end up + * writing pagecache data that might be + * mmapped. Since we can't afford to clean the + * page and set PageWriteback (see the comment + * near the other use of _submit_bh()), the + * data can change while the write is in + * flight. Tell the block layer to bounce the + * bio pages if stable pages are required. + */ + _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE); } cond_resched(); @@ -600,6 +697,8 @@ start_journal_io: } } + blk_finish_plug(&plug); + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. Control buffers being written are on the @@ -652,7 +751,7 @@ wait_for_iobuf: shadowed buffer */ jh = commit_transaction->t_shadow_list->b_tprev; bh = jh2bh(jh); - clear_bit(BH_JWrite, &bh->b_state); + clear_buffer_jwrite(bh); J_ASSERT_BH(bh, buffer_jbddirty(bh)); /* The metadata is now released for reuse, but we need @@ -661,8 +760,13 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* Wake up any transactions which were waiting for this - IO to complete */ + /* + * Wake up any transactions which were waiting for this + * IO to complete. The barrier must be here so that changes + * by journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); @@ -697,13 +801,22 @@ wait_for_iobuf: /* AKPM: bforget here */ } + if (err) + journal_abort(journal, err); + jbd_debug(3, "JBD: commit phase 6\n"); + /* All metadata is written, now write commit record and do cleanup */ + spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_COMMIT_RECORD; + spin_unlock(&journal->j_state_lock); + if (journal_write_commit_record(journal, commit_transaction)) err = -EIO; if (err) - __journal_abort_hard(journal); + journal_abort(journal, err); /* End of a transaction! Finally, we can do checkpoint processing: any buffers committed as a result of this @@ -728,10 +841,16 @@ restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); @@ -747,14 +866,14 @@ restart_loop: * Otherwise, we can just throw away the frozen data now. */ if (jh->b_committed_data) { - kfree(jh->b_committed_data); + jbd_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; jh->b_frozen_data = NULL; } } else if (jh->b_frozen_data) { - kfree(jh->b_frozen_data); + jbd_free(jh->b_frozen_data, bh->b_size); jh->b_frozen_data = NULL; } @@ -771,41 +890,71 @@ restart_loop: * there's no point in keeping a checkpoint record for * it. */ - /* A buffer which has been freed while still being - * journaled by a previous transaction may end up still - * being dirty here, but we want to avoid writing back - * that buffer in the future now that the last use has - * been committed. That's not only a performance gain, - * it also stops aliasing problems if the buffer is left - * behind for writeback and gets reallocated for another - * use in a different page. */ + /* + * A buffer which has been freed while still being journaled by + * a previous transaction. + */ if (buffer_freed(bh)) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); + /* + * If the running transaction is the one containing + * "add to orphan" operation (b_next_transaction != + * NULL), we have to wait for that transaction to + * commit before we can really get rid of the buffer. + * So just clear b_modified to not confuse transaction + * credit accounting and refile the buffer to + * BJ_Forget of the running transaction. If the just + * committed transaction contains "add to orphan" + * operation, we can completely invalidate the buffer + * now. We are rather throughout in that since the + * buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial + * page. + */ + jh->b_modified = 0; + if (!jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } } if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __journal_insert_checkpoint(jh, commit_transaction); - JBUFFER_TRACE(jh, "refile for checkpoint writeback"); - __journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - __journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); /* needs a brelse */ - release_buffer_page(bh); + /* + * The buffer on BJ_Forget list and not jbddirty means + * it has been freed by this transaction and hence it + * could not have been reallocated until this + * transaction has committed. *BUT* it could be + * reallocated once we have written all the data to + * disk and before we process the buffer on BJ_Forget + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; } + JBUFFER_TRACE(jh, "refile or unfile freed buffer"); + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); + else + __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); /* - * This is a bit sleazy. We borrow j_list_lock to protect - * journal->j_committing_transaction in __journal_remove_checkpoint. - * Really, __journal_remove_checkpoint should be using j_state_lock but - * it's a bit hassle to hold that across __journal_remove_checkpoint + * This is a bit sleazy. We use j_list_lock to protect transition + * of a transaction into T_FINISHED state and calling + * __journal_drop_transaction(). Otherwise we could race with + * other checkpointing code processing the transaction... */ spin_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); @@ -823,15 +972,28 @@ restart_loop: jbd_debug(3, "JBD: commit phase 8\n"); - J_ASSERT(commit_transaction->t_state == T_COMMIT); + J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD); commit_transaction->t_state = T_FINISHED; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time*3 + + journal->j_average_commit_time) / 4; + else + journal->j_average_commit_time = commit_time; + spin_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { __journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { @@ -851,6 +1013,7 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + trace_jbd_end_commit(journal, commit_transaction); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index e4b516ac498..06fe11e0abf 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1,5 +1,5 @@ /* - * linux/fs/journal.c + * linux/fs/jbd/journal.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 * @@ -28,14 +28,21 @@ #include <linux/jbd.h> #include <linux/errno.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/mm.h> -#include <linux/suspend.h> +#include <linux/freezer.h> #include <linux/pagemap.h> +#include <linux/kthread.h> +#include <linux/poison.h> +#include <linux/proc_fs.h> +#include <linux/debugfs.h> +#include <linux/ratelimit.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/jbd.h> + #include <asm/uaccess.h> #include <asm/page.h> -#include <linux/proc_fs.h> EXPORT_SYMBOL(journal_start); EXPORT_SYMBOL(journal_restart); @@ -65,12 +72,12 @@ EXPORT_SYMBOL(journal_set_features); EXPORT_SYMBOL(journal_create); EXPORT_SYMBOL(journal_load); EXPORT_SYMBOL(journal_destroy); -EXPORT_SYMBOL(journal_update_superblock); EXPORT_SYMBOL(journal_abort); EXPORT_SYMBOL(journal_errno); EXPORT_SYMBOL(journal_ack_err); EXPORT_SYMBOL(journal_clear_err); EXPORT_SYMBOL(log_wait_commit); +EXPORT_SYMBOL(log_start_commit); EXPORT_SYMBOL(journal_start_commit); EXPORT_SYMBOL(journal_force_commit_nested); EXPORT_SYMBOL(journal_wipe); @@ -81,6 +88,25 @@ EXPORT_SYMBOL(journal_force_commit); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); +static const char *journal_dev_name(journal_t *journal, char *buffer); + +#ifdef CONFIG_JBD_DEBUG +void __jbd_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (level > journal_enable_debug) + return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + va_end(args); +} +EXPORT_SYMBOL(__jbd_debug); +#endif /* * Helper function used to manage commit timeouts @@ -111,18 +137,17 @@ static void commit_timeout(unsigned long __data) static int kjournald(void *arg) { - journal_t *journal = (journal_t *) arg; + journal_t *journal = arg; transaction_t *transaction; - struct timer_list timer; - daemonize("kjournald"); + /* + * Set up an interval timer which can be used to trigger a commit wakeup + * after the commit interval expires + */ + setup_timer(&journal->j_commit_timer, commit_timeout, + (unsigned long)current); - /* Set up an interval timer which can be used to trigger a - commit wakeup after the commit interval expires */ - init_timer(&timer); - timer.data = (unsigned long) current; - timer.function = commit_timeout; - journal->j_commit_timer = &timer; + set_freezable(); /* Record that the journal thread is running */ journal->j_task = current; @@ -146,7 +171,7 @@ loop: if (journal->j_commit_sequence != journal->j_commit_request) { jbd_debug(1, "OK, requests differ\n"); spin_unlock(&journal->j_state_lock); - del_timer_sync(journal->j_commit_timer); + del_timer_sync(&journal->j_commit_timer); journal_commit_transaction(journal); spin_lock(&journal->j_state_lock); goto loop; @@ -161,7 +186,7 @@ loop: */ jbd_debug(1, "Now suspending kjournald\n"); spin_unlock(&journal->j_state_lock); - refrigerator(); + try_to_freeze(); spin_lock(&journal->j_state_lock); } else { /* @@ -180,7 +205,7 @@ loop: transaction->t_expires)) should_sleep = 0; if (journal->j_flags & JFS_UNMOUNT) - should_sleep = 0; + should_sleep = 0; if (should_sleep) { spin_unlock(&journal->j_state_lock); schedule(); @@ -203,17 +228,23 @@ loop: end_loop: spin_unlock(&journal->j_state_lock); - del_timer_sync(journal->j_commit_timer); + del_timer_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); jbd_debug(1, "Journal thread exiting.\n"); return 0; } -static void journal_start_thread(journal_t *journal) +static int journal_start_thread(journal_t *journal) { - kernel_thread(kjournald, journal, CLONE_VM|CLONE_FS|CLONE_FILES); - wait_event(journal->j_wait_done_commit, journal->j_task != 0); + struct task_struct *t; + + t = kthread_run(kjournald, journal, "kjournald"); + if (IS_ERR(t)) + return PTR_ERR(t); + + wait_event(journal->j_wait_done_commit, journal->j_task != NULL); + return 0; } static void journal_kill_thread(journal_t *journal) @@ -224,7 +255,8 @@ static void journal_kill_thread(journal_t *journal) while (journal->j_task) { wake_up(&journal->j_wait_commit); spin_unlock(&journal->j_state_lock); - wait_event(journal->j_wait_done_commit, journal->j_task == 0); + wait_event(journal->j_wait_done_commit, + journal->j_task == NULL); spin_lock(&journal->j_state_lock); } spin_unlock(&journal->j_state_lock); @@ -270,7 +302,7 @@ static void journal_kill_thread(journal_t *journal) int journal_write_metadata_buffer(transaction_t *transaction, struct journal_head *jh_in, struct journal_head **jh_out, - int blocknr) + unsigned int blocknr) { int need_copy_out = 0; int done_copy_out = 0; @@ -281,6 +313,7 @@ int journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; /* * The buffer really shouldn't be locked: only the current committing @@ -294,6 +327,9 @@ int journal_write_metadata_buffer(transaction_t *transaction, J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ /* * If a new transaction has already done a buffer copy-out, then @@ -310,7 +346,7 @@ repeat: new_offset = offset_in_page(jh2bh(jh_in)->b_data); } - mapped_data = kmap_atomic(new_page, KM_USER0); + mapped_data = kmap_atomic(new_page); /* * Check for escaping */ @@ -319,7 +355,7 @@ repeat: need_copy_out = 1; do_escape = 1; } - kunmap_atomic(mapped_data, KM_USER0); + kunmap_atomic(mapped_data); /* * Do we need to do a data copy? @@ -328,17 +364,17 @@ repeat: char *tmp; jbd_unlock_bh_state(bh_in); - tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS); + tmp = jbd_alloc(bh_in->b_size, GFP_NOFS); jbd_lock_bh_state(bh_in); if (jh_in->b_frozen_data) { - kfree(tmp); + jbd_free(tmp, bh_in->b_size); goto repeat; } jh_in->b_frozen_data = tmp; - mapped_data = kmap_atomic(new_page, KM_USER0); + mapped_data = kmap_atomic(new_page); memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); - kunmap_atomic(mapped_data, KM_USER0); + kunmap_atomic(mapped_data); new_page = virt_to_page(tmp); new_offset = offset_in_page(tmp); @@ -350,19 +386,11 @@ repeat: * copying, we can finally do so. */ if (do_escape) { - mapped_data = kmap_atomic(new_page, KM_USER0); + mapped_data = kmap_atomic(new_page); *((unsigned int *)(mapped_data + new_offset)) = 0; - kunmap_atomic(mapped_data, KM_USER0); + kunmap_atomic(mapped_data); } - /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); - atomic_set(&new_bh->b_count, 1); - jbd_unlock_bh_state(bh_in); - - new_jh = journal_add_journal_head(new_bh); /* This sleeps */ - set_bh_page(new_bh, new_page, new_offset); new_jh->b_transaction = NULL; new_bh->b_size = jh2bh(jh_in)->b_size; @@ -379,7 +407,11 @@ repeat: * copying is moved to the transaction's shadow queue. */ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); - journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + JBUFFER_TRACE(new_jh, "file as BJ_IO"); journal_file_buffer(new_jh, transaction, BJ_IO); @@ -421,16 +453,20 @@ int __log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction was started. + * Called under j_state_lock. Returns true if a transaction commit was started. */ int __log_start_commit(journal_t *journal, tid_t target) { /* - * Are we already doing a recent enough commit? + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. */ - if (!tid_geq(journal->j_commit_request, target)) { + if (journal->j_commit_request != target && + journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { /* - * We want a new commit: OK, mark the request and wakup the + * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. */ @@ -440,7 +476,14 @@ int __log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); return 0; } @@ -489,7 +532,8 @@ int journal_force_commit_nested(journal_t *journal) /* * Start a commit of the current running transaction (if any). Returns true - * if a transaction was started, and fills its tid in at *ptid + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid */ int journal_start_commit(journal_t *journal, tid_t *ptid) { @@ -499,15 +543,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid) if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; - ret = __log_start_commit(journal, tid); - if (ret && ptid) + __log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) *ptid = tid; - } else if (journal->j_committing_transaction && ptid) { + ret = 1; + } else if (journal->j_committing_transaction) { /* - * If ext3_write_super() recently started a commit, then we - * have to wait for completion of that transaction + * If commit has been started, then we have to wait for + * completion of that transaction. */ - *ptid = journal->j_committing_transaction->t_tid; + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; ret = 1; } spin_unlock(&journal->j_state_lock); @@ -525,13 +573,25 @@ int log_wait_commit(journal_t *journal, tid_t tid) #ifdef CONFIG_JBD_DEBUG spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", - __FUNCTION__, journal->j_commit_request, tid); + __func__, journal->j_commit_request, tid); } spin_unlock(&journal->j_state_lock); #endif spin_lock(&journal->j_state_lock); + /* + * Not running or committing trans? Must be already committed. This + * saves us from waiting for a *long* time when tid overflows. + */ + if (!((journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) || + (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid))) + goto out_unlock; + + if (!tid_geq(journal->j_commit_waited, tid)) + journal->j_commit_waited = tid; while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); @@ -541,22 +601,53 @@ int log_wait_commit(journal_t *journal, tid_t tid) !tid_gt(tid, journal->j_commit_sequence)); spin_lock(&journal->j_state_lock); } +out_unlock: spin_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } /* + * Return 1 if a given transaction has not yet sent barrier request + * connected with a transaction commit. If 0 is returned, transaction + * may or may not have sent the barrier. Used to avoid sending barrier + * twice in common cases. + */ +int journal_trans_will_send_data_barrier(journal_t *journal, tid_t tid) +{ + int ret = 0; + transaction_t *commit_trans; + + if (!(journal->j_flags & JFS_BARRIER)) + return 0; + spin_lock(&journal->j_state_lock); + /* Transaction already committed? */ + if (tid_geq(journal->j_commit_sequence, tid)) + goto out; + /* + * Transaction is being committed and we already proceeded to + * writing commit record? + */ + commit_trans = journal->j_committing_transaction; + if (commit_trans && commit_trans->t_tid == tid && + commit_trans->t_state >= T_COMMIT_RECORD) + goto out; + ret = 1; +out: + spin_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(journal_trans_will_send_data_barrier); + +/* * Log buffer allocation routines: */ -int journal_next_log_block(journal_t *journal, unsigned long *retp) +int journal_next_log_block(journal_t *journal, unsigned int *retp) { - unsigned long blocknr; + unsigned int blocknr; spin_lock(&journal->j_state_lock); J_ASSERT(journal->j_free > 1); @@ -577,11 +668,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp) * this is a no-op. If needed, we can use j_blk_offset - everything is * ready. */ -int journal_bmap(journal_t *journal, unsigned long blocknr, - unsigned long *retp) +int journal_bmap(journal_t *journal, unsigned int blocknr, + unsigned int *retp) { int err = 0; - unsigned long ret; + unsigned int ret; if (journal->j_inode) { ret = bmap(journal->j_inode, blocknr); @@ -591,8 +682,8 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, char b[BDEVNAME_SIZE]; printk(KERN_ALERT "%s: journal block not found " - "at offset %lu on %s\n", - __FUNCTION__, + "at offset %u on %s\n", + __func__, blocknr, bdevname(journal->j_dev, b)); err = -EIO; @@ -617,7 +708,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, struct journal_head *journal_get_descriptor_buffer(journal_t *journal) { struct buffer_head *bh; - unsigned long blocknr; + unsigned int blocknr; int err; err = journal_next_log_block(journal, &blocknr); @@ -626,6 +717,8 @@ struct journal_head *journal_get_descriptor_buffer(journal_t *journal) return NULL; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); set_buffer_uptodate(bh); @@ -648,10 +741,9 @@ static journal_t * journal_init_common (void) journal_t *journal; int err; - journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL); + journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) goto fail; - memset(journal, 0, sizeof(*journal)); init_waitqueue_head(&journal->j_wait_transaction_locked); init_waitqueue_head(&journal->j_wait_logspace); @@ -659,8 +751,7 @@ static journal_t * journal_init_common (void) init_waitqueue_head(&journal->j_wait_checkpoint); init_waitqueue_head(&journal->j_wait_commit); init_waitqueue_head(&journal->j_wait_updates); - init_MUTEX(&journal->j_barrier); - init_MUTEX(&journal->j_checkpoint_sem); + mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); spin_lock_init(&journal->j_state_lock); @@ -691,17 +782,18 @@ fail: */ /** - * journal_t * journal_init_dev() - creates an initialises a journal structure + * journal_t * journal_init_dev() - creates and initialises a journal structure * @bdev: Block device on which to create the journal * @fs_dev: Device which hold journalled filesystem for this journal. * @start: Block nr Start of journal. - * @len: Lenght of the journal in blocks. + * @len: Length of the journal in blocks. * @blocksize: blocksize of journalling device - * @returns: a newly created journal_t * - * + * + * Returns: a newly created journal_t * + * * journal_init_dev creates a journal which maps a fixed contiguous * range of blocks on an arbitrary block device. - * + * */ journal_t * journal_init_dev(struct block_device *bdev, struct block_device *fs_dev, @@ -714,35 +806,42 @@ journal_t * journal_init_dev(struct block_device *bdev, if (!journal) return NULL; + /* journal descriptor can store up to n blocks -bzzz */ + journal->j_blocksize = blocksize; + n = journal->j_blocksize / sizeof(journal_block_tag_t); + journal->j_wbufsize = n; + journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); + if (!journal->j_wbuf) { + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; + } journal->j_dev = bdev; journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_maxlen = len; - journal->j_blocksize = blocksize; bh = __getblk(journal->j_dev, start, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; - /* journal descriptor can store up to n blocks -bzzz */ - n = journal->j_blocksize / sizeof(journal_block_tag_t); - journal->j_wbufsize = n; - journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); - if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", - __FUNCTION__); - kfree(journal); - journal = NULL; - } - return journal; +out_err: + kfree(journal->j_wbuf); + kfree(journal); + return NULL; } - -/** + +/** * journal_t * journal_init_inode () - creates a journal which maps to a inode. * @inode: An inode to create the journal in - * + * * journal_init_inode creates a journal which maps an on-disk inode as * the journal. The inode must exist already, must support bmap() and * must have all data blocks preallocated. @@ -753,7 +852,7 @@ journal_t * journal_init_inode (struct inode *inode) journal_t *journal = journal_init_common(); int err; int n; - unsigned long blocknr; + unsigned int blocknr; if (!journal) return NULL; @@ -762,7 +861,7 @@ journal_t * journal_init_inode (struct inode *inode) journal->j_inode = inode; jbd_debug(1, "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", - journal, inode->i_sb->s_id, inode->i_ino, + journal, inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); @@ -774,33 +873,40 @@ journal_t * journal_init_inode (struct inode *inode) journal->j_wbufsize = n; journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { - printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", - __FUNCTION__); - kfree(journal); - return NULL; + printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n", + __func__); + goto out_err; } err = journal_bmap(journal, 0, &blocknr); /* If that failed, give up */ if (err) { - printk(KERN_ERR "%s: Cannnot locate journal superblock\n", - __FUNCTION__); - kfree(journal); - return NULL; + printk(KERN_ERR "%s: Cannot locate journal superblock\n", + __func__); + goto out_err; } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; +out_err: + kfree(journal->j_wbuf); + kfree(journal); + return NULL; } -/* +/* * If the journal init or create aborts, we need to mark the journal * superblock as being NULL to prevent the journal destroy from writing - * back a bogus superblock. + * back a bogus superblock. */ static void journal_fail_superblock (journal_t *journal) { @@ -823,6 +929,12 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); + if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } journal->j_first = first; journal->j_last = last; @@ -837,23 +949,47 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; - /* Add the dynamic fields and write it to disk. */ - journal_update_superblock(journal, 1); - journal_start_thread(journal); - return 0; + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + journal->j_flags |= JFS_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); + } + return journal_start_thread(journal); } -/** +/** * int journal_create() - Initialise the new journal file * @journal: Journal to create. This structure must have been initialised - * + * * Given a journal_t structure which tells us which disk blocks we can * use, create a new journal superblock and initialise all of the - * journal fields from scratch. + * journal fields from scratch. **/ int journal_create(journal_t *journal) { - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; journal_superblock_t *sb; int i, err; @@ -871,7 +1007,7 @@ int journal_create(journal_t *journal) */ printk(KERN_EMERG "%s: creation of journal on external device!\n", - __FUNCTION__); + __func__); BUG(); } @@ -883,6 +1019,8 @@ int journal_create(journal_t *journal) if (err) return err; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (unlikely(!bh)) + return -ENOMEM; lock_buffer(bh); memset (bh->b_data, 0, journal->j_blocksize); BUFFER_TRACE(bh, "marking dirty"); @@ -914,62 +1052,131 @@ int journal_create(journal_t *journal) return journal_reset(journal); } -/** - * void journal_update_superblock() - Update journal sb on disk. +static void journal_write_superblock(journal_t *journal, int write_op) +{ + struct buffer_head *bh = journal->j_sb_buffer; + int ret; + + trace_journal_write_superblock(journal, write_op); + if (!(journal->j_flags & JFS_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "JBD: previous I/O error detected " + "for journal superblock update for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: Error %d detected " + "when updating journal superblock for %s.\n", + ret, journal_dev_name(journal, b)); + } +} + +/** + * journal_update_sb_log_tail() - Update log tail in journal sb on disk. * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. */ -void journal_update_superblock(journal_t *journal, int wait) +void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned int tail_block, int write_op) { journal_superblock_t *sb = journal->j_superblock; - struct buffer_head *bh = journal->j_sb_buffer; - /* - * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0) and there are no outstanding transactions - * in the filesystem, then we can safely defer the superblock update - * until the next commit by setting JFS_FLUSHED. This avoids - * attempting a write to a potential-readonly device. - */ - if (sb->s_start == 0 && journal->j_tail_sequence == - journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %ld, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, - journal->j_errno); - goto out; - } + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", + tail_block, tail_tid); + + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + journal_write_superblock(journal, write_op); + + /* Log is no longer empty */ spin_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, journal->j_errno); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + spin_lock(&journal->j_state_lock); + /* Is it already empty? */ + if (sb->s_start == 0) { + spin_unlock(&journal->j_state_lock); + return; + } + jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - sb->s_errno = cpu_to_be32(journal->j_errno); + sb->s_start = cpu_to_be32(0); spin_unlock(&journal->j_state_lock); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - if (wait) - sync_dirty_buffer(bh); - else - ll_rw_block(SWRITE, 1, &bh); + journal_write_superblock(journal, WRITE_FUA); -out: - /* If we have just flushed the log (by marking s_start==0), then - * any future commit will have to be careful to update the - * superblock again to re-record the true start of the log. */ + spin_lock(&journal->j_state_lock); + /* Log is empty */ + journal->j_flags |= JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +static void journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; spin_lock(&journal->j_state_lock); - if (sb->s_start) - journal->j_flags &= ~JFS_FLUSHED; - else - journal->j_flags |= JFS_FLUSHED; + jbd_debug(1, "JBD: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal, WRITE_SYNC); } /* @@ -1025,6 +1232,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_maxlen) { + printk(KERN_WARNING + "JBD: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + return 0; out: @@ -1061,7 +1276,7 @@ static int load_superblock(journal_t *journal) /** * int journal_load() - Read journal from disk. * @journal: Journal to act on. - * + * * Given a journal_t structure which tells us which disk blocks contain * a journal, read the journal from disk to initialise the in-memory * structures. @@ -1069,17 +1284,17 @@ static int load_superblock(journal_t *journal) int journal_load(journal_t *journal) { int err; + journal_superblock_t *sb; err = load_superblock(journal); if (err) return err; + sb = journal->j_superblock; /* If this is a V2 superblock, then we have to check the * features flags on it. */ if (journal->j_format_version >= 2) { - journal_superblock_t *sb = journal->j_superblock; - if ((sb->s_feature_ro_compat & ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & @@ -1116,9 +1331,13 @@ recovery_error: * * Release a journal_t structure once it is no longer in use by the * journaled object. + * Return <0 if we couldn't clean up the journal. */ -void journal_destroy(journal_t *journal) +int journal_destroy(journal_t *journal) { + int err = 0; + + /* Wait for the commit thread to wake up and die. */ journal_kill_thread(journal); @@ -1128,6 +1347,8 @@ void journal_destroy(journal_t *journal) /* Force any old transactions to disk */ + /* We cannot race with anybody but must keep assertions happy */ + mutex_lock(&journal->j_checkpoint_mutex); /* Totally anal locking here... */ spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { @@ -1141,13 +1362,16 @@ void journal_destroy(journal_t *journal) J_ASSERT(journal->j_checkpoint_transactions == NULL); spin_unlock(&journal->j_list_lock); - /* We can now mark the journal as empty. */ - journal->j_tail = 0; - journal->j_tail_sequence = ++journal->j_transaction_sequence; if (journal->j_sb_buffer) { - journal_update_superblock(journal, 1); + if (!is_journal_aborted(journal)) { + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + mark_journal_empty(journal); + } else + err = -EIO; brelse(journal->j_sb_buffer); } + mutex_unlock(&journal->j_checkpoint_mutex); if (journal->j_inode) iput(journal->j_inode); @@ -1155,6 +1379,8 @@ void journal_destroy(journal_t *journal) journal_destroy_revoke(journal); kfree(journal->j_wbuf); kfree(journal); + + return err; } @@ -1164,9 +1390,9 @@ void journal_destroy(journal_t *journal) * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount * @incompat: bitmask of incompatible features - * + * * Check whether the journal uses all of a given set of - * features. Return true (non-zero) if it does. + * features. Return true (non-zero) if it does. **/ int journal_check_used_features (journal_t *journal, unsigned long compat, @@ -1195,7 +1421,7 @@ int journal_check_used_features (journal_t *journal, unsigned long compat, * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount * @incompat: bitmask of incompatible features - * + * * Check whether the journaling code supports the use of * all of a given set of features on this journal. Return true * (non-zero) if it can. */ @@ -1203,13 +1429,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat, int journal_check_available_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { - journal_superblock_t *sb; - if (!compat && !ro && !incompat) return 1; - sb = journal->j_superblock; - /* We can support any known requested features iff the * superblock is in version 2. Otherwise we fail to support any * extended sb features. */ @@ -1233,7 +1455,7 @@ int journal_check_available_features (journal_t *journal, unsigned long compat, * @incompat: bitmask of incompatible features * * Mark a given journal feature as present on the - * superblock. Returns true if the requested features could be set. + * superblock. Returns true if the requested features could be set. * */ @@ -1319,7 +1541,7 @@ static int journal_convert_superblock_v1(journal_t *journal, /** * int journal_flush () - Flush journal * @journal: Journal to act on. - * + * * Flush all data for a given journal to disk and empty the journal. * Filesystems can use this when remounting readonly to ensure that * recovery does not need to happen on remount. @@ -1329,7 +1551,6 @@ int journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned long old_tail; spin_lock(&journal->j_state_lock); @@ -1354,10 +1575,17 @@ int journal_flush(journal_t *journal) spin_lock(&journal->j_list_lock); while (!err && journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); err = log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + + mutex_lock(&journal->j_checkpoint_mutex); cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1365,28 +1593,23 @@ int journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_state_lock); - old_tail = journal->j_tail; - journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); - journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); - journal->j_tail = old_tail; - J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); spin_unlock(&journal->j_state_lock); - return err; + return 0; } /** * int journal_wipe() - Wipe journal contents * @journal: Journal to act on. * @write: flag (see below) - * + * * Wipe out all of the contents of a journal, safely. This will produce * a warning if the journal contains any valid recovery information. * Must be called between journal_init_*() and journal_load(). @@ -1397,7 +1620,6 @@ int journal_flush(journal_t *journal) int journal_wipe(journal_t *journal, int write) { - journal_superblock_t *sb; int err = 0; J_ASSERT (!(journal->j_flags & JFS_LOADED)); @@ -1406,8 +1628,6 @@ int journal_wipe(journal_t *journal, int write) if (err) return err; - sb = journal->j_superblock; - if (!journal->j_tail) goto no_recovery; @@ -1415,8 +1635,12 @@ int journal_wipe(journal_t *journal, int write) write ? "Clearing" : "Ignoring"); err = journal_skip_recovery(journal); - if (write) - journal_update_superblock(journal, 1); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } no_recovery: return err; @@ -1441,7 +1665,7 @@ static const char *journal_dev_name(journal_t *journal, char *buffer) /* * Journal abort has very specific semantics, which we describe - * for journal abort. + * for journal abort. * * Two internal function, which provide abort to te jbd layer * itself are here. @@ -1452,7 +1676,7 @@ static const char *journal_dev_name(journal_t *journal, char *buffer) * Aborts hard --- we mark the abort as occurred, but do _nothing_ else, * and don't attempt to make any other journal updates. */ -void __journal_abort_hard(journal_t *journal) +static void __journal_abort_hard(journal_t *journal) { transaction_t *transaction; char b[BDEVNAME_SIZE]; @@ -1484,7 +1708,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) __journal_abort_hard(journal); if (errno) - journal_update_superblock(journal, 1); + journal_update_sb_errno(journal); } /** @@ -1496,7 +1720,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) * Perform a complete, immediate shutdown of the ENTIRE * journal (not of a single transaction). This operation cannot be * undone without closing and reopening the journal. - * + * * The journal_abort function is intended to support higher level error * recovery mechanisms such as the ext2/ext3 remount-readonly error * mode. @@ -1530,7 +1754,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) * supply an errno; a null errno implies that absolutely no further * writes are done to the journal (unless there are any already in * progress). - * + * */ void journal_abort(journal_t *journal, int errno) @@ -1538,7 +1762,7 @@ void journal_abort(journal_t *journal, int errno) __journal_abort_soft(journal, errno); } -/** +/** * int journal_errno () - returns the journal's error state. * @journal: journal to examine. * @@ -1562,7 +1786,7 @@ int journal_errno(journal_t *journal) return err; } -/** +/** * int journal_clear_err () - clears the journal's error state * @journal: journal to act on. * @@ -1582,7 +1806,7 @@ int journal_clear_err(journal_t *journal) return err; } -/** +/** * void journal_ack_err() - Ack journal err. * @journal: journal to act on. * @@ -1603,18 +1827,9 @@ int journal_blocks_per_page(struct inode *inode) } /* - * Simple support for retrying memory allocations. Introduced to help to - * debug different VM deadlock avoidance strategies. - */ -void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry) -{ - return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0)); -} - -/* * Journal_head storage management */ -static kmem_cache_t *journal_head_cache; +static struct kmem_cache *journal_head_cache; #ifdef CONFIG_JBD_DEBUG static atomic_t nr_journal_heads = ATOMIC_INIT(0); #endif @@ -1623,15 +1838,14 @@ static int journal_init_journal_head_cache(void) { int retval; - J_ASSERT(journal_head_cache == 0); + J_ASSERT(journal_head_cache == NULL); journal_head_cache = kmem_cache_create("journal_head", sizeof(struct journal_head), 0, /* offset */ - 0, /* flags */ - NULL, /* ctor */ - NULL); /* dtor */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ retval = 0; - if (journal_head_cache == 0) { + if (!journal_head_cache) { retval = -ENOMEM; printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); } @@ -1640,9 +1854,10 @@ static int journal_init_journal_head_cache(void) static void journal_destroy_journal_head_cache(void) { - J_ASSERT(journal_head_cache != NULL); - kmem_cache_destroy(journal_head_cache); - journal_head_cache = NULL; + if (journal_head_cache) { + kmem_cache_destroy(journal_head_cache); + journal_head_cache = NULL; + } } /* @@ -1651,22 +1866,19 @@ static void journal_destroy_journal_head_cache(void) static struct journal_head *journal_alloc_journal_head(void) { struct journal_head *ret; - static unsigned long last_warning; #ifdef CONFIG_JBD_DEBUG atomic_inc(&nr_journal_heads); #endif - ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); - if (ret == 0) { + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); + if (ret == NULL) { jbd_debug(1, "out of memory for journal_head\n"); - if (time_after(jiffies, last_warning + 5*HZ)) { - printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __FUNCTION__); - last_warning = jiffies; - } - while (ret == 0) { + printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", + __func__); + + while (ret == NULL) { yield(); - ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); } } return ret; @@ -1676,7 +1888,7 @@ static void journal_free_journal_head(struct journal_head *jh) { #ifdef CONFIG_JBD_DEBUG atomic_dec(&nr_journal_heads); - memset(jh, 0x5b, sizeof(*jh)); + memset(jh, JBD_POISON_FREE, sizeof(*jh)); #endif kmem_cache_free(journal_head_cache, jh); } @@ -1695,10 +1907,9 @@ static void journal_free_journal_head(struct journal_head *jh) * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * - * A journal_head may be detached from its buffer_head when the journal_head's - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. - * Various places in JBD call journal_remove_journal_head() to indicate that the - * journal_head can be dropped if needed. + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the @@ -1711,17 +1922,16 @@ static void journal_free_journal_head(struct journal_head *jh) * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = journal_add_journal_head(bh); * ... - * jh->b_transaction = xxx; - * journal_put_journal_head(jh); - * - * Now, the journal_head's b_jcount is zero, but it is safe from being released - * because it has a non-zero b_transaction. + * (Get another reference for transaction) + * journal_grab_journal_head(bh); + * jh->b_transaction = xxx; + * (Put original reference) + * journal_put_journal_head(jh); */ /* * Give a buffer_head a journal_head. * - * Doesn't need the journal lock. * May sleep. */ struct journal_head *journal_add_journal_head(struct buffer_head *bh) @@ -1730,10 +1940,8 @@ struct journal_head *journal_add_journal_head(struct buffer_head *bh) struct journal_head *new_jh = NULL; repeat: - if (!buffer_jbd(bh)) { + if (!buffer_jbd(bh)) new_jh = journal_alloc_journal_head(); - memset(new_jh, 0, sizeof(*new_jh)); - } jbd_lock_bh_journal_head(bh); if (buffer_jbd(bh)) { @@ -1785,61 +1993,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh) struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_jcount >= 0); - - get_bh(bh); - if (jh->b_jcount == 0) { - if (jh->b_transaction == NULL && - jh->b_next_transaction == NULL && - jh->b_cp_transaction == NULL) { - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing " - "b_frozen_data\n", - __FUNCTION__); - kfree(jh->b_frozen_data); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing " - "b_committed_data\n", - __FUNCTION__); - kfree(jh->b_committed_data); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - __brelse(bh); - journal_free_journal_head(jh); - } else { - BUFFER_TRACE(bh, "journal_head was locked"); - } + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd_free(jh->b_frozen_data, bh->b_size); } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); } /* - * journal_remove_journal_head(): if the buffer isn't attached to a transaction - * and has a zero b_jcount then remove and release its journal_head. If we did - * see that the buffer is not used by any transaction we also "logically" - * decrement ->b_count. - * - * We in fact take an additional increment on ->b_count as a convenience, - * because the caller usually wants to do additional things with the bh - * after calling here. - * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some - * time. Once the caller has run __brelse(), the buffer is eligible for - * reaping by try_to_free_buffers(). - */ -void journal_remove_journal_head(struct buffer_head *bh) -{ - jbd_lock_bh_journal_head(bh); - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then try to + * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. */ void journal_put_journal_head(struct journal_head *jh) @@ -1849,84 +2025,61 @@ void journal_put_journal_head(struct journal_head *jh) jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; - if (!jh->b_jcount && !jh->b_transaction) { + if (!jh->b_jcount) { __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); __brelse(bh); - } - jbd_unlock_bh_journal_head(bh); + } else + jbd_unlock_bh_journal_head(bh); } /* - * /proc tunables + * debugfs tunables */ -#if defined(CONFIG_JBD_DEBUG) -int journal_enable_debug; -EXPORT_SYMBOL(journal_enable_debug); -#endif +#ifdef CONFIG_JBD_DEBUG -#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS) +u8 journal_enable_debug __read_mostly; +EXPORT_SYMBOL(journal_enable_debug); -static struct proc_dir_entry *proc_jbd_debug; +static struct dentry *jbd_debugfs_dir; +static struct dentry *jbd_debug; -static int read_jbd_debug(char *page, char **start, off_t off, - int count, int *eof, void *data) +static void __init jbd_create_debugfs_entry(void) { - int ret; - - ret = sprintf(page + off, "%d\n", journal_enable_debug); - *eof = 1; - return ret; + jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); + if (jbd_debugfs_dir) + jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, + jbd_debugfs_dir, + &journal_enable_debug); } -static int write_jbd_debug(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static void __exit jbd_remove_debugfs_entry(void) { - char buf[32]; - - if (count > ARRAY_SIZE(buf) - 1) - count = ARRAY_SIZE(buf) - 1; - if (copy_from_user(buf, buffer, count)) - return -EFAULT; - buf[ARRAY_SIZE(buf) - 1] = '\0'; - journal_enable_debug = simple_strtoul(buf, NULL, 10); - return count; + debugfs_remove(jbd_debug); + debugfs_remove(jbd_debugfs_dir); } -#define JBD_PROC_NAME "sys/fs/jbd-debug" +#else -static void __init create_jbd_proc_entry(void) +static inline void jbd_create_debugfs_entry(void) { - proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL); - if (proc_jbd_debug) { - /* Why is this so hard? */ - proc_jbd_debug->read_proc = read_jbd_debug; - proc_jbd_debug->write_proc = write_jbd_debug; - } } -static void __exit remove_jbd_proc_entry(void) +static inline void jbd_remove_debugfs_entry(void) { - if (proc_jbd_debug) - remove_proc_entry(JBD_PROC_NAME, NULL); } -#else - -#define create_jbd_proc_entry() do {} while (0) -#define remove_jbd_proc_entry() do {} while (0) - #endif -kmem_cache_t *jbd_handle_cache; +struct kmem_cache *jbd_handle_cache; static int __init journal_init_handle_cache(void) { jbd_handle_cache = kmem_cache_create("journal_handle", sizeof(handle_t), 0, /* offset */ - 0, /* flags */ - NULL, /* ctor */ - NULL); /* dtor */ + SLAB_TEMPORARY, /* flags */ + NULL); /* ctor */ if (jbd_handle_cache == NULL) { printk(KERN_EMERG "JBD: failed to create handle cache\n"); return -ENOMEM; @@ -1967,18 +2120,12 @@ static int __init journal_init(void) { int ret; -/* Static check for data structure consistency. There's no code - * invoked --- we'll just get a linker failure if things aren't right. - */ - extern void journal_bad_superblock_size(void); - if (sizeof(struct journal_superblock_s) != 1024) - journal_bad_superblock_size(); - + BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); ret = journal_init_caches(); if (ret != 0) journal_destroy_caches(); - create_jbd_proc_entry(); + jbd_create_debugfs_entry(); return ret; } @@ -1987,9 +2134,9 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); #endif - remove_jbd_proc_entry(); + jbd_remove_debugfs_entry(); journal_destroy_caches(); } diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 103c34e4fb2..a748fe21465 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -1,6 +1,6 @@ /* - * linux/fs/recovery.c - * + * linux/fs/jbd/recovery.c + * * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 * * Copyright 1999-2000 Red Hat Software --- All Rights Reserved @@ -10,7 +10,7 @@ * option, any later version, incorporated herein by reference. * * Journal recovery routines for the generic filesystem journaling code; - * part of the ext2fs journaling system. + * part of the ext2fs journaling system. */ #ifndef __KERNEL__ @@ -20,14 +20,14 @@ #include <linux/fs.h> #include <linux/jbd.h> #include <linux/errno.h> -#include <linux/slab.h> +#include <linux/blkdev.h> #endif /* * Maintain information about the progress of the recovery job, so that - * the different passes can carry information between them. + * the different passes can carry information between them. */ -struct recovery_info +struct recovery_info { tid_t start_transaction; tid_t end_transaction; @@ -46,7 +46,7 @@ static int scan_revoke_records(journal_t *, struct buffer_head *, #ifdef __KERNEL__ /* Release readahead buffers after use */ -void journal_brelse_array(struct buffer_head *b[], int n) +static void journal_brelse_array(struct buffer_head *b[], int n) { while (--n >= 0) brelse (b[n]); @@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start) { int err; unsigned int max, nbufs, next; - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; struct buffer_head * bufs[MAXBUF]; @@ -116,7 +116,7 @@ static int do_readahead(journal_t *journal, unsigned int start) err = 0; failed: - if (nbufs) + if (nbufs) journal_brelse_array(bufs, nbufs); return err; } @@ -128,11 +128,11 @@ failed: * Read a block from the journal */ -static int jread(struct buffer_head **bhp, journal_t *journal, +static int jread(struct buffer_head **bhp, journal_t *journal, unsigned int offset) { int err; - unsigned long blocknr; + unsigned int blocknr; struct buffer_head *bh; *bhp = NULL; @@ -210,20 +210,20 @@ do { \ } while (0) /** - * int journal_recover(journal_t *journal) - recovers a on-disk journal + * journal_recover - recovers a on-disk journal * @journal: the journal to recover - * + * * The primary function for recovering the log contents when mounting a - * journaled device. + * journaled device. * * Recovery is done in three passes. In the first pass, we look for the * end of the log. In the second, we assemble the list of revoke * blocks. In the third and final pass, we replay any un-revoked blocks - * in the log. + * in the log. */ int journal_recover(journal_t *journal) { - int err; + int err, err2; journal_superblock_t * sb; struct recovery_info info; @@ -231,10 +231,10 @@ int journal_recover(journal_t *journal) memset(&info, 0, sizeof(info)); sb = journal->j_superblock; - /* + /* * The journal superblock's s_start field (the current log head) * is always zero if, and only if, the journal was cleanly - * unmounted. + * unmounted. */ if (!sb->s_start) { @@ -250,10 +250,10 @@ int journal_recover(journal_t *journal) if (!err) err = do_one_pass(journal, &info, PASS_REPLAY); - jbd_debug(0, "JBD: recovery, exit status %d, " + jbd_debug(1, "JBD: recovery, exit status %d, " "recovered transactions %u to %u\n", err, info.start_transaction, info.end_transaction); - jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", + jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", info.nr_replays, info.nr_revoke_hits, info.nr_revokes); /* Restart the log at the next transaction ID, thus invalidating @@ -261,32 +261,38 @@ int journal_recover(journal_t *journal) journal->j_transaction_sequence = ++info.end_transaction; journal_clear_revoke(journal); - sync_blockdev(journal->j_fs_dev); + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + /* Flush disk caches to get replayed data on the permanent storage */ + if (journal->j_flags & JFS_BARRIER) { + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (!err) + err = err2; + } + return err; } /** - * int journal_skip_recovery() - Start journal and wipe exiting records + * journal_skip_recovery - Start journal and wipe exiting records * @journal: journal to startup - * + * * Locate any valid recovery information from the journal and set up the * journal structures in memory to ignore it (presumably because the - * caller has evidence that it is out of date). + * caller has evidence that it is out of date). * This function does'nt appear to be exorted.. * * We perform one pass over the journal to allow us to tell the user how * much recovery information is being erased, and to let us initialise - * the journal transaction sequence numbers to the next unused ID. + * the journal transaction sequence numbers to the next unused ID. */ int journal_skip_recovery(journal_t *journal) { int err; - journal_superblock_t * sb; - struct recovery_info info; memset (&info, 0, sizeof(info)); - sb = journal->j_superblock; err = do_one_pass(journal, &info, PASS_SCAN); @@ -295,11 +301,12 @@ int journal_skip_recovery(journal_t *journal) ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD_DEBUG - int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); -#endif - jbd_debug(0, + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); + jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); +#endif journal->j_transaction_sequence = ++info.end_transaction; } @@ -311,23 +318,18 @@ static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { unsigned int first_commit_ID, next_commit_ID; - unsigned long next_log_block; + unsigned int next_log_block; int err, success = 0; journal_superblock_t * sb; - journal_header_t * tmp; + journal_header_t * tmp; struct buffer_head * bh; unsigned int sequence; int blocktype; - /* Precompute the maximum metadata descriptors in a descriptor block */ - int MAX_BLOCKS_PER_DESC; - MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) - / sizeof(journal_block_tag_t)); - - /* + /* * First thing is to establish what we expect to find in the log * (in terms of transaction IDs), and where (in terms of log - * block offsets): query the superblock. + * block offsets): query the superblock. */ sb = journal->j_superblock; @@ -344,7 +346,7 @@ static int do_one_pass(journal_t *journal, * Now we walk through the log, transaction by transaction, * making sure that each transaction has a commit block in the * expected place. Each complete transaction gets replayed back - * into the main filesystem. + * into the main filesystem. */ while (1) { @@ -354,7 +356,7 @@ static int do_one_pass(journal_t *journal, struct buffer_head * obh; struct buffer_head * nbh; - cond_resched(); /* We're under lock_kernel() */ + cond_resched(); /* If we already know where to stop the log traversal, * check right now that we haven't gone past the end of @@ -364,14 +366,14 @@ static int do_one_pass(journal_t *journal, if (tid_geq(next_commit_ID, info->end_transaction)) break; - jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", next_commit_ID, next_log_block, journal->j_last); /* Skip over each chunk of the transaction looking * either the next descriptor block or the final commit * record. */ - jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + jbd_debug(3, "JBD: checking block %u\n", next_log_block); err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -379,8 +381,8 @@ static int do_one_pass(journal_t *journal, next_log_block++; wrap(journal, next_log_block); - /* What kind of buffer is it? - * + /* What kind of buffer is it? + * * If it is a descriptor block, check that it has the * expected sequence number. Otherwise, we're all done * here. */ @@ -394,7 +396,7 @@ static int do_one_pass(journal_t *journal, blocktype = be32_to_cpu(tmp->h_blocktype); sequence = be32_to_cpu(tmp->h_sequence); - jbd_debug(3, "Found magic %d, sequence %d\n", + jbd_debug(3, "Found magic %d, sequence %d\n", blocktype, sequence); if (sequence != next_commit_ID) { @@ -426,7 +428,7 @@ static int do_one_pass(journal_t *journal, tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) <= journal->j_blocksize) { - unsigned long io_block; + unsigned int io_block; tag = (journal_block_tag_t *) tagp; flags = be32_to_cpu(tag->t_flags); @@ -438,12 +440,12 @@ static int do_one_pass(journal_t *journal, /* Recover what we can, but * report failure at the end. */ success = err; - printk (KERN_ERR + printk (KERN_ERR "JBD: IO error %d recovering " - "block %ld in log\n", + "block %u in log\n", err, io_block); } else { - unsigned long blocknr; + unsigned int blocknr; J_ASSERT(obh != NULL); blocknr = be32_to_cpu(tag->t_blocknr); @@ -452,7 +454,7 @@ static int do_one_pass(journal_t *journal, * revoked, then we're all done * here. */ if (journal_test_revoke - (journal, blocknr, + (journal, blocknr, next_commit_ID)) { brelse(obh); ++info->nr_revoke_hits; @@ -465,7 +467,7 @@ static int do_one_pass(journal_t *journal, blocknr, journal->j_blocksize); if (nbh == NULL) { - printk(KERN_ERR + printk(KERN_ERR "JBD: Out of memory " "during recovery.\n"); err = -ENOMEM; @@ -478,7 +480,7 @@ static int do_one_pass(journal_t *journal, memcpy(nbh->b_data, obh->b_data, journal->j_blocksize); if (flags & JFS_FLAG_ESCAPE) { - *((__be32 *)bh->b_data) = + *((__be32 *)nbh->b_data) = cpu_to_be32(JFS_MAGIC_NUMBER); } @@ -531,12 +533,13 @@ static int do_one_pass(journal_t *journal, default: jbd_debug(3, "Unrecognised magic %d, end of scan.\n", blocktype); + brelse(bh); goto done; } } done: - /* + /* * We broke out of the log scan loop: either we came to the * known end of the log or we found an unexpected block in the * log. If the latter happened, then we know that the "current" @@ -566,7 +569,7 @@ static int do_one_pass(journal_t *journal, /* Scan a revoke record, marking all blocks mentioned as revoked. */ -static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, +static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, tid_t sequence, struct recovery_info *info) { journal_revoke_header_t *header; @@ -577,7 +580,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, max = be32_to_cpu(header->r_count); while (offset < max) { - unsigned long blocknr; + unsigned int blocknr; int err; blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index a5614418346..8898bbd2b61 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -1,6 +1,6 @@ /* - * linux/fs/revoke.c - * + * linux/fs/jbd/revoke.c + * * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 * * Copyright 2000 Red Hat corp --- All Rights Reserved @@ -15,10 +15,10 @@ * Revoke is the mechanism used to prevent old log records for deleted * metadata from being replayed on top of newer data using the same * blocks. The revoke mechanism is used in two separate places: - * + * * + Commit: during commit we write the entire list of the current * transaction's revoked blocks to the journal - * + * * + Recovery: during recovery we record the transaction ID of all * revoked blocks. If there are multiple revoke records in the log * for a single block, only the last one counts, and if there is a log @@ -29,7 +29,7 @@ * single transaction: * * Block is revoked and then journaled: - * The desired end result is the journaling of the new block, so we + * The desired end result is the journaling of the new block, so we * cancel the revoke before the transaction commits. * * Block is journaled and then revoked: @@ -41,12 +41,16 @@ * transaction must have happened after the block was journaled and so * the revoke must take precedence. * - * Block is revoked and then written as data: + * Block is revoked and then written as data: * The data write is allowed to succeed, but the revoke is _not_ * cancelled. We still need to prevent old log records from * overwriting the new data. We don't even need to clear the revoke * bit here. * + * We cache revoke status of a buffer in the current transaction in b_states + * bits. As the name says, revokevalid flag indicates that the cached revoke + * status of a buffer is valid and we can rely on the cached status. + * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up @@ -54,7 +58,26 @@ * buffer has not been revoked, and cancel_revoke * need do nothing. * RevokeValid set, Revoked set: - * buffer has been revoked. + * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. + * + * Finally, also replay code uses the hash tables but at this moment no one else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. */ #ifndef __KERNEL__ @@ -66,22 +89,23 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/list.h> -#include <linux/smp_lock.h> #include <linux/init.h> +#include <linux/bio.h> #endif +#include <linux/log2.h> -static kmem_cache_t *revoke_record_cache; -static kmem_cache_t *revoke_table_cache; +static struct kmem_cache *revoke_record_cache; +static struct kmem_cache *revoke_table_cache; /* Each revoke record represents one single revoked block. During journal replay, this involves recording the transaction ID of the last transaction to revoke this block. */ -struct jbd_revoke_record_s +struct jbd_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ - unsigned long blocknr; + unsigned int blocknr; }; @@ -90,8 +114,8 @@ struct jbd_revoke_table_s { /* It is conceivable that we might want a larger hash table * for recovery. Must be a power of two. */ - int hash_size; - int hash_shift; + int hash_size; + int hash_shift; struct list_head *hash_table; }; @@ -99,14 +123,14 @@ struct jbd_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, struct journal_head **, int *, - struct jbd_revoke_record_s *); -static void flush_descriptor(journal_t *, struct journal_head *, int); + struct jbd_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); #endif /* Utility functions to maintain the revoke table */ /* Borrowed from buffer.c: this is a tried and tested block hash function */ -static inline int hash(journal_t *journal, unsigned long block) +static inline int hash(journal_t *journal, unsigned int block) { struct jbd_revoke_table_s *table = journal->j_revoke; int hash_shift = table->hash_shift; @@ -116,7 +140,7 @@ static inline int hash(journal_t *journal, unsigned long block) (block << (hash_shift - 12))) & (table->hash_size - 1); } -static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, +static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, tid_t seq) { struct list_head *hash_list; @@ -138,7 +162,7 @@ repeat: oom: if (!journal_oom_retry) return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__); + jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); yield(); goto repeat; } @@ -146,7 +170,7 @@ oom: /* Find a revoke record in the journal's hash table. */ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, - unsigned long blocknr) + unsigned int blocknr) { struct list_head *hash_list; struct jbd_revoke_record_s *record; @@ -166,157 +190,140 @@ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, return NULL; } -int __init journal_init_revoke_caches(void) +void journal_destroy_revoke_caches(void) { - revoke_record_cache = kmem_cache_create("revoke_record", - sizeof(struct jbd_revoke_record_s), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (revoke_record_cache == 0) - return -ENOMEM; - - revoke_table_cache = kmem_cache_create("revoke_table", - sizeof(struct jbd_revoke_table_s), - 0, 0, NULL, NULL); - if (revoke_table_cache == 0) { + if (revoke_record_cache) { kmem_cache_destroy(revoke_record_cache); revoke_record_cache = NULL; - return -ENOMEM; } - return 0; -} - -void journal_destroy_revoke_caches(void) -{ - kmem_cache_destroy(revoke_record_cache); - revoke_record_cache = NULL; - kmem_cache_destroy(revoke_table_cache); - revoke_table_cache = NULL; + if (revoke_table_cache) { + kmem_cache_destroy(revoke_table_cache); + revoke_table_cache = NULL; + } } -/* Initialise the revoke table for a given journal to a given size. */ - -int journal_init_revoke(journal_t *journal, int hash_size) +int __init journal_init_revoke_caches(void) { - int shift, tmp; + J_ASSERT(!revoke_record_cache); + J_ASSERT(!revoke_table_cache); - J_ASSERT (journal->j_revoke_table[0] == NULL); + revoke_record_cache = kmem_cache_create("revoke_record", + sizeof(struct jbd_revoke_record_s), + 0, + SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, + NULL); + if (!revoke_record_cache) + goto record_cache_failure; - shift = 0; - tmp = hash_size; - while((tmp >>= 1UL) != 0UL) - shift++; + revoke_table_cache = kmem_cache_create("revoke_table", + sizeof(struct jbd_revoke_table_s), + 0, SLAB_TEMPORARY, NULL); + if (!revoke_table_cache) + goto table_cache_failure; - journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[0]) - return -ENOMEM; - journal->j_revoke = journal->j_revoke_table[0]; + return 0; - /* Check that the hash_size is a power of two */ - J_ASSERT ((hash_size & (hash_size-1)) == 0); +table_cache_failure: + journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; +} - journal->j_revoke->hash_size = hash_size; +static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) +{ + int i; + struct jbd_revoke_table_s *table; - journal->j_revoke->hash_shift = shift; + table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; - journal->j_revoke->hash_table = + table->hash_size = hash_size; + table->hash_shift = ilog2(hash_size); + table->hash_table = kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - journal->j_revoke = NULL; - return -ENOMEM; + if (!table->hash_table) { + kmem_cache_free(revoke_table_cache, table); + table = NULL; + goto out; } - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + for (i = 0; i < hash_size; i++) + INIT_LIST_HEAD(&table->hash_table[i]); - journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[1]) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - return -ENOMEM; - } +out: + return table; +} - journal->j_revoke = journal->j_revoke_table[1]; +static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; - /* Check that the hash_size is a power of two */ - J_ASSERT ((hash_size & (hash_size-1)) == 0); + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); + } - journal->j_revoke->hash_size = hash_size; + kfree(table->hash_table); + kmem_cache_free(revoke_table_cache, table); +} - journal->j_revoke->hash_shift = shift; +/* Initialise the revoke table for a given journal to a given size. */ +int journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); + J_ASSERT(is_power_of_2(hash_size)); - journal->j_revoke->hash_table = - kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]); - kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]); - journal->j_revoke = NULL; - return -ENOMEM; - } + journal->j_revoke_table[0] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; + + journal->j_revoke_table[1] = journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; -} -/* Destoy a journal's revoke table. The table must already be empty! */ +fail1: + journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} +/* Destroy a journal's revoke table. The table must already be empty! */ void journal_destroy_revoke(journal_t *journal) { - struct jbd_revoke_table_s *table; - struct list_head *hash_list; - int i; - - table = journal->j_revoke_table[0]; - if (!table) - return; - - for (i=0; i<table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(revoke_table_cache, table); - journal->j_revoke = NULL; - - table = journal->j_revoke_table[1]; - if (!table) - return; - - for (i=0; i<table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(revoke_table_cache, table); journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + journal_destroy_revoke_table(journal->j_revoke_table[1]); } #ifdef __KERNEL__ -/* +/* * journal_revoke: revoke a given buffer_head from the journal. This * prevents the block from being replayed during recovery if we take a * crash after this current transaction commits. Any subsequent * metadata writes of the buffer in this transaction cancel the - * revoke. + * revoke. * * Note that this call may block --- it is up to the caller to make * sure that there are no further calls to journal_write_metadata * before the revoke is complete. In ext3, this implies calling the * revoke before clearing the block bitmap when we are deleting - * metadata. + * metadata. * * Revoke performs a journal_forget on any buffer_head passed in as a * parameter, but does _not_ forget the buffer_head if the bh was only - * found implicitly. + * found implicitly. * * bh_in may not be a journalled buffer - it may have come off * the hash tables without an attached journal_head. @@ -325,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal) * by one. */ -int journal_revoke(handle_t *handle, unsigned long blocknr, +int journal_revoke(handle_t *handle, unsigned int blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; @@ -394,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, } } - jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); + jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); @@ -415,8 +422,6 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. - * - * The caller must have the journal locked. */ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { @@ -474,6 +479,36 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) return did_revoke; } +/* + * journal_clear_revoked_flags clears revoked flag of buffers in + * revoke table to reflect there is no revoked buffer in the next + * transaction which is going to be started. + */ +void journal_clear_buffer_revoked_flags(journal_t *journal) +{ + struct jbd_revoke_table_s *revoke = journal->j_revoke; + int i = 0; + + for (i = 0; i < revoke->hash_size; i++) { + struct list_head *hash_list; + struct list_head *list_entry; + hash_list = &revoke->hash_table[i]; + + list_for_each(list_entry, hash_list) { + struct jbd_revoke_record_s *record; + struct buffer_head *bh; + record = (struct jbd_revoke_record_s *)list_entry; + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); + } + } + } +} + /* journal_switch_revoke table select j_revoke for next transaction * we do not want to suspend any processing until all revokes are * written -bzzz @@ -487,19 +522,16 @@ void journal_switch_revoke_table(journal_t *journal) else journal->j_revoke = journal->j_revoke_table[0]; - for (i = 0; i < journal->j_revoke->hash_size; i++) + for (i = 0; i < journal->j_revoke->hash_size; i++) INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); } /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. - * - * Called with the journal lock held. */ - -void journal_write_revoke_records(journal_t *journal, - transaction_t *transaction) +void journal_write_revoke_records(journal_t *journal, + transaction_t *transaction, int write_op) { struct journal_head *descriptor; struct jbd_revoke_record_s *record; @@ -507,7 +539,7 @@ void journal_write_revoke_records(journal_t *journal, struct list_head *hash_list; int i, offset, count; - descriptor = NULL; + descriptor = NULL; offset = 0; count = 0; @@ -519,31 +551,32 @@ void journal_write_revoke_records(journal_t *journal, hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { - record = (struct jbd_revoke_record_s *) + record = (struct jbd_revoke_record_s *) hash_list->next; write_one_revoke_record(journal, transaction, - &descriptor, &offset, - record); + &descriptor, &offset, + record, write_op); count++; list_del(&record->hash); kmem_cache_free(revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); jbd_debug(1, "Wrote %d revoke records\n", count); } -/* +/* * Write out one revoke record. We need to create a new descriptor - * block if the old one is full or if we have not already created one. + * block if the old one is full or if we have not already created one. */ -static void write_one_revoke_record(journal_t *journal, +static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, - struct journal_head **descriptorp, + struct journal_head **descriptorp, int *offsetp, - struct jbd_revoke_record_s *record) + struct jbd_revoke_record_s *record, + int write_op) { struct journal_head *descriptor; int offset; @@ -562,7 +595,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } } @@ -584,22 +617,22 @@ static void write_one_revoke_record(journal_t *journal, *descriptorp = descriptor; } - * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = + * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = cpu_to_be32(record->blocknr); offset += 4; *offsetp = offset; } -/* +/* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to * be waited for during commit, so it has to go onto the appropriate * journal buffer list. */ -static void flush_descriptor(journal_t *journal, - struct journal_head *descriptor, - int offset) +static void flush_descriptor(journal_t *journal, + struct journal_head *descriptor, + int offset, int write_op) { journal_revoke_header_t *header; struct buffer_head *bh = jh2bh(descriptor); @@ -614,11 +647,11 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block(SWRITE, 1, &bh); + write_dirty_buffer(bh, write_op); } #endif -/* +/* * Revoke support for recovery. * * Recovery needs to be able to: @@ -629,7 +662,7 @@ static void flush_descriptor(journal_t *journal, * check whether a given block in a given transaction should be replayed * (ie. has not been revoked by a revoke record in that or a subsequent * transaction) - * + * * empty the revoke table after recovery. */ @@ -637,11 +670,11 @@ static void flush_descriptor(journal_t *journal, * First, setting revoke records. We create a new revoke record for * every block ever revoked in the log as we scan it for recovery, and * we update the existing records if we find multiple revokes for a - * single block. + * single block. */ -int journal_set_revoke(journal_t *journal, - unsigned long blocknr, +int journal_set_revoke(journal_t *journal, + unsigned int blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; @@ -653,19 +686,19 @@ int journal_set_revoke(journal_t *journal, if (tid_gt(sequence, record->sequence)) record->sequence = sequence; return 0; - } + } return insert_revoke_hash(journal, blocknr, sequence); } -/* +/* * Test revoke records. For a given block referenced in the log, has * that block been revoked? A revoke record with a given transaction * sequence number revokes all blocks in that transaction and earlier * ones, but later transactions still need replayed. */ -int journal_test_revoke(journal_t *journal, - unsigned long blocknr, +int journal_test_revoke(journal_t *journal, + unsigned int blocknr, tid_t sequence) { struct jbd_revoke_record_s *record; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 13cb05bf604..1695ba8334a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1,6 +1,6 @@ /* - * linux/fs/transaction.c - * + * linux/fs/jbd/transaction.c + * * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 * * Copyright 1998 Red Hat corp --- All Rights Reserved @@ -10,7 +10,7 @@ * option, any later version, incorporated herein by reference. * * Generic filesystem transaction handling code; part of the ext2fs - * journaling system. + * journaling system. * * This file manages transactions (compound commits managed by the * journaling code) and handles (individual atomic operations by the @@ -23,9 +23,11 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/timer.h> -#include <linux/smp_lock.h> #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/hrtimer.h> + +static void __journal_temp_unlink_buffer(struct journal_head *jh); /* * get_transaction: obtain a new transaction_t object. @@ -48,13 +50,15 @@ get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer->expires = transaction->t_expires; - add_timer(journal->j_commit_timer); + journal->j_commit_timer.expires = + round_jiffies_up(transaction->t_expires); + add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); journal->j_running_transaction = transaction; @@ -74,7 +78,7 @@ get_transaction(journal_t *journal, transaction_t *transaction) * start_this_handle: Given a handle, deal with any locking or stalling * needed to make sure that there is enough journal space for the handle * to begin. Attach the handle to a transaction and set up the - * transaction's buffer credits. + * transaction's buffer credits. */ static int start_this_handle(journal_t *journal, handle_t *handle) @@ -95,13 +99,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = jbd_kmalloc(sizeof(*new_transaction), - GFP_NOFS); + new_transaction = kzalloc(sizeof(*new_transaction), + GFP_NOFS|__GFP_NOFAIL); if (!new_transaction) { ret = -ENOMEM; goto out; } - memset(new_transaction, 0, sizeof(*new_transaction)); } jbd_debug(3, "New handle %p going live.\n", handle); @@ -117,7 +120,7 @@ repeat_locked: if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { spin_unlock(&journal->j_state_lock); - ret = -EROFS; + ret = -EROFS; goto out; } @@ -182,7 +185,7 @@ repeat_locked: goto repeat; } - /* + /* * The commit code assumes that it can get enough log space * without forcing a checkpoint. This is *critical* for * correctness: a checkpoint of a buffer which is also @@ -191,7 +194,7 @@ repeat_locked: * * We must therefore ensure the necessary space in the journal * *before* starting to dirty potentially checkpointed buffers - * in the new transaction. + * in the new transaction. * * The worst part is, any transaction currently committing can * reduce the free space arbitrarily. Be careful to account for @@ -204,7 +207,7 @@ repeat_locked: * the committing transaction. Really, we only need to give it * committing_transaction->t_outstanding_credits plus "enough" for * the log control blocks. - * Also, this test is inconsitent with the matching one in + * Also, this test is inconsistent with the matching one in * journal_extend(). */ if (__log_space_left(journal) < jbd_space_needed(journal)) { @@ -226,38 +229,44 @@ repeat_locked: __log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); out: - if (new_transaction) + if (unlikely(new_transaction)) /* It's usually NULL */ kfree(new_transaction); return ret; } +static struct lock_class_key jbd_handle_key; + /* Allocate a new handle. This should probably be in a slab... */ static handle_t *new_handle(int nblocks) { handle_t *handle = jbd_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; + lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0); + return handle; } /** - * handle_t *journal_start() - Obtain a new handle. + * handle_t *journal_start() - Obtain a new handle. * @journal: Journal to start transaction on. * @nblocks: number of block buffer we might modify * * We make sure that the transaction can guarantee at least nblocks of * modified buffers in the log. We block until the log can guarantee - * that much space. + * that much space. * * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or NULL on failure + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. */ handle_t *journal_start(journal_t *journal, int nblocks) { @@ -292,11 +301,11 @@ handle_t *journal_start(journal_t *journal, int nblocks) * int journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. - * + * * Some transactions, such as large extends and truncates, can be done * atomically all at once or in several stages. The operation requests * a credit for a number of buffer modications in advance, but can - * extend its credit if it needs more. + * extend its credit if it needs more. * * journal_extend tries to give the running handle more buffer credits. * It does not guarantee that allocation - this is a best-effort only. @@ -360,10 +369,10 @@ out: /** - * int journal_restart() - restart a handle . + * int journal_restart() - restart a handle. * @handle: handle to restart * @nblocks: nr credits requested - * + * * Restart a handle for a multi-transaction filesystem * operation. * @@ -405,6 +414,7 @@ int journal_restart(handle_t *handle, int nblocks) __log_start_commit(journal, transaction->t_tid); spin_unlock(&journal->j_state_lock); + lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; ret = start_this_handle(journal, handle); return ret; @@ -415,17 +425,34 @@ int journal_restart(handle_t *handle, int nblocks) * void journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. * - * This locks out any further updates from being started, and blocks - * until all existing updates have completed, returning only once the - * journal is in a quiescent state with no updates running. + * This locks out any further updates from being started, and blocks until all + * existing updates have completed, returning only once the journal is in a + * quiescent state with no updates running. * - * The journal lock should not be held on entry. + * We do not use simple mutex for synchronization as there are syscalls which + * want to return with filesystem locked and that trips up lockdep. Also + * hibernate needs to lock filesystem but locked mutex then blocks hibernation. + * Since locking filesystem is rare operation, we use simple counter and + * waitqueue for locking. */ void journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); +wait: + /* Wait for previous locked operation to finish */ + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + spin_lock(&journal->j_state_lock); + /* + * Check reliably under the lock whether we are the ones winning the race + * and locking the journal + */ + if (journal->j_barrier_count > 0) { + spin_unlock(&journal->j_state_lock); + goto wait; + } ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -449,63 +476,33 @@ void journal_lock_updates(journal_t *journal) spin_lock(&journal->j_state_lock); } spin_unlock(&journal->j_state_lock); - - /* - * We have now established a barrier against other normal updates, but - * we also need to barrier against other journal_lock_updates() calls - * to make sure that we serialise special journal-locked operations - * too. - */ - down(&journal->j_barrier); } /** * void journal_unlock_updates (journal_t* journal) - release barrier * @journal: Journal to release the barrier on. - * - * Release a transaction barrier obtained with journal_lock_updates(). * - * Should be called without the journal lock held. + * Release a transaction barrier obtained with journal_lock_updates(). */ void journal_unlock_updates (journal_t *journal) { J_ASSERT(journal->j_barrier_count != 0); - up(&journal->j_barrier); spin_lock(&journal->j_state_lock); --journal->j_barrier_count; spin_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_transaction_locked); } -/* - * Report any unexpected dirty buffers which turn up. Normally those - * indicate an error, but they can occur if the user is running (say) - * tune2fs to modify the live filesystem, so we need the option of - * continuing as gracefully as possible. # - * - * The caller should already hold the journal lock and - * j_list_lock spinlock: most callers will need those anyway - * in order to probe the buffer's journaling state safely. - */ -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) +static void warn_dirty_buffer(struct buffer_head *bh) { - int jlist; - - /* If this buffer is one which might reasonably be dirty - * --- ie. data, or not part of this journal --- then - * we're OK to leave it alone, but otherwise we need to - * move the dirty bit to the journal's own internal - * JBDDirty bit. */ - jlist = jh->b_jlist; + char b[BDEVNAME_SIZE]; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - struct buffer_head *bh = jh2bh(jh); - - if (test_clear_buffer_dirty(bh)) - set_buffer_jbddirty(bh); - } + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* @@ -535,7 +532,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, transaction = handle->h_transaction; journal = transaction->t_journal; - jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: @@ -547,8 +544,8 @@ repeat: jbd_lock_bh_state(bh); /* We now hold the buffer lock so it is safe to query the buffer - * state. Is the buffer dirty? - * + * state. Is the buffer dirty? + * * If so, there are two possibilities. The buffer may be * non-journaled, and undergoing a quite legitimate writeback. * Otherwise, it is journaled, and we don't expect dirty buffers @@ -566,21 +563,23 @@ repeat: */ if (jh->b_transaction) { J_ASSERT_JH(jh, - jh->b_transaction == transaction || + jh->b_transaction == transaction || jh->b_transaction == journal->j_committing_transaction); if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + warn_dirty_buffer(bh); } /* * In any case we need to clean the dirty flag and we must * do it under the buffer lock to be sure we don't race * with running write-out. */ - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); - jbd_unexpected_dirty_buffer(jh); - } + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); + } unlock_buffer(bh); @@ -600,6 +599,12 @@ repeat: goto done; /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ @@ -653,7 +658,7 @@ repeat: * buffer had better remain locked during the kmalloc, * but that should be true --- we hold the journal lock * still and the buffer is already on the BUF_JOURNAL - * list so won't be flushed. + * list so won't be flushed. * * Subtle point, though: if this is a get_undo_access, * then we will be relying on the frozen_data to contain @@ -666,12 +671,13 @@ repeat: if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, - GFP_NOFS); + frozen_buffer = + jbd_alloc(jh2bh(jh)->b_size, + GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", - __FUNCTION__); + __func__); JBUFFER_TRACE(jh, "oom!"); error = -ENOMEM; jbd_lock_bh_state(bh); @@ -695,7 +701,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __journal_file_buffer(jh, transaction, BJ_Reserved); @@ -711,10 +716,10 @@ done: J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), "Possible IO failure.\n"); page = jh2bh(jh)->b_page; - offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; - source = kmap_atomic(page, KM_USER0); + offset = offset_in_page(jh2bh(jh)->b_data); + source = kmap_atomic(page); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); - kunmap_atomic(source, KM_USER0); + kunmap_atomic(source); } jbd_unlock_bh_state(bh); @@ -725,8 +730,8 @@ done: journal_cancel_revoke(handle, jh); out: - if (frozen_buffer) - kfree(frozen_buffer); + if (unlikely(frozen_buffer)) /* It's usually NULL */ + jbd_free(frozen_buffer, bh->b_size); JBUFFER_TRACE(jh, "exit"); return error; @@ -736,7 +741,6 @@ out: * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes - * @credits: variable that will receive credits for the buffer * * Returns an error code or 0 on success. * @@ -764,8 +768,8 @@ int journal_get_write_access(handle_t *handle, struct buffer_head *bh) * manually rather than reading off disk), then we need to keep the * buffer_head locked until it has been completely filled with new * data. In this case, we should be able to make the assertion that - * the bh is not already part of an existing transaction. - * + * the bh is not already part of an existing transaction. + * * The buffer should already be locked by the caller by this point. * There is no lock ranking violation: it was a newly created, * unlocked buffer beforehand. */ @@ -777,7 +781,7 @@ int journal_get_write_access(handle_t *handle, struct buffer_head *bh) * * Call this if you create a new bh. */ -int journal_get_create_access(handle_t *handle, struct buffer_head *bh) +int journal_get_create_access(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; @@ -809,10 +813,25 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { - jh->b_transaction = transaction; + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); + + /* first access by this transaction */ + jh->b_modified = 0; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); __journal_file_buffer(jh, transaction, BJ_Reserved); } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + JBUFFER_TRACE(jh, "set next transaction"); jh->b_next_transaction = transaction; } @@ -828,17 +847,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); journal_cancel_revoke(handle, jh); - journal_put_journal_head(jh); out: + journal_put_journal_head(jh); return err; } /** - * int journal_get_undo_access() - Notify intent to modify metadata with - * non-rewindable consequences + * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences * @handle: transaction * @bh: buffer to undo - * @credits: store the number of taken credits here (if not NULL) * * Sometimes there is a need to distinguish between metadata which has * been committed to disk and that which has not. The ext3fs code uses @@ -846,13 +863,13 @@ out: * do not reuse freed space until the deallocation has been committed, * since if we overwrote that space we would make the delete * un-rewindable in case of a crash. - * + * * To deal with that, journal_get_undo_access requests write access to a * buffer for parts of non-rewindable operations such as delete * operations on the bitmaps. The journaling code must keep a copy of * the buffer's contents prior to the undo_access call until such time * as we know that the buffer has definitely been committed to disk. - * + * * We never need to know which transaction the committed data is part * of, buffers touched here are guaranteed to be dirtied later and so * will be committed to a new transaction in due course, at which point @@ -879,10 +896,10 @@ int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) repeat: if (!jh->b_committed_data) { - committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS); + committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", - __FUNCTION__); + printk(KERN_ERR "%s: No memory for committed data\n", + __func__); err = -ENOMEM; goto out; } @@ -905,18 +922,20 @@ repeat: jbd_unlock_bh_state(bh); out: journal_put_journal_head(jh); - if (committed_data) - kfree(committed_data); + if (unlikely(committed_data)) + jbd_free(committed_data, bh->b_size); return err; } -/** - * int journal_dirty_data() - mark a buffer as containing dirty data which - * needs to be flushed before we can commit the - * current transaction. +/** + * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed * @handle: transaction * @bh: bufferhead to mark - * + * + * Description: + * Mark a buffer as containing dirty data which needs to be flushed before + * we can commit the current transaction. + * * The buffer is placed on the transaction's data list and is marked as * belonging to the transaction. * @@ -930,9 +949,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) journal_t *journal = handle->h_transaction->t_journal; int need_brelse = 0; struct journal_head *jh; + int ret = 0; if (is_handle_aborted(handle)) - return 0; + return ret; jh = journal_add_journal_head(bh); JBUFFER_TRACE(jh, "entry"); @@ -945,15 +965,15 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) /* * What if the buffer is already part of a running transaction? - * + * * There are two cases: * 1) It is part of the current running transaction. Refile it, * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. + * it, then reallocated it as data. * 2) It is part of the previous, still-committing transaction. * If all we want to do is to guarantee that the buffer will be * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same + * being sure that the *previous* transaction has this same * property is sufficient for us! Just leave it on its old * transaction. * @@ -966,6 +986,13 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) */ jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); + + /* Now that we have bh_state locked, are we really still mapped? */ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); + goto no_journal; + } + if (jh->b_transaction) { JBUFFER_TRACE(jh, "has transaction"); if (jh->b_transaction != handle->h_transaction) { @@ -1027,12 +1054,27 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) sync_dirty_buffer(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); + /* Since we dropped the lock... */ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "buffer got unmapped"); + goto no_journal; + } /* The buffer may become locked again at any time if it is redirtied */ } - /* journal_clean_data_list() may have got there first */ - if (jh->b_transaction != NULL) { + /* + * We cannot remove the buffer with io error from the + * committing transaction, because otherwise it would + * miss the error and the commit would not abort. + */ + if (unlikely(!buffer_uptodate(bh))) { + ret = -EIO; + goto no_journal; + } + /* We might have slept so buffer could be refiled now */ + if (jh->b_transaction != NULL && + jh->b_transaction != handle->h_transaction) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); /* It still points to the committing @@ -1053,8 +1095,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { JBUFFER_TRACE(jh, "not on correct data list: unfile"); J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; JBUFFER_TRACE(jh, "file as data"); __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); @@ -1072,21 +1112,21 @@ no_journal: } JBUFFER_TRACE(jh, "exit"); journal_put_journal_head(jh); - return 0; + return ret; } -/** - * int journal_dirty_metadata() - mark a buffer as containing dirty metadata +/** + * int journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. - * @bh: buffer to mark - * - * mark dirty metadata which needs to be journaled as part of the current + * @bh: buffer to mark + * + * Mark dirty metadata which needs to be journaled as part of the current * transaction. * * The buffer is placed on the transaction's metadata list and is marked - * as belonging to the transaction. + * as belonging to the transaction. * - * Returns error number or 0 on success. + * Returns error number or 0 on success. * * Special care needs to be taken if the buffer already belongs to the * current committing transaction (in which case we should have frozen @@ -1134,11 +1174,11 @@ int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) set_buffer_jbddirty(bh); - /* + /* * Metadata already on the current transaction list doesn't * need to be filed. Metadata on another transaction's list must * be committing, and will be refiled once the commit completes: - * leave it alone for now. + * leave it alone for now. */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); @@ -1151,7 +1191,7 @@ int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } /* That test should have eliminated the following case: */ - J_ASSERT_JH(jh, jh->b_frozen_data == 0); + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); JBUFFER_TRACE(jh, "file as BJ_Metadata"); spin_lock(&journal->j_list_lock); @@ -1164,7 +1204,7 @@ out: return 0; } -/* +/* * journal_release_buffer: undo a get_write_access without any buffer * updates, if the update decided in the end that it didn't need access. * @@ -1175,20 +1215,20 @@ journal_release_buffer(handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); } -/** +/** * void journal_forget() - bforget() for potentially-journaled buffers. * @handle: transaction handle * @bh: bh to 'forget' * * We can only do the bforget if there are no commits pending against the * buffer. If the buffer is dirty in the current running transaction we - * can safely unlink it. + * can safely unlink it. * * bh may not be a journalled buffer at all - it may be a non-JBD * buffer which came off the hashtable. Check for this. * * Decrements bh->b_count by one. - * + * * Allow this call even if the handle has aborted --- it may be part of * the caller's cleanup after an abort. */ @@ -1199,6 +1239,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int drop_reserve = 0; int err = 0; + int was_modified = 0; BUFFER_TRACE(bh, "entry"); @@ -1217,6 +1258,9 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) goto not_jbd; } + /* keep track of whether or not this transaction modified us */ + was_modified = jh->b_modified; + /* * The buffer's going from the transaction, we must drop * all references -bzzz @@ -1234,9 +1278,14 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); - drop_reserve = 1; + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; - /* + /* * We are no longer going to journal this buffer. * However, the commit of this transaction is still * important to the buffer: the delete that we are now @@ -1245,7 +1294,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) * * So, if we have a checkpoint on the buffer, we should * now refile the buffer on our BJ_Forget list so that - * we know to remove the checkpoint after we commit. + * we know to remove the checkpoint after we commit. */ if (jh->b_cp_transaction) { @@ -1253,8 +1302,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) __journal_file_buffer(jh, transaction, BJ_Forget); } else { __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1263,7 +1310,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) } } } else if (jh->b_transaction) { - J_ASSERT_JH(jh, (jh->b_transaction == + J_ASSERT_JH(jh, (jh->b_transaction == journal->j_committing_transaction)); /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ @@ -1274,7 +1321,13 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == transaction); jh->b_next_transaction = NULL; - drop_reserve = 1; + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; } } @@ -1293,7 +1346,7 @@ drop: /** * int journal_stop() - complete a transaction * @handle: tranaction to complete. - * + * * All done for a particular handle. * * There is not much action needed here. We just return any remaining @@ -1302,7 +1355,7 @@ drop: * filesystem is marked for synchronous update. * * journal_stop itself will not usually return an error, but it may - * do so in unusual circumstances. In particular, expect it to + * do so in unusual circumstances. In particular, expect it to * return -EIO if a journal_abort has been executed since the * transaction began. */ @@ -1310,15 +1363,17 @@ int journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int old_handle_count, err; + int err; + pid_t pid; - J_ASSERT(transaction->t_updates > 0); J_ASSERT(journal_current_handle() == handle); if (is_handle_aborted(handle)) err = -EIO; - else + else { + J_ASSERT(transaction->t_updates > 0); err = 0; + } if (--handle->h_ref > 0) { jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, @@ -1336,12 +1391,45 @@ int journal_stop(handle_t *handle) * It doesn't cost much - we're about to run a commit and sleep * on IO anyway. Speeds up many-threaded, many-dir operations * by 30x or more... + * + * We try and optimize the sleep time against what the underlying disk + * can do, instead of having a static sleep time. This is useful for + * the case where our storage is so fast that it is more optimal to go + * ahead and force a flush and wait for the transaction to be committed + * than it is to wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how long it takes + * to commit a transaction, and compare it with how long this + * transaction has been running, and if run time < commit time then we + * sleep for the delta and commit. This greatly helps super fast disks + * that would see slowdowns as more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one to + * perform a synchronous write. We do this to detect the case where a + * single process is doing a stream of sync writes. No point in waiting + * for joiners in that case. */ - if (handle->h_sync) { - do { - old_handle_count = transaction->t_handle_count; - schedule_timeout_uninterruptible(1); - } while (old_handle_count != transaction->t_handle_count); + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + + journal->j_last_sync_writer = pid; + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = min_t(u64, commit_time, + 1000*jiffies_to_usecs(1)); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } } current->journal_info = NULL; @@ -1364,7 +1452,7 @@ int journal_stop(handle_t *handle) if (handle->h_sync || transaction->t_outstanding_credits > journal->j_max_transaction_buffers || - time_after_eq(jiffies, transaction->t_expires)) { + time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write * anything to disk. */ @@ -1379,7 +1467,7 @@ int journal_stop(handle_t *handle) /* * Special case: JFS_SYNC synchronous updates require us - * to wait for the commit to complete. + * to wait for the commit to complete. */ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) err = log_wait_commit(journal, tid); @@ -1388,11 +1476,14 @@ int journal_stop(handle_t *handle) spin_unlock(&journal->j_state_lock); } + lock_map_release(&handle->h_lockdep_map); + jbd_free_handle(handle); return err; } -/**int journal_force_commit() - force any uncommitted transactions +/** + * int journal_force_commit() - force any uncommitted transactions * @journal: journal to force * * For synchronous operations: force any uncommitted transactions @@ -1430,7 +1521,7 @@ int journal_force_commit(journal_t *journal) * jbd_lock_bh_state(jh2bh(jh)) is held. */ -static inline void +static inline void __blist_add_buffer(struct journal_head **list, struct journal_head *jh) { if (!*list) { @@ -1445,7 +1536,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh) } } -/* +/* * Remove a buffer from a transaction list, given the transaction's list * head pointer. * @@ -1466,7 +1557,7 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) jh->b_tnext->b_tprev = jh->b_tprev; } -/* +/* * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of @@ -1477,7 +1568,7 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * * Called under j_list_lock. The journal may not be locked. */ -void __journal_temp_unlink_buffer(struct journal_head *jh) +static void __journal_temp_unlink_buffer(struct journal_head *jh) { struct journal_head **list = NULL; transaction_t *transaction; @@ -1490,7 +1581,7 @@ void __journal_temp_unlink_buffer(struct journal_head *jh) J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); if (jh->b_jlist != BJ_None) - J_ASSERT_JH(jh, transaction != 0); + J_ASSERT_JH(jh, transaction != NULL); switch (jh->b_jlist) { case BJ_None: @@ -1529,19 +1620,32 @@ void __journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ void __journal_unfile_buffer(struct journal_head *jh) { __journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + journal_put_journal_head(jh); } void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1559,25 +1663,21 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (buffer_locked(bh) || buffer_dirty(bh)) goto out; - if (jh->b_next_transaction != 0) + if (jh->b_next_transaction != NULL) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { + if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { /* A written-back ordered data buffer */ JBUFFER_TRACE(jh, "release data"); __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); } - } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { + } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __journal_remove_checkpoint(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1585,18 +1685,19 @@ out: return; } - -/** +/** * int journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free - * @unused_gfp_mask: unused + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. + * * - * * For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN * so try_to_free_buffers() can reap them. - * + * * This function returns non-zero if we wish try_to_free_buffers() * to be called. We do this if the page is releasable by try_to_free_buffers(). * We also do it if the page has locked or dirty buffers and the caller wants @@ -1619,9 +1720,11 @@ out: * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success */ -int journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t unused_gfp_mask) +int journal_try_to_free_buffers(journal_t *journal, + struct page *page, gfp_t gfp_mask) { struct buffer_head *head; struct buffer_head *bh; @@ -1637,7 +1740,7 @@ int journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * journal_remove_journal_head() and journal_put_journal_head(). + * journal_put_journal_head(). */ jh = journal_grab_journal_head(bh); if (!jh) @@ -1650,7 +1753,9 @@ int journal_try_to_free_buffers(journal_t *journal, if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); + ret = try_to_free_buffers(page); + busy: return ret; } @@ -1672,23 +1777,26 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_temp_unlink_buffer(jh); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); __journal_file_buffer(jh, transaction, BJ_Forget); - clear_buffer_jbddirty(bh); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - journal_remove_journal_head(bh); - __brelse(bh); + __journal_unfile_buffer(jh); } return may_free; } /* - * journal_invalidatepage + * journal_invalidatepage * * This code is tricky. It has a number of cases to deal with. * @@ -1696,15 +1804,15 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * * i_size must be updated on disk before we start calling invalidatepage on the * data. - * + * * This is done in ext3 by defining an ext3_setattr method which * updates i_size before truncate gets going. By maintaining this * invariant, we can be sure that it is safe to throw away any buffers * attached to the current transaction: once the transaction commits, * we know that the data will not be needed. - * + * * Note however that we can *not* throw away data belonging to the - * previous, committing transaction! + * previous, committing transaction! * * Any disk blocks which *are* part of the previous, committing * transaction (and which therefore cannot be discarded immediately) are @@ -1723,7 +1831,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * don't make guarantees about the order in which data hits disk --- in * particular we don't guarantee that new dirty data is flushed before * transaction commit --- so it is always safe just to discard data - * immediately in that mode. --sct + * immediately in that mode. --sct */ /* @@ -1734,15 +1842,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * We're outside-transaction here. Either or both of j_running_transaction * and j_committing_transaction may be NULL. */ -static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, + int partial_page) { transaction_t *transaction; struct journal_head *jh; int may_free = 1; - int ret; BUFFER_TRACE(bh, "entry"); +retry: /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are @@ -1760,6 +1869,29 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!jh) goto zap_buffer_no_jh; + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * block can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + * + * Also we have to clear buffer_mapped flag of a truncated buffer + * because the buffer_head may be attached to the page straddling + * i_size (can happen only when blocksize < pagesize) and thus the + * buffer_head can be reused when the file is extended again. So we end + * up keeping around invalidated buffers attached to transactions' + * BJ_Forget list just to stop checkpointing code from cleaning up + * the transaction this buffer was modified in. + */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it @@ -1785,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * committed, the buffer won't be needed any * longer. */ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_running_transaction); - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* There is no currently-running transaction. So the * orphan record which we wrote for this file must have @@ -1799,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * the committing transaction, if it exists. */ if (journal->j_committing_transaction) { JBUFFER_TRACE(jh, "give to committing trans"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_committing_transaction); - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* The orphan record's transaction has * committed. We can cleanse this buffer */ @@ -1814,6 +1938,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } } } else if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "on committing transaction"); if (jh->b_jlist == BJ_Locked) { /* * The buffer is on the committing transaction's locked @@ -1824,17 +1949,31 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer; } /* - * If it is committing, we simply cannot touch it. We - * can remove it's next_transaction pointer from the - * running transaction if that is set, but nothing - * else. */ - JBUFFER_TRACE(jh, "on committing transaction"); - set_buffer_freed(bh); - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == - journal->j_running_transaction); - jh->b_next_transaction = NULL; + * The buffer is committing, we simply cannot touch + * it. If the page is straddling i_size we have to wait + * for commit and try again. + */ + if (partial_page) { + tid_t tid = journal->j_committing_transaction->t_tid; + + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + unlock_buffer(bh); + log_wait_commit(journal, tid); + lock_buffer(bh); + goto retry; } + /* + * OK, buffer won't be reachable after truncate. We just set + * j_next_transaction to the running transaction (if there is + * one) and mark buffer as freed so that commit code knows it + * should clear dirty bits when it is done with the buffer. + */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1848,10 +1987,19 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * i_size already for this truncate so recovery will not * expose the disk blocks we are discarding here.) */ J_ASSERT_JH(jh, transaction == journal->j_running_transaction); + JBUFFER_TRACE(jh, "on running transaction"); may_free = __dispose_buffer(jh, transaction); } zap_buffer: + /* + * This is tricky. Although the buffer is truncated, it may be reused + * if blocksize < pagesize and it is attached to the page straddling + * EOF. Since the buffer might have been added to BJ_Forget list of the + * running transaction, journal_get_write_access() won't clear + * b_modified and credit accounting gets confused. So clear b_modified + * here. */ + jh->b_modified = 0; journal_put_journal_head(jh); zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); @@ -1867,28 +2015,32 @@ zap_buffer_unlocked: return may_free; } -/** - * int journal_invalidatepage() - * @journal: journal to use for flush... +/** + * void journal_invalidatepage() - invalidate a journal page + * @journal: journal to use for flush * @page: page to flush - * @offset: length of page to invalidate. + * @offset: offset of the range to invalidate + * @length: length of the range to invalidate * - * Reap page buffers containing data after offset in page. - * - * Return non-zero if the page's buffers were successfully reaped. + * Reap page buffers containing data in specified range in page. */ -int journal_invalidatepage(journal_t *journal, - struct page *page, - unsigned long offset) +void journal_invalidatepage(journal_t *journal, + struct page *page, + unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; unsigned int curr_off = 0; + int partial_page = (offset || length < PAGE_CACHE_SIZE); int may_free = 1; if (!PageLocked(page)) BUG(); if (!page_has_buffers(page)) - return 1; + return; + + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be @@ -1899,10 +2051,14 @@ int journal_invalidatepage(journal_t *journal, unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; + if (next_off > stop) + return; + if (offset <= curr_off) { - /* This block is wholly outside the truncation point */ + /* This block is wholly outside the truncation point */ lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh); + may_free &= journal_unmap_buffer(journal, bh, + partial_page); unlock_buffer(bh); } curr_off = next_off; @@ -1910,16 +2066,14 @@ int journal_invalidatepage(journal_t *journal, } while (bh != head); - if (!offset) { - if (!may_free || !try_to_free_buffers(page)) - return 0; - J_ASSERT(!page_has_buffers(page)); + if (!partial_page) { + if (may_free && try_to_free_buffers(page)) + J_ASSERT(!page_has_buffers(page)); } - return 1; } -/* - * File a buffer on the given transaction list. +/* + * File a buffer on the given transaction list. */ void __journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) @@ -1933,17 +2087,22 @@ void __journal_file_buffer(struct journal_head *jh, J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); J_ASSERT_JH(jh, jh->b_transaction == transaction || - jh->b_transaction == 0); + jh->b_transaction == NULL); if (jh->b_transaction && jh->b_jlist == jlist) return; - /* The following list of buffer states needs to be consistent - * with __jbd_unexpected_dirty_buffer()'s handling of dirty - * state. */ - - if (jlist == BJ_Metadata || jlist == BJ_Reserved || + if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; @@ -1951,6 +2110,8 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __journal_temp_unlink_buffer(jh); + else + journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2002,19 +2163,20 @@ void journal_file_buffer(struct journal_head *jh, jbd_unlock_bh_state(jh2bh(jh)); } -/* +/* * Remove a buffer from its current buffer list in preparation for * dropping it from its current transaction entirely. If the buffer has * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __journal_refile_buffer(struct journal_head *jh) { - int was_dirty; + int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); @@ -2034,9 +2196,20 @@ void __journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __journal_file_buffer() must not take a + * new one. + */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; - __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) @@ -2044,30 +2217,21 @@ void __journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __journal_refile_buffer() with necessary locking added. We take our bh + * reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. */ void journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } |
