diff options
Diffstat (limited to 'fs/jbd/transaction.c')
| -rw-r--r-- | fs/jbd/transaction.c | 662 |
1 files changed, 413 insertions, 249 deletions
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 13cb05bf604..1695ba8334a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1,6 +1,6 @@ /* - * linux/fs/transaction.c - * + * linux/fs/jbd/transaction.c + * * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 * * Copyright 1998 Red Hat corp --- All Rights Reserved @@ -10,7 +10,7 @@ * option, any later version, incorporated herein by reference. * * Generic filesystem transaction handling code; part of the ext2fs - * journaling system. + * journaling system. * * This file manages transactions (compound commits managed by the * journaling code) and handles (individual atomic operations by the @@ -23,9 +23,11 @@ #include <linux/errno.h> #include <linux/slab.h> #include <linux/timer.h> -#include <linux/smp_lock.h> #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/hrtimer.h> + +static void __journal_temp_unlink_buffer(struct journal_head *jh); /* * get_transaction: obtain a new transaction_t object. @@ -48,13 +50,15 @@ get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); /* Set up the commit timer for the new transaction. */ - journal->j_commit_timer->expires = transaction->t_expires; - add_timer(journal->j_commit_timer); + journal->j_commit_timer.expires = + round_jiffies_up(transaction->t_expires); + add_timer(&journal->j_commit_timer); J_ASSERT(journal->j_running_transaction == NULL); journal->j_running_transaction = transaction; @@ -74,7 +78,7 @@ get_transaction(journal_t *journal, transaction_t *transaction) * start_this_handle: Given a handle, deal with any locking or stalling * needed to make sure that there is enough journal space for the handle * to begin. Attach the handle to a transaction and set up the - * transaction's buffer credits. + * transaction's buffer credits. */ static int start_this_handle(journal_t *journal, handle_t *handle) @@ -95,13 +99,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = jbd_kmalloc(sizeof(*new_transaction), - GFP_NOFS); + new_transaction = kzalloc(sizeof(*new_transaction), + GFP_NOFS|__GFP_NOFAIL); if (!new_transaction) { ret = -ENOMEM; goto out; } - memset(new_transaction, 0, sizeof(*new_transaction)); } jbd_debug(3, "New handle %p going live.\n", handle); @@ -117,7 +120,7 @@ repeat_locked: if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { spin_unlock(&journal->j_state_lock); - ret = -EROFS; + ret = -EROFS; goto out; } @@ -182,7 +185,7 @@ repeat_locked: goto repeat; } - /* + /* * The commit code assumes that it can get enough log space * without forcing a checkpoint. This is *critical* for * correctness: a checkpoint of a buffer which is also @@ -191,7 +194,7 @@ repeat_locked: * * We must therefore ensure the necessary space in the journal * *before* starting to dirty potentially checkpointed buffers - * in the new transaction. + * in the new transaction. * * The worst part is, any transaction currently committing can * reduce the free space arbitrarily. Be careful to account for @@ -204,7 +207,7 @@ repeat_locked: * the committing transaction. Really, we only need to give it * committing_transaction->t_outstanding_credits plus "enough" for * the log control blocks. - * Also, this test is inconsitent with the matching one in + * Also, this test is inconsistent with the matching one in * journal_extend(). */ if (__log_space_left(journal) < jbd_space_needed(journal)) { @@ -226,38 +229,44 @@ repeat_locked: __log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); + + lock_map_acquire(&handle->h_lockdep_map); out: - if (new_transaction) + if (unlikely(new_transaction)) /* It's usually NULL */ kfree(new_transaction); return ret; } +static struct lock_class_key jbd_handle_key; + /* Allocate a new handle. This should probably be in a slab... */ static handle_t *new_handle(int nblocks) { handle_t *handle = jbd_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; + lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0); + return handle; } /** - * handle_t *journal_start() - Obtain a new handle. + * handle_t *journal_start() - Obtain a new handle. * @journal: Journal to start transaction on. * @nblocks: number of block buffer we might modify * * We make sure that the transaction can guarantee at least nblocks of * modified buffers in the log. We block until the log can guarantee - * that much space. + * that much space. * * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or NULL on failure + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. */ handle_t *journal_start(journal_t *journal, int nblocks) { @@ -292,11 +301,11 @@ handle_t *journal_start(journal_t *journal, int nblocks) * int journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. - * + * * Some transactions, such as large extends and truncates, can be done * atomically all at once or in several stages. The operation requests * a credit for a number of buffer modications in advance, but can - * extend its credit if it needs more. + * extend its credit if it needs more. * * journal_extend tries to give the running handle more buffer credits. * It does not guarantee that allocation - this is a best-effort only. @@ -360,10 +369,10 @@ out: /** - * int journal_restart() - restart a handle . + * int journal_restart() - restart a handle. * @handle: handle to restart * @nblocks: nr credits requested - * + * * Restart a handle for a multi-transaction filesystem * operation. * @@ -405,6 +414,7 @@ int journal_restart(handle_t *handle, int nblocks) __log_start_commit(journal, transaction->t_tid); spin_unlock(&journal->j_state_lock); + lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; ret = start_this_handle(journal, handle); return ret; @@ -415,17 +425,34 @@ int journal_restart(handle_t *handle, int nblocks) * void journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. * - * This locks out any further updates from being started, and blocks - * until all existing updates have completed, returning only once the - * journal is in a quiescent state with no updates running. + * This locks out any further updates from being started, and blocks until all + * existing updates have completed, returning only once the journal is in a + * quiescent state with no updates running. * - * The journal lock should not be held on entry. + * We do not use simple mutex for synchronization as there are syscalls which + * want to return with filesystem locked and that trips up lockdep. Also + * hibernate needs to lock filesystem but locked mutex then blocks hibernation. + * Since locking filesystem is rare operation, we use simple counter and + * waitqueue for locking. */ void journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); +wait: + /* Wait for previous locked operation to finish */ + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + spin_lock(&journal->j_state_lock); + /* + * Check reliably under the lock whether we are the ones winning the race + * and locking the journal + */ + if (journal->j_barrier_count > 0) { + spin_unlock(&journal->j_state_lock); + goto wait; + } ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -449,63 +476,33 @@ void journal_lock_updates(journal_t *journal) spin_lock(&journal->j_state_lock); } spin_unlock(&journal->j_state_lock); - - /* - * We have now established a barrier against other normal updates, but - * we also need to barrier against other journal_lock_updates() calls - * to make sure that we serialise special journal-locked operations - * too. - */ - down(&journal->j_barrier); } /** * void journal_unlock_updates (journal_t* journal) - release barrier * @journal: Journal to release the barrier on. - * - * Release a transaction barrier obtained with journal_lock_updates(). * - * Should be called without the journal lock held. + * Release a transaction barrier obtained with journal_lock_updates(). */ void journal_unlock_updates (journal_t *journal) { J_ASSERT(journal->j_barrier_count != 0); - up(&journal->j_barrier); spin_lock(&journal->j_state_lock); --journal->j_barrier_count; spin_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_transaction_locked); } -/* - * Report any unexpected dirty buffers which turn up. Normally those - * indicate an error, but they can occur if the user is running (say) - * tune2fs to modify the live filesystem, so we need the option of - * continuing as gracefully as possible. # - * - * The caller should already hold the journal lock and - * j_list_lock spinlock: most callers will need those anyway - * in order to probe the buffer's journaling state safely. - */ -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) +static void warn_dirty_buffer(struct buffer_head *bh) { - int jlist; - - /* If this buffer is one which might reasonably be dirty - * --- ie. data, or not part of this journal --- then - * we're OK to leave it alone, but otherwise we need to - * move the dirty bit to the journal's own internal - * JBDDirty bit. */ - jlist = jh->b_jlist; + char b[BDEVNAME_SIZE]; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - struct buffer_head *bh = jh2bh(jh); - - if (test_clear_buffer_dirty(bh)) - set_buffer_jbddirty(bh); - } + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* @@ -535,7 +532,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, transaction = handle->h_transaction; journal = transaction->t_journal; - jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); + jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: @@ -547,8 +544,8 @@ repeat: jbd_lock_bh_state(bh); /* We now hold the buffer lock so it is safe to query the buffer - * state. Is the buffer dirty? - * + * state. Is the buffer dirty? + * * If so, there are two possibilities. The buffer may be * non-journaled, and undergoing a quite legitimate writeback. * Otherwise, it is journaled, and we don't expect dirty buffers @@ -566,21 +563,23 @@ repeat: */ if (jh->b_transaction) { J_ASSERT_JH(jh, - jh->b_transaction == transaction || + jh->b_transaction == transaction || jh->b_transaction == journal->j_committing_transaction); if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + warn_dirty_buffer(bh); } /* * In any case we need to clean the dirty flag and we must * do it under the buffer lock to be sure we don't race * with running write-out. */ - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); - jbd_unexpected_dirty_buffer(jh); - } + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); + } unlock_buffer(bh); @@ -600,6 +599,12 @@ repeat: goto done; /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ @@ -653,7 +658,7 @@ repeat: * buffer had better remain locked during the kmalloc, * but that should be true --- we hold the journal lock * still and the buffer is already on the BUF_JOURNAL - * list so won't be flushed. + * list so won't be flushed. * * Subtle point, though: if this is a get_undo_access, * then we will be relying on the frozen_data to contain @@ -666,12 +671,13 @@ repeat: if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, - GFP_NOFS); + frozen_buffer = + jbd_alloc(jh2bh(jh)->b_size, + GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", - __FUNCTION__); + __func__); JBUFFER_TRACE(jh, "oom!"); error = -ENOMEM; jbd_lock_bh_state(bh); @@ -695,7 +701,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __journal_file_buffer(jh, transaction, BJ_Reserved); @@ -711,10 +716,10 @@ done: J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), "Possible IO failure.\n"); page = jh2bh(jh)->b_page; - offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; - source = kmap_atomic(page, KM_USER0); + offset = offset_in_page(jh2bh(jh)->b_data); + source = kmap_atomic(page); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); - kunmap_atomic(source, KM_USER0); + kunmap_atomic(source); } jbd_unlock_bh_state(bh); @@ -725,8 +730,8 @@ done: journal_cancel_revoke(handle, jh); out: - if (frozen_buffer) - kfree(frozen_buffer); + if (unlikely(frozen_buffer)) /* It's usually NULL */ + jbd_free(frozen_buffer, bh->b_size); JBUFFER_TRACE(jh, "exit"); return error; @@ -736,7 +741,6 @@ out: * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes - * @credits: variable that will receive credits for the buffer * * Returns an error code or 0 on success. * @@ -764,8 +768,8 @@ int journal_get_write_access(handle_t *handle, struct buffer_head *bh) * manually rather than reading off disk), then we need to keep the * buffer_head locked until it has been completely filled with new * data. In this case, we should be able to make the assertion that - * the bh is not already part of an existing transaction. - * + * the bh is not already part of an existing transaction. + * * The buffer should already be locked by the caller by this point. * There is no lock ranking violation: it was a newly created, * unlocked buffer beforehand. */ @@ -777,7 +781,7 @@ int journal_get_write_access(handle_t *handle, struct buffer_head *bh) * * Call this if you create a new bh. */ -int journal_get_create_access(handle_t *handle, struct buffer_head *bh) +int journal_get_create_access(handle_t *handle, struct buffer_head *bh) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; @@ -809,10 +813,25 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { - jh->b_transaction = transaction; + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); + + /* first access by this transaction */ + jh->b_modified = 0; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); __journal_file_buffer(jh, transaction, BJ_Reserved); } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + JBUFFER_TRACE(jh, "set next transaction"); jh->b_next_transaction = transaction; } @@ -828,17 +847,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) */ JBUFFER_TRACE(jh, "cancelling revoke"); journal_cancel_revoke(handle, jh); - journal_put_journal_head(jh); out: + journal_put_journal_head(jh); return err; } /** - * int journal_get_undo_access() - Notify intent to modify metadata with - * non-rewindable consequences + * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences * @handle: transaction * @bh: buffer to undo - * @credits: store the number of taken credits here (if not NULL) * * Sometimes there is a need to distinguish between metadata which has * been committed to disk and that which has not. The ext3fs code uses @@ -846,13 +863,13 @@ out: * do not reuse freed space until the deallocation has been committed, * since if we overwrote that space we would make the delete * un-rewindable in case of a crash. - * + * * To deal with that, journal_get_undo_access requests write access to a * buffer for parts of non-rewindable operations such as delete * operations on the bitmaps. The journaling code must keep a copy of * the buffer's contents prior to the undo_access call until such time * as we know that the buffer has definitely been committed to disk. - * + * * We never need to know which transaction the committed data is part * of, buffers touched here are guaranteed to be dirtied later and so * will be committed to a new transaction in due course, at which point @@ -879,10 +896,10 @@ int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) repeat: if (!jh->b_committed_data) { - committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS); + committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", - __FUNCTION__); + printk(KERN_ERR "%s: No memory for committed data\n", + __func__); err = -ENOMEM; goto out; } @@ -905,18 +922,20 @@ repeat: jbd_unlock_bh_state(bh); out: journal_put_journal_head(jh); - if (committed_data) - kfree(committed_data); + if (unlikely(committed_data)) + jbd_free(committed_data, bh->b_size); return err; } -/** - * int journal_dirty_data() - mark a buffer as containing dirty data which - * needs to be flushed before we can commit the - * current transaction. +/** + * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed * @handle: transaction * @bh: bufferhead to mark - * + * + * Description: + * Mark a buffer as containing dirty data which needs to be flushed before + * we can commit the current transaction. + * * The buffer is placed on the transaction's data list and is marked as * belonging to the transaction. * @@ -930,9 +949,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) journal_t *journal = handle->h_transaction->t_journal; int need_brelse = 0; struct journal_head *jh; + int ret = 0; if (is_handle_aborted(handle)) - return 0; + return ret; jh = journal_add_journal_head(bh); JBUFFER_TRACE(jh, "entry"); @@ -945,15 +965,15 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) /* * What if the buffer is already part of a running transaction? - * + * * There are two cases: * 1) It is part of the current running transaction. Refile it, * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. + * it, then reallocated it as data. * 2) It is part of the previous, still-committing transaction. * If all we want to do is to guarantee that the buffer will be * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same + * being sure that the *previous* transaction has this same * property is sufficient for us! Just leave it on its old * transaction. * @@ -966,6 +986,13 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) */ jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); + + /* Now that we have bh_state locked, are we really still mapped? */ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); + goto no_journal; + } + if (jh->b_transaction) { JBUFFER_TRACE(jh, "has transaction"); if (jh->b_transaction != handle->h_transaction) { @@ -1027,12 +1054,27 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) sync_dirty_buffer(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); + /* Since we dropped the lock... */ + if (!buffer_mapped(bh)) { + JBUFFER_TRACE(jh, "buffer got unmapped"); + goto no_journal; + } /* The buffer may become locked again at any time if it is redirtied */ } - /* journal_clean_data_list() may have got there first */ - if (jh->b_transaction != NULL) { + /* + * We cannot remove the buffer with io error from the + * committing transaction, because otherwise it would + * miss the error and the commit would not abort. + */ + if (unlikely(!buffer_uptodate(bh))) { + ret = -EIO; + goto no_journal; + } + /* We might have slept so buffer could be refiled now */ + if (jh->b_transaction != NULL && + jh->b_transaction != handle->h_transaction) { JBUFFER_TRACE(jh, "unfile from commit"); __journal_temp_unlink_buffer(jh); /* It still points to the committing @@ -1053,8 +1095,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh) if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { JBUFFER_TRACE(jh, "not on correct data list: unfile"); J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; JBUFFER_TRACE(jh, "file as data"); __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); @@ -1072,21 +1112,21 @@ no_journal: } JBUFFER_TRACE(jh, "exit"); journal_put_journal_head(jh); - return 0; + return ret; } -/** - * int journal_dirty_metadata() - mark a buffer as containing dirty metadata +/** + * int journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. - * @bh: buffer to mark - * - * mark dirty metadata which needs to be journaled as part of the current + * @bh: buffer to mark + * + * Mark dirty metadata which needs to be journaled as part of the current * transaction. * * The buffer is placed on the transaction's metadata list and is marked - * as belonging to the transaction. + * as belonging to the transaction. * - * Returns error number or 0 on success. + * Returns error number or 0 on success. * * Special care needs to be taken if the buffer already belongs to the * current committing transaction (in which case we should have frozen @@ -1134,11 +1174,11 @@ int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) set_buffer_jbddirty(bh); - /* + /* * Metadata already on the current transaction list doesn't * need to be filed. Metadata on another transaction's list must * be committing, and will be refiled once the commit completes: - * leave it alone for now. + * leave it alone for now. */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); @@ -1151,7 +1191,7 @@ int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } /* That test should have eliminated the following case: */ - J_ASSERT_JH(jh, jh->b_frozen_data == 0); + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); JBUFFER_TRACE(jh, "file as BJ_Metadata"); spin_lock(&journal->j_list_lock); @@ -1164,7 +1204,7 @@ out: return 0; } -/* +/* * journal_release_buffer: undo a get_write_access without any buffer * updates, if the update decided in the end that it didn't need access. * @@ -1175,20 +1215,20 @@ journal_release_buffer(handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); } -/** +/** * void journal_forget() - bforget() for potentially-journaled buffers. * @handle: transaction handle * @bh: bh to 'forget' * * We can only do the bforget if there are no commits pending against the * buffer. If the buffer is dirty in the current running transaction we - * can safely unlink it. + * can safely unlink it. * * bh may not be a journalled buffer at all - it may be a non-JBD * buffer which came off the hashtable. Check for this. * * Decrements bh->b_count by one. - * + * * Allow this call even if the handle has aborted --- it may be part of * the caller's cleanup after an abort. */ @@ -1199,6 +1239,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int drop_reserve = 0; int err = 0; + int was_modified = 0; BUFFER_TRACE(bh, "entry"); @@ -1217,6 +1258,9 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) goto not_jbd; } + /* keep track of whether or not this transaction modified us */ + was_modified = jh->b_modified; + /* * The buffer's going from the transaction, we must drop * all references -bzzz @@ -1234,9 +1278,14 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); - drop_reserve = 1; + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; - /* + /* * We are no longer going to journal this buffer. * However, the commit of this transaction is still * important to the buffer: the delete that we are now @@ -1245,7 +1294,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) * * So, if we have a checkpoint on the buffer, we should * now refile the buffer on our BJ_Forget list so that - * we know to remove the checkpoint after we commit. + * we know to remove the checkpoint after we commit. */ if (jh->b_cp_transaction) { @@ -1253,8 +1302,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) __journal_file_buffer(jh, transaction, BJ_Forget); } else { __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1263,7 +1310,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) } } } else if (jh->b_transaction) { - J_ASSERT_JH(jh, (jh->b_transaction == + J_ASSERT_JH(jh, (jh->b_transaction == journal->j_committing_transaction)); /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ @@ -1274,7 +1321,13 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == transaction); jh->b_next_transaction = NULL; - drop_reserve = 1; + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; } } @@ -1293,7 +1346,7 @@ drop: /** * int journal_stop() - complete a transaction * @handle: tranaction to complete. - * + * * All done for a particular handle. * * There is not much action needed here. We just return any remaining @@ -1302,7 +1355,7 @@ drop: * filesystem is marked for synchronous update. * * journal_stop itself will not usually return an error, but it may - * do so in unusual circumstances. In particular, expect it to + * do so in unusual circumstances. In particular, expect it to * return -EIO if a journal_abort has been executed since the * transaction began. */ @@ -1310,15 +1363,17 @@ int journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int old_handle_count, err; + int err; + pid_t pid; - J_ASSERT(transaction->t_updates > 0); J_ASSERT(journal_current_handle() == handle); if (is_handle_aborted(handle)) err = -EIO; - else + else { + J_ASSERT(transaction->t_updates > 0); err = 0; + } if (--handle->h_ref > 0) { jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, @@ -1336,12 +1391,45 @@ int journal_stop(handle_t *handle) * It doesn't cost much - we're about to run a commit and sleep * on IO anyway. Speeds up many-threaded, many-dir operations * by 30x or more... + * + * We try and optimize the sleep time against what the underlying disk + * can do, instead of having a static sleep time. This is useful for + * the case where our storage is so fast that it is more optimal to go + * ahead and force a flush and wait for the transaction to be committed + * than it is to wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how long it takes + * to commit a transaction, and compare it with how long this + * transaction has been running, and if run time < commit time then we + * sleep for the delta and commit. This greatly helps super fast disks + * that would see slowdowns as more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one to + * perform a synchronous write. We do this to detect the case where a + * single process is doing a stream of sync writes. No point in waiting + * for joiners in that case. */ - if (handle->h_sync) { - do { - old_handle_count = transaction->t_handle_count; - schedule_timeout_uninterruptible(1); - } while (old_handle_count != transaction->t_handle_count); + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + + journal->j_last_sync_writer = pid; + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = min_t(u64, commit_time, + 1000*jiffies_to_usecs(1)); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } } current->journal_info = NULL; @@ -1364,7 +1452,7 @@ int journal_stop(handle_t *handle) if (handle->h_sync || transaction->t_outstanding_credits > journal->j_max_transaction_buffers || - time_after_eq(jiffies, transaction->t_expires)) { + time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write * anything to disk. */ @@ -1379,7 +1467,7 @@ int journal_stop(handle_t *handle) /* * Special case: JFS_SYNC synchronous updates require us - * to wait for the commit to complete. + * to wait for the commit to complete. */ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) err = log_wait_commit(journal, tid); @@ -1388,11 +1476,14 @@ int journal_stop(handle_t *handle) spin_unlock(&journal->j_state_lock); } + lock_map_release(&handle->h_lockdep_map); + jbd_free_handle(handle); return err; } -/**int journal_force_commit() - force any uncommitted transactions +/** + * int journal_force_commit() - force any uncommitted transactions * @journal: journal to force * * For synchronous operations: force any uncommitted transactions @@ -1430,7 +1521,7 @@ int journal_force_commit(journal_t *journal) * jbd_lock_bh_state(jh2bh(jh)) is held. */ -static inline void +static inline void __blist_add_buffer(struct journal_head **list, struct journal_head *jh) { if (!*list) { @@ -1445,7 +1536,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh) } } -/* +/* * Remove a buffer from a transaction list, given the transaction's list * head pointer. * @@ -1466,7 +1557,7 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) jh->b_tnext->b_tprev = jh->b_tprev; } -/* +/* * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of @@ -1477,7 +1568,7 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * * Called under j_list_lock. The journal may not be locked. */ -void __journal_temp_unlink_buffer(struct journal_head *jh) +static void __journal_temp_unlink_buffer(struct journal_head *jh) { struct journal_head **list = NULL; transaction_t *transaction; @@ -1490,7 +1581,7 @@ void __journal_temp_unlink_buffer(struct journal_head *jh) J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); if (jh->b_jlist != BJ_None) - J_ASSERT_JH(jh, transaction != 0); + J_ASSERT_JH(jh, transaction != NULL); switch (jh->b_jlist) { case BJ_None: @@ -1529,19 +1620,32 @@ void __journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ void __journal_unfile_buffer(struct journal_head *jh) { __journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + journal_put_journal_head(jh); } void journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1559,25 +1663,21 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (buffer_locked(bh) || buffer_dirty(bh)) goto out; - if (jh->b_next_transaction != 0) + if (jh->b_next_transaction != NULL) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { + if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { /* A written-back ordered data buffer */ JBUFFER_TRACE(jh, "release data"); __journal_unfile_buffer(jh); - journal_remove_journal_head(bh); - __brelse(bh); } - } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { + } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __journal_remove_checkpoint(jh); - journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1585,18 +1685,19 @@ out: return; } - -/** +/** * int journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free - * @unused_gfp_mask: unused + * @gfp_mask: we use the mask to detect how hard should we try to release + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to + * release the buffers. + * * - * * For all the buffers on this page, * if they are fully written out ordered data, move them onto BUF_CLEAN * so try_to_free_buffers() can reap them. - * + * * This function returns non-zero if we wish try_to_free_buffers() * to be called. We do this if the page is releasable by try_to_free_buffers(). * We also do it if the page has locked or dirty buffers and the caller wants @@ -1619,9 +1720,11 @@ out: * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? + * + * Return 0 on failure, 1 on success */ -int journal_try_to_free_buffers(journal_t *journal, - struct page *page, gfp_t unused_gfp_mask) +int journal_try_to_free_buffers(journal_t *journal, + struct page *page, gfp_t gfp_mask) { struct buffer_head *head; struct buffer_head *bh; @@ -1637,7 +1740,7 @@ int journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * journal_remove_journal_head() and journal_put_journal_head(). + * journal_put_journal_head(). */ jh = journal_grab_journal_head(bh); if (!jh) @@ -1650,7 +1753,9 @@ int journal_try_to_free_buffers(journal_t *journal, if (buffer_jbd(bh)) goto busy; } while ((bh = bh->b_this_page) != head); + ret = try_to_free_buffers(page); + busy: return ret; } @@ -1672,23 +1777,26 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_temp_unlink_buffer(jh); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); __journal_file_buffer(jh, transaction, BJ_Forget); - clear_buffer_jbddirty(bh); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - journal_remove_journal_head(bh); - __brelse(bh); + __journal_unfile_buffer(jh); } return may_free; } /* - * journal_invalidatepage + * journal_invalidatepage * * This code is tricky. It has a number of cases to deal with. * @@ -1696,15 +1804,15 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * * i_size must be updated on disk before we start calling invalidatepage on the * data. - * + * * This is done in ext3 by defining an ext3_setattr method which * updates i_size before truncate gets going. By maintaining this * invariant, we can be sure that it is safe to throw away any buffers * attached to the current transaction: once the transaction commits, * we know that the data will not be needed. - * + * * Note however that we can *not* throw away data belonging to the - * previous, committing transaction! + * previous, committing transaction! * * Any disk blocks which *are* part of the previous, committing * transaction (and which therefore cannot be discarded immediately) are @@ -1723,7 +1831,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * don't make guarantees about the order in which data hits disk --- in * particular we don't guarantee that new dirty data is flushed before * transaction commit --- so it is always safe just to discard data - * immediately in that mode. --sct + * immediately in that mode. --sct */ /* @@ -1734,15 +1842,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * We're outside-transaction here. Either or both of j_running_transaction * and j_committing_transaction may be NULL. */ -static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, + int partial_page) { transaction_t *transaction; struct journal_head *jh; int may_free = 1; - int ret; BUFFER_TRACE(bh, "entry"); +retry: /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are @@ -1760,6 +1869,29 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!jh) goto zap_buffer_no_jh; + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) + * is committed. Otherwise if the transaction changing the + * buffer would be cleaned from the journal before T is + * committed, a crash will cause that the correct contents of + * the buffer will be lost. On the other hand we have to + * clear the buffer dirty bit at latest at the moment when the + * transaction marking the buffer as freed in the filesystem + * structures is committed because from that moment on the + * block can be reallocated and used by a different page. + * Since the block hasn't been freed yet but the inode has + * already been added to orphan list, it is safe for us to add + * the buffer to BJ_Forget list of the newest transaction. + * + * Also we have to clear buffer_mapped flag of a truncated buffer + * because the buffer_head may be attached to the page straddling + * i_size (can happen only when blocksize < pagesize) and thus the + * buffer_head can be reused when the file is extended again. So we end + * up keeping around invalidated buffers attached to transactions' + * BJ_Forget list just to stop checkpointing code from cleaning up + * the transaction this buffer was modified in. + */ transaction = jh->b_transaction; if (transaction == NULL) { /* First case: not on any transaction. If it @@ -1785,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * committed, the buffer won't be needed any * longer. */ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_running_transaction); - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* There is no currently-running transaction. So the * orphan record which we wrote for this file must have @@ -1799,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * the committing transaction, if it exists. */ if (journal->j_committing_transaction) { JBUFFER_TRACE(jh, "give to committing trans"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_committing_transaction); - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* The orphan record's transaction has * committed. We can cleanse this buffer */ @@ -1814,6 +1938,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } } } else if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "on committing transaction"); if (jh->b_jlist == BJ_Locked) { /* * The buffer is on the committing transaction's locked @@ -1824,17 +1949,31 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer; } /* - * If it is committing, we simply cannot touch it. We - * can remove it's next_transaction pointer from the - * running transaction if that is set, but nothing - * else. */ - JBUFFER_TRACE(jh, "on committing transaction"); - set_buffer_freed(bh); - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == - journal->j_running_transaction); - jh->b_next_transaction = NULL; + * The buffer is committing, we simply cannot touch + * it. If the page is straddling i_size we have to wait + * for commit and try again. + */ + if (partial_page) { + tid_t tid = journal->j_committing_transaction->t_tid; + + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + unlock_buffer(bh); + log_wait_commit(journal, tid); + lock_buffer(bh); + goto retry; } + /* + * OK, buffer won't be reachable after truncate. We just set + * j_next_transaction to the running transaction (if there is + * one) and mark buffer as freed so that commit code knows it + * should clear dirty bits when it is done with the buffer. + */ + set_buffer_freed(bh); + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1848,10 +1987,19 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * i_size already for this truncate so recovery will not * expose the disk blocks we are discarding here.) */ J_ASSERT_JH(jh, transaction == journal->j_running_transaction); + JBUFFER_TRACE(jh, "on running transaction"); may_free = __dispose_buffer(jh, transaction); } zap_buffer: + /* + * This is tricky. Although the buffer is truncated, it may be reused + * if blocksize < pagesize and it is attached to the page straddling + * EOF. Since the buffer might have been added to BJ_Forget list of the + * running transaction, journal_get_write_access() won't clear + * b_modified and credit accounting gets confused. So clear b_modified + * here. */ + jh->b_modified = 0; journal_put_journal_head(jh); zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); @@ -1867,28 +2015,32 @@ zap_buffer_unlocked: return may_free; } -/** - * int journal_invalidatepage() - * @journal: journal to use for flush... +/** + * void journal_invalidatepage() - invalidate a journal page + * @journal: journal to use for flush * @page: page to flush - * @offset: length of page to invalidate. + * @offset: offset of the range to invalidate + * @length: length of the range to invalidate * - * Reap page buffers containing data after offset in page. - * - * Return non-zero if the page's buffers were successfully reaped. + * Reap page buffers containing data in specified range in page. */ -int journal_invalidatepage(journal_t *journal, - struct page *page, - unsigned long offset) +void journal_invalidatepage(journal_t *journal, + struct page *page, + unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; unsigned int curr_off = 0; + int partial_page = (offset || length < PAGE_CACHE_SIZE); int may_free = 1; if (!PageLocked(page)) BUG(); if (!page_has_buffers(page)) - return 1; + return; + + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be @@ -1899,10 +2051,14 @@ int journal_invalidatepage(journal_t *journal, unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; + if (next_off > stop) + return; + if (offset <= curr_off) { - /* This block is wholly outside the truncation point */ + /* This block is wholly outside the truncation point */ lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh); + may_free &= journal_unmap_buffer(journal, bh, + partial_page); unlock_buffer(bh); } curr_off = next_off; @@ -1910,16 +2066,14 @@ int journal_invalidatepage(journal_t *journal, } while (bh != head); - if (!offset) { - if (!may_free || !try_to_free_buffers(page)) - return 0; - J_ASSERT(!page_has_buffers(page)); + if (!partial_page) { + if (may_free && try_to_free_buffers(page)) + J_ASSERT(!page_has_buffers(page)); } - return 1; } -/* - * File a buffer on the given transaction list. +/* + * File a buffer on the given transaction list. */ void __journal_file_buffer(struct journal_head *jh, transaction_t *transaction, int jlist) @@ -1933,17 +2087,22 @@ void __journal_file_buffer(struct journal_head *jh, J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); J_ASSERT_JH(jh, jh->b_transaction == transaction || - jh->b_transaction == 0); + jh->b_transaction == NULL); if (jh->b_transaction && jh->b_jlist == jlist) return; - /* The following list of buffer states needs to be consistent - * with __jbd_unexpected_dirty_buffer()'s handling of dirty - * state. */ - - if (jlist == BJ_Metadata || jlist == BJ_Reserved || + if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; @@ -1951,6 +2110,8 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __journal_temp_unlink_buffer(jh); + else + journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2002,19 +2163,20 @@ void journal_file_buffer(struct journal_head *jh, jbd_unlock_bh_state(jh2bh(jh)); } -/* +/* * Remove a buffer from its current buffer list in preparation for * dropping it from its current transaction entirely. If the buffer has * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __journal_refile_buffer(struct journal_head *jh) { - int was_dirty; + int was_dirty, jlist; struct buffer_head *bh = jh2bh(jh); J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); @@ -2034,9 +2196,20 @@ void __journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __journal_file_buffer() must not take a + * new one. + */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; - __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); + if (buffer_freed(bh)) + jlist = BJ_Forget; + else if (jh->b_modified) + jlist = BJ_Metadata; + else + jlist = BJ_Reserved; + __journal_file_buffer(jh, jh->b_transaction, jlist); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) @@ -2044,30 +2217,21 @@ void __journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __journal_refile_buffer() with necessary locking added. We take our bh + * reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. */ void journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } |
