about summary refs log tree commit diff
path: root/fs/jbd/transaction.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/jbd/transaction.c')
-rw-r--r-- fs/jbd/transaction.c 503
1 files changed, 330 insertions, 173 deletions
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index e1b3c8af4d1..1695ba8334a 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1,5 +1,5 @@
/*
- * linux/fs/transaction.c
+ * linux/fs/jbd/transaction.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
*
@@ -23,9 +23,11 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/timer.h>
-#include <linux/smp_lock.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/hrtimer.h>
+
+static void __journal_temp_unlink_buffer(struct journal_head *jh);
/*
* get_transaction: obtain a new transaction_t object.
@@ -48,12 +50,14 @@ get_transaction(journal_t *journal, transaction_t *transaction)
{
transaction->t_journal = journal;
transaction->t_state = T_RUNNING;
+ transaction->t_start_time = ktime_get();
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = transaction->t_expires;
+ journal->j_commit_timer.expires =
+ round_jiffies_up(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
@@ -95,13 +99,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
alloc_transaction:
if (!journal->j_running_transaction) {
- new_transaction = jbd_kmalloc(sizeof(*new_transaction),
- GFP_NOFS);
+ new_transaction = kzalloc(sizeof(*new_transaction),
+ GFP_NOFS|__GFP_NOFAIL);
if (!new_transaction) {
ret = -ENOMEM;
goto out;
}
- memset(new_transaction, 0, sizeof(*new_transaction));
}
jbd_debug(3, "New handle %p going live.\n", handle);
@@ -204,7 +207,7 @@ repeat_locked:
* the committing transaction. Really, we only need to give it
* committing_transaction->t_outstanding_credits plus "enough" for
* the log control blocks.
- * Also, this test is inconsitent with the matching one in
+ * Also, this test is inconsistent with the matching one in
* journal_extend().
*/
if (__log_space_left(journal) < jbd_space_needed(journal)) {
@@ -226,22 +229,27 @@ repeat_locked:
__log_space_left(journal));
spin_unlock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
+
+ lock_map_acquire(&handle->h_lockdep_map);
out:
if (unlikely(new_transaction)) /* It's usually NULL */
kfree(new_transaction);
return ret;
}
+static struct lock_class_key jbd_handle_key;
+
/* Allocate a new handle. This should probably be in a slab... */
static handle_t *new_handle(int nblocks)
{
handle_t *handle = jbd_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
- memset(handle, 0, sizeof(*handle));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
+ lockdep_init_map(&handle->h_lockdep_map, "jbd_handle", &jbd_handle_key, 0);
+
return handle;
}
@@ -257,7 +265,8 @@ static handle_t *new_handle(int nblocks)
* This function is visible to journal users (like ext3fs), so is not
* called with the journal already locked.
*
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
*/
handle_t *journal_start(journal_t *journal, int nblocks)
{
@@ -360,7 +369,7 @@ out:
/**
- * int journal_restart() - restart a handle .
+ * int journal_restart() - restart a handle.
* @handle: handle to restart
* @nblocks: nr credits requested
*
@@ -405,6 +414,7 @@ int journal_restart(handle_t *handle, int nblocks)
__log_start_commit(journal, transaction->t_tid);
spin_unlock(&journal->j_state_lock);
+ lock_map_release(&handle->h_lockdep_map);
handle->h_buffer_credits = nblocks;
ret = start_this_handle(journal, handle);
return ret;
@@ -415,17 +425,34 @@ int journal_restart(handle_t *handle, int nblocks)
* void journal_lock_updates () - establish a transaction barrier.
* @journal: Journal to establish a barrier on.
*
- * This locks out any further updates from being started, and blocks
- * until all existing updates have completed, returning only once the
- * journal is in a quiescent state with no updates running.
+ * This locks out any further updates from being started, and blocks until all
+ * existing updates have completed, returning only once the journal is in a
+ * quiescent state with no updates running.
*
- * The journal lock should not be held on entry.
+ * We do not use a simple mutex for synchronization as there are syscalls which
+ * want to return with the filesystem locked and that trips up lockdep. Also
+ * hibernate needs to lock the filesystem but a locked mutex then blocks
+ * hibernation. Since locking the filesystem is a rare operation, we use a
+ * simple counter and waitqueue for locking.
*/
void journal_lock_updates(journal_t *journal)
{
DEFINE_WAIT(wait);
+wait:
+ /* Wait for previous locked operation to finish */
+ wait_event(journal->j_wait_transaction_locked,
+ journal->j_barrier_count == 0);
+
spin_lock(&journal->j_state_lock);
+ /*
+ * Check reliably under the lock whether we are the ones winning the race
+ * and locking the journal
+ */
+ if (journal->j_barrier_count > 0) {
+ spin_unlock(&journal->j_state_lock);
+ goto wait;
+ }
++journal->j_barrier_count;
/* Wait until there are no running updates */
@@ -449,14 +476,6 @@ void journal_lock_updates(journal_t *journal)
spin_lock(&journal->j_state_lock);
}
spin_unlock(&journal->j_state_lock);
-
- /*
- * We have now established a barrier against other normal updates, but
- * we also need to barrier against other journal_lock_updates() calls
- * to make sure that we serialise special journal-locked operations
- * too.
- */
- mutex_lock(&journal->j_barrier);
}
/**
@@ -464,48 +483,26 @@ void journal_lock_updates(journal_t *journal)
* @journal: Journal to release the barrier on.
*
* Release a transaction barrier obtained with journal_lock_updates().
- *
- * Should be called without the journal lock held.
*/
void journal_unlock_updates (journal_t *journal)
{
J_ASSERT(journal->j_barrier_count != 0);
- mutex_unlock(&journal->j_barrier);
spin_lock(&journal->j_state_lock);
--journal->j_barrier_count;
spin_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_transaction_locked);
}
-/*
- * Report any unexpected dirty buffers which turn up. Normally those
- * indicate an error, but they can occur if the user is running (say)
- * tune2fs to modify the live filesystem, so we need the option of
- * continuing as gracefully as possible. #
- *
- * The caller should already hold the journal lock and
- * j_list_lock spinlock: most callers will need those anyway
- * in order to probe the buffer's journaling state safely.
- */
-static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
+static void warn_dirty_buffer(struct buffer_head *bh)
{
- int jlist;
-
- /* If this buffer is one which might reasonably be dirty
- * --- ie. data, or not part of this journal --- then
- * we're OK to leave it alone, but otherwise we need to
- * move the dirty bit to the journal's own internal
- * JBDDirty bit. */
- jlist = jh->b_jlist;
-
- if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
- jlist == BJ_Shadow || jlist == BJ_Forget) {
- struct buffer_head *bh = jh2bh(jh);
+ char b[BDEVNAME_SIZE];
- if (test_clear_buffer_dirty(bh))
- set_buffer_jbddirty(bh);
- }
+ printk(KERN_WARNING
+ "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+ "There's a risk of filesystem corruption in case of system "
+ "crash.\n",
+ bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}
/*
@@ -535,7 +532,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
transaction = handle->h_transaction;
journal = transaction->t_journal;
- jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+ jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
JBUFFER_TRACE(jh, "entry");
repeat:
@@ -572,14 +569,16 @@ repeat:
if (jh->b_next_transaction)
J_ASSERT_JH(jh, jh->b_next_transaction ==
transaction);
+ warn_dirty_buffer(bh);
}
/*
* In any case we need to clean the dirty flag and we must
* do it under the buffer lock to be sure we don't race
* with running write-out.
*/
- JBUFFER_TRACE(jh, "Unexpected dirty buffer");
- jbd_unexpected_dirty_buffer(jh);
+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
+ clear_buffer_dirty(bh);
+ set_buffer_jbddirty(bh);
}
unlock_buffer(bh);
@@ -600,6 +599,12 @@ repeat:
goto done;
/*
+ * this is the first time this transaction is touching this buffer,
+ * reset the modified flag
+ */
+ jh->b_modified = 0;
+
+ /*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
*/
@@ -667,12 +672,12 @@ repeat:
JBUFFER_TRACE(jh, "allocate memory for buffer");
jbd_unlock_bh_state(bh);
frozen_buffer =
- jbd_slab_alloc(jh2bh(jh)->b_size,
+ jbd_alloc(jh2bh(jh)->b_size,
GFP_NOFS);
if (!frozen_buffer) {
- printk(KERN_EMERG
+ printk(KERN_ERR
"%s: OOM for frozen_buffer\n",
- __FUNCTION__);
+ __func__);
JBUFFER_TRACE(jh, "oom!");
error = -ENOMEM;
jbd_lock_bh_state(bh);
@@ -696,7 +701,6 @@ repeat:
if (!jh->b_transaction) {
JBUFFER_TRACE(jh, "no transaction");
J_ASSERT_JH(jh, !jh->b_next_transaction);
- jh->b_transaction = transaction;
JBUFFER_TRACE(jh, "file as BJ_Reserved");
spin_lock(&journal->j_list_lock);
__journal_file_buffer(jh, transaction, BJ_Reserved);
@@ -712,10 +716,10 @@ done:
J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
"Possible IO failure.\n");
page = jh2bh(jh)->b_page;
- offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
- source = kmap_atomic(page, KM_USER0);
+ offset = offset_in_page(jh2bh(jh)->b_data);
+ source = kmap_atomic(page);
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
- kunmap_atomic(source, KM_USER0);
+ kunmap_atomic(source);
}
jbd_unlock_bh_state(bh);
@@ -727,7 +731,7 @@ done:
out:
if (unlikely(frozen_buffer)) /* It's usually NULL */
- jbd_slab_free(frozen_buffer, bh->b_size);
+ jbd_free(frozen_buffer, bh->b_size);
JBUFFER_TRACE(jh, "exit");
return error;
@@ -737,7 +741,6 @@ out:
* int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
* @handle: transaction to add buffer modifications to
* @bh: bh to be used for metadata writes
- * @credits: variable that will receive credits for the buffer
*
* Returns an error code or 0 on success.
*
@@ -810,10 +813,25 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
if (jh->b_transaction == NULL) {
- jh->b_transaction = transaction;
+ /*
+ * Previous journal_forget() could have left the buffer
+ * with jbddirty bit set because it was being committed. When
+ * the commit finished, we've filed the buffer for
+ * checkpointing and marked it dirty. Now we are reallocating
+ * the buffer so the transaction freeing it must have
+ * committed and so it's safe to clear the dirty bit.
+ */
+ clear_buffer_dirty(jh2bh(jh));
+
+ /* first access by this transaction */
+ jh->b_modified = 0;
+
JBUFFER_TRACE(jh, "file as BJ_Reserved");
__journal_file_buffer(jh, transaction, BJ_Reserved);
} else if (jh->b_transaction == journal->j_committing_transaction) {
+ /* first access by this transaction */
+ jh->b_modified = 0;
+
JBUFFER_TRACE(jh, "set next transaction");
jh->b_next_transaction = transaction;
}
@@ -829,17 +847,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
*/
JBUFFER_TRACE(jh, "cancelling revoke");
journal_cancel_revoke(handle, jh);
- journal_put_journal_head(jh);
out:
+ journal_put_journal_head(jh);
return err;
}
/**
- * int journal_get_undo_access() - Notify intent to modify metadata with
- * non-rewindable consequences
+ * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
* @handle: transaction
* @bh: buffer to undo
- * @credits: store the number of taken credits here (if not NULL)
*
* Sometimes there is a need to distinguish between metadata which has
* been committed to disk and that which has not. The ext3fs code uses
@@ -880,10 +896,10 @@ int journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
repeat:
if (!jh->b_committed_data) {
- committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
+ committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
if (!committed_data) {
- printk(KERN_EMERG "%s: No memory for committed data\n",
- __FUNCTION__);
+ printk(KERN_ERR "%s: No memory for committed data\n",
+ __func__);
err = -ENOMEM;
goto out;
}
@@ -907,17 +923,19 @@ repeat:
out:
journal_put_journal_head(jh);
if (unlikely(committed_data))
- jbd_slab_free(committed_data, bh->b_size);
+ jbd_free(committed_data, bh->b_size);
return err;
}
/**
- * int journal_dirty_data() - mark a buffer as containing dirty data which
- * needs to be flushed before we can commit the
- * current transaction.
+ * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed
* @handle: transaction
* @bh: bufferhead to mark
*
+ * Description:
+ * Mark a buffer as containing dirty data which needs to be flushed before
+ * we can commit the current transaction.
+ *
* The buffer is placed on the transaction's data list and is marked as
* belonging to the transaction.
*
@@ -931,9 +949,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
journal_t *journal = handle->h_transaction->t_journal;
int need_brelse = 0;
struct journal_head *jh;
+ int ret = 0;
if (is_handle_aborted(handle))
- return 0;
+ return ret;
jh = journal_add_journal_head(bh);
JBUFFER_TRACE(jh, "entry");
@@ -967,6 +986,13 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
*/
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
+
+ /* Now that we have bh_state locked, are we really still mapped? */
+ if (!buffer_mapped(bh)) {
+ JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
+ goto no_journal;
+ }
+
if (jh->b_transaction) {
JBUFFER_TRACE(jh, "has transaction");
if (jh->b_transaction != handle->h_transaction) {
@@ -1028,12 +1054,27 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
sync_dirty_buffer(bh);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
+ /* Since we dropped the lock... */
+ if (!buffer_mapped(bh)) {
+ JBUFFER_TRACE(jh, "buffer got unmapped");
+ goto no_journal;
+ }
/* The buffer may become locked again at any
time if it is redirtied */
}
- /* journal_clean_data_list() may have got there first */
- if (jh->b_transaction != NULL) {
+ /*
+ * We cannot remove the buffer with io error from the
+ * committing transaction, because otherwise it would
+ * miss the error and the commit would not abort.
+ */
+ if (unlikely(!buffer_uptodate(bh))) {
+ ret = -EIO;
+ goto no_journal;
+ }
+ /* We might have slept so buffer could be refiled now */
+ if (jh->b_transaction != NULL &&
+ jh->b_transaction != handle->h_transaction) {
JBUFFER_TRACE(jh, "unfile from commit");
__journal_temp_unlink_buffer(jh);
/* It still points to the committing
@@ -1054,8 +1095,6 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
JBUFFER_TRACE(jh, "not on correct data list: unfile");
J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __journal_temp_unlink_buffer(jh);
- jh->b_transaction = handle->h_transaction;
JBUFFER_TRACE(jh, "file as data");
__journal_file_buffer(jh, handle->h_transaction,
BJ_SyncData);
@@ -1073,15 +1112,15 @@ no_journal:
}
JBUFFER_TRACE(jh, "exit");
journal_put_journal_head(jh);
- return 0;
+ return ret;
}
/**
- * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
+ * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
*
- * mark dirty metadata which needs to be journaled as part of the current
+ * Mark dirty metadata which needs to be journaled as part of the current
* transaction.
*
* The buffer is placed on the transaction's metadata list and is marked
@@ -1152,7 +1191,7 @@ int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
}
/* That test should have eliminated the following case: */
- J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+ J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
JBUFFER_TRACE(jh, "file as BJ_Metadata");
spin_lock(&journal->j_list_lock);
@@ -1200,6 +1239,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int drop_reserve = 0;
int err = 0;
+ int was_modified = 0;
BUFFER_TRACE(bh, "entry");
@@ -1218,6 +1258,9 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
goto not_jbd;
}
+ /* keep track of whether or not this transaction modified us */
+ was_modified = jh->b_modified;
+
/*
* The buffer's going from the transaction, we must drop
* all references -bzzz
@@ -1235,7 +1278,12 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
- drop_reserve = 1;
+ /*
+ * we only want to drop a reference if this transaction
+ * modified the buffer
+ */
+ if (was_modified)
+ drop_reserve = 1;
/*
* We are no longer going to journal this buffer.
@@ -1254,8 +1302,6 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
__journal_file_buffer(jh, transaction, BJ_Forget);
} else {
__journal_unfile_buffer(jh);
- journal_remove_journal_head(bh);
- __brelse(bh);
if (!buffer_jbd(bh)) {
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
@@ -1275,7 +1321,13 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
if (jh->b_next_transaction) {
J_ASSERT(jh->b_next_transaction == transaction);
jh->b_next_transaction = NULL;
- drop_reserve = 1;
+
+ /*
+ * only drop a reference if this transaction modified
+ * the buffer
+ */
+ if (was_modified)
+ drop_reserve = 1;
}
}
@@ -1311,16 +1363,17 @@ int journal_stop(handle_t *handle)
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
- int old_handle_count, err;
+ int err;
pid_t pid;
- J_ASSERT(transaction->t_updates > 0);
J_ASSERT(journal_current_handle() == handle);
if (is_handle_aborted(handle))
err = -EIO;
- else
+ else {
+ J_ASSERT(transaction->t_updates > 0);
err = 0;
+ }
if (--handle->h_ref > 0) {
jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1339,6 +1392,17 @@ int journal_stop(handle_t *handle)
* on IO anyway. Speeds up many-threaded, many-dir operations
* by 30x or more...
*
+ * We try and optimize the sleep time against what the underlying disk
+ * can do, instead of having a static sleep time. This is useful for
+ * the case where our storage is so fast that it is more optimal to go
+ * ahead and force a flush and wait for the transaction to be committed
+ * than it is to wait for an arbitrary amount of time for new writers to
+ * join the transaction. We achieve this by measuring how long it takes
+ * to commit a transaction, and comparing it with how long this
+ * transaction has been running. If run time < commit time then we
+ * sleep for the commit time (capped at one jiffy) and commit. This
+ * greatly helps super fast disks that slow down as more threads fsync.
+ *
* But don't do this if this process was the most recent one to
* perform a synchronous write. We do this to detect the case where a
* single process is doing a stream of sync writes. No point in waiting
@@ -1346,11 +1410,26 @@ int journal_stop(handle_t *handle)
*/
pid = current->pid;
if (handle->h_sync && journal->j_last_sync_writer != pid) {
+ u64 commit_time, trans_time;
+
journal->j_last_sync_writer = pid;
- do {
- old_handle_count = transaction->t_handle_count;
- schedule_timeout_uninterruptible(1);
- } while (old_handle_count != transaction->t_handle_count);
+
+ spin_lock(&journal->j_state_lock);
+ commit_time = journal->j_average_commit_time;
+ spin_unlock(&journal->j_state_lock);
+
+ trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+ transaction->t_start_time));
+
+ commit_time = min_t(u64, commit_time,
+ 1000*jiffies_to_usecs(1));
+
+ if (trans_time < commit_time) {
+ ktime_t expires = ktime_add_ns(ktime_get(),
+ commit_time);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ }
}
current->journal_info = NULL;
@@ -1397,11 +1476,14 @@ int journal_stop(handle_t *handle)
spin_unlock(&journal->j_state_lock);
}
+ lock_map_release(&handle->h_lockdep_map);
+
jbd_free_handle(handle);
return err;
}
-/**int journal_force_commit() - force any uncommitted transactions
+/**
+ * int journal_force_commit() - force any uncommitted transactions
* @journal: journal to force
*
* For synchronous operations: force any uncommitted transactions
@@ -1486,7 +1568,7 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
*
* Called under j_list_lock. The journal may not be locked.
*/
-void __journal_temp_unlink_buffer(struct journal_head *jh)
+static void __journal_temp_unlink_buffer(struct journal_head *jh)
{
struct journal_head **list = NULL;
transaction_t *transaction;
@@ -1499,7 +1581,7 @@ void __journal_temp_unlink_buffer(struct journal_head *jh)
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
if (jh->b_jlist != BJ_None)
- J_ASSERT_JH(jh, transaction != 0);
+ J_ASSERT_JH(jh, transaction != NULL);
switch (jh->b_jlist) {
case BJ_None:
@@ -1538,19 +1620,32 @@ void __journal_temp_unlink_buffer(struct journal_head *jh)
mark_buffer_dirty(bh); /* Expose it to the VM */
}
+/*
+ * Remove buffer from all transactions.
+ *
+ * Called with bh_state lock and j_list_lock
+ *
+ * jh and bh may be already freed when this function returns.
+ */
void __journal_unfile_buffer(struct journal_head *jh)
{
__journal_temp_unlink_buffer(jh);
jh->b_transaction = NULL;
+ journal_put_journal_head(jh);
}
void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
{
- jbd_lock_bh_state(jh2bh(jh));
+ struct buffer_head *bh = jh2bh(jh);
+
+ /* Get reference so that buffer cannot be freed before we unlock it */
+ get_bh(bh);
+ jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
__journal_unfile_buffer(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(jh2bh(jh));
+ jbd_unlock_bh_state(bh);
+ __brelse(bh);
}
/*
@@ -1568,25 +1663,21 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
if (buffer_locked(bh) || buffer_dirty(bh))
goto out;
- if (jh->b_next_transaction != 0)
+ if (jh->b_next_transaction != NULL)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+ if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
/* A written-back ordered data buffer */
JBUFFER_TRACE(jh, "release data");
__journal_unfile_buffer(jh);
- journal_remove_journal_head(bh);
- __brelse(bh);
}
- } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+ } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
__journal_remove_checkpoint(jh);
- journal_remove_journal_head(bh);
- __brelse(bh);
}
}
spin_unlock(&journal->j_list_lock);
@@ -1594,12 +1685,13 @@ out:
return;
}
-
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1628,9 +1720,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1646,7 +1740,7 @@ int journal_try_to_free_buffers(journal_t *journal,
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
- * journal_remove_journal_head() and journal_put_journal_head().
+ * journal_put_journal_head().
*/
jh = journal_grab_journal_head(bh);
if (!jh)
@@ -1659,7 +1753,9 @@ int journal_try_to_free_buffers(journal_t *journal,
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
busy:
return ret;
}
@@ -1681,17 +1777,20 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
int may_free = 1;
struct buffer_head *bh = jh2bh(jh);
- __journal_unfile_buffer(jh);
-
if (jh->b_cp_transaction) {
JBUFFER_TRACE(jh, "on running+cp transaction");
+ __journal_temp_unlink_buffer(jh);
+ /*
+ * We don't want to write the buffer anymore, clear the
+ * bit so that we don't confuse checks in
+ * __journal_file_buffer
+ */
+ clear_buffer_dirty(bh);
__journal_file_buffer(jh, transaction, BJ_Forget);
- clear_buffer_jbddirty(bh);
may_free = 0;
} else {
JBUFFER_TRACE(jh, "on running transaction");
- journal_remove_journal_head(bh);
- __brelse(bh);
+ __journal_unfile_buffer(jh);
}
return may_free;
}
@@ -1743,15 +1842,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
* We're outside-transaction here. Either or both of j_running_transaction
* and j_committing_transaction may be NULL.
*/
-static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
+ int partial_page)
{
transaction_t *transaction;
struct journal_head *jh;
int may_free = 1;
- int ret;
BUFFER_TRACE(bh, "entry");
+retry:
/*
* It is safe to proceed here without the j_list_lock because the
* buffers cannot be stolen by try_to_free_buffers as long as we are
@@ -1769,6 +1869,29 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!jh)
goto zap_buffer_no_jh;
+ /*
+ * We cannot remove the buffer from checkpoint lists until the
+ * transaction adding inode to orphan list (let's call it T)
+ * is committed. Otherwise, if the transaction changing the
+ * buffer were cleaned from the journal before T is
+ * committed, a crash would cause the correct contents of
+ * the buffer to be lost. On the other hand, we have to clear
+ * the buffer dirty bit at the latest at the moment when the
+ * transaction marking the buffer as freed in the filesystem
+ * structures is committed because from that moment on the
+ * block can be reallocated and used by a different page.
+ * Since the block hasn't been freed yet but the inode has
+ * already been added to orphan list, it is safe for us to add
+ * the buffer to BJ_Forget list of the newest transaction.
+ *
+ * Also we have to clear buffer_mapped flag of a truncated buffer
+ * because the buffer_head may be attached to the page straddling
+ * i_size (can happen only when blocksize < pagesize) and thus the
+ * buffer_head can be reused when the file is extended again. So we end
+ * up keeping around invalidated buffers attached to transactions'
+ * BJ_Forget list just to stop checkpointing code from cleaning up
+ * the transaction this buffer was modified in.
+ */
transaction = jh->b_transaction;
if (transaction == NULL) {
/* First case: not on any transaction. If it
@@ -1794,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* committed, the buffer won't be needed any
* longer. */
JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
- ret = __dispose_buffer(jh,
+ may_free = __dispose_buffer(jh,
journal->j_running_transaction);
- journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- spin_unlock(&journal->j_state_lock);
- return ret;
+ goto zap_buffer;
} else {
/* There is no currently-running transaction. So the
* orphan record which we wrote for this file must have
@@ -1808,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* the committing transaction, if it exists. */
if (journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "give to committing trans");
- ret = __dispose_buffer(jh,
+ may_free = __dispose_buffer(jh,
journal->j_committing_transaction);
- journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- spin_unlock(&journal->j_state_lock);
- return ret;
+ goto zap_buffer;
} else {
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
@@ -1823,6 +1938,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
}
} else if (transaction == journal->j_committing_transaction) {
+ JBUFFER_TRACE(jh, "on committing transaction");
if (jh->b_jlist == BJ_Locked) {
/*
* The buffer is on the committing transaction's locked
@@ -1833,17 +1949,31 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
goto zap_buffer;
}
/*
- * If it is committing, we simply cannot touch it. We
- * can remove it's next_transaction pointer from the
- * running transaction if that is set, but nothing
- * else. */
- JBUFFER_TRACE(jh, "on committing transaction");
- set_buffer_freed(bh);
- if (jh->b_next_transaction) {
- J_ASSERT(jh->b_next_transaction ==
- journal->j_running_transaction);
- jh->b_next_transaction = NULL;
+ * The buffer is committing, we simply cannot touch
+ * it. If the page is straddling i_size we have to wait
+ * for commit and try again.
+ */
+ if (partial_page) {
+ tid_t tid = journal->j_committing_transaction->t_tid;
+
+ journal_put_journal_head(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ spin_unlock(&journal->j_state_lock);
+ unlock_buffer(bh);
+ log_wait_commit(journal, tid);
+ lock_buffer(bh);
+ goto retry;
}
+ /*
+ * OK, buffer won't be reachable after truncate. We just set
+ * b_next_transaction to the running transaction (if there is
+ * one) and mark buffer as freed so that commit code knows it
+ * should clear dirty bits when it is done with the buffer.
+ */
+ set_buffer_freed(bh);
+ if (journal->j_running_transaction && buffer_jbddirty(bh))
+ jh->b_next_transaction = journal->j_running_transaction;
journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
@@ -1857,10 +1987,19 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* i_size already for this truncate so recovery will not
* expose the disk blocks we are discarding here.) */
J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
+ JBUFFER_TRACE(jh, "on running transaction");
may_free = __dispose_buffer(jh, transaction);
}
zap_buffer:
+ /*
+ * This is tricky. Although the buffer is truncated, it may be reused
+ * if blocksize < pagesize and it is attached to the page straddling
+ * EOF. Since the buffer might have been added to BJ_Forget list of the
+ * running transaction, journal_get_write_access() won't clear
+ * b_modified and credit accounting gets confused. So clear b_modified
+ * here. */
+ jh->b_modified = 0;
journal_put_journal_head(jh);
zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
@@ -1877,20 +2016,23 @@ zap_buffer_unlocked:
}
/**
- * void journal_invalidatepage()
- * @journal: journal to use for flush...
+ * void journal_invalidatepage() - invalidate a journal page
+ * @journal: journal to use for flush
* @page: page to flush
- * @offset: length of page to invalidate.
- *
- * Reap page buffers containing data after offset in page.
+ * @offset: offset of the range to invalidate
+ * @length: length of the range to invalidate
*
+ * Reap page buffers containing data in specified range in page.
*/
void journal_invalidatepage(journal_t *journal,
struct page *page,
- unsigned long offset)
+ unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
+ unsigned int stop = offset + length;
unsigned int curr_off = 0;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int may_free = 1;
if (!PageLocked(page))
@@ -1898,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal,
if (!page_has_buffers(page))
return;
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
@@ -1907,10 +2051,14 @@ void journal_invalidatepage(journal_t *journal,
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ return;
+
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
- may_free &= journal_unmap_buffer(journal, bh);
+ may_free &= journal_unmap_buffer(journal, bh,
+ partial_page);
unlock_buffer(bh);
}
curr_off = next_off;
@@ -1918,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal,
} while (bh != head);
- if (!offset) {
+ if (!partial_page) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
@@ -1939,17 +2087,22 @@ void __journal_file_buffer(struct journal_head *jh,
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
J_ASSERT_JH(jh, jh->b_transaction == transaction ||
- jh->b_transaction == 0);
+ jh->b_transaction == NULL);
if (jh->b_transaction && jh->b_jlist == jlist)
return;
- /* The following list of buffer states needs to be consistent
- * with __jbd_unexpected_dirty_buffer()'s handling of dirty
- * state. */
-
if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
jlist == BJ_Shadow || jlist == BJ_Forget) {
+ /*
+ * For metadata buffers, we track dirty bit in buffer_jbddirty
+ * instead of buffer_dirty. We should not see a dirty bit set
+ * here because we clear it in do_get_write_access but e.g.
+ * tune2fs can modify the sb and set the dirty bit at any time
+ * so we try to gracefully handle that.
+ */
+ if (buffer_dirty(bh))
+ warn_dirty_buffer(bh);
if (test_clear_buffer_dirty(bh) ||
test_clear_buffer_jbddirty(bh))
was_dirty = 1;
@@ -1957,6 +2110,8 @@ void __journal_file_buffer(struct journal_head *jh,
if (jh->b_transaction)
__journal_temp_unlink_buffer(jh);
+ else
+ journal_grab_journal_head(bh);
jh->b_transaction = transaction;
switch (jlist) {
@@ -2014,13 +2169,14 @@ void journal_file_buffer(struct journal_head *jh,
* already started to be used by a subsequent transaction, refile the
* buffer on that transaction's metadata list.
*
- * Called under journal->j_list_lock
- *
+ * Called under j_list_lock
* Called under jbd_lock_bh_state(jh2bh(jh))
+ *
+ * jh and bh may already be freed when this function returns
*/
void __journal_refile_buffer(struct journal_head *jh)
{
- int was_dirty;
+ int was_dirty, jlist;
struct buffer_head *bh = jh2bh(jh);
J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
@@ -2040,10 +2196,20 @@ void __journal_refile_buffer(struct journal_head *jh)
was_dirty = test_clear_buffer_jbddirty(bh);
__journal_temp_unlink_buffer(jh);
+ /*
+ * We set b_transaction here because b_next_transaction will inherit
+ * our jh reference and thus __journal_file_buffer() must not take a
+ * new one.
+ */
jh->b_transaction = jh->b_next_transaction;
jh->b_next_transaction = NULL;
- __journal_file_buffer(jh, jh->b_transaction,
- was_dirty ? BJ_Metadata : BJ_Reserved);
+ if (buffer_freed(bh))
+ jlist = BJ_Forget;
+ else if (jh->b_modified)
+ jlist = BJ_Metadata;
+ else
+ jlist = BJ_Reserved;
+ __journal_file_buffer(jh, jh->b_transaction, jlist);
J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
if (was_dirty)
@@ -2051,30 +2217,21 @@ void __journal_refile_buffer(struct journal_head *jh)
}
/*
- * For the unlocked version of this call, also make sure that any
- * hanging journal_head is cleaned up if necessary.
- *
- * __journal_refile_buffer is usually called as part of a single locked
- * operation on a buffer_head, in which the caller is probably going to
- * be hooking the journal_head onto other lists. In that case it is up
- * to the caller to remove the journal_head if necessary. For the
- * unlocked journal_refile_buffer call, the caller isn't going to be
- * doing anything else to the buffer so we need to do the cleanup
- * ourselves to avoid a jh leak.
- *
- * *** The journal_head may be freed by this call! ***
+ * __journal_refile_buffer() with necessary locking added. We take our own bh
+ * reference so that we can safely unlock bh.
+ *
+ * The jh and bh may be freed by this call.
*/
void journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
struct buffer_head *bh = jh2bh(jh);
+ /* Get reference so that buffer cannot be freed before we unlock it */
+ get_bh(bh);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
-
__journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
-
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}