aboutsummaryrefslogtreecommitdiff
path: root/fs/jbd2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/jbd2')
-rw-r--r--fs/jbd2/Kconfig8
-rw-r--r--fs/jbd2/checkpoint.c162
-rw-r--r--fs/jbd2/commit.c415
-rw-r--r--fs/jbd2/journal.c797
-rw-r--r--fs/jbd2/recovery.c133
-rw-r--r--fs/jbd2/revoke.c76
-rw-r--r--fs/jbd2/transaction.c768
7 files changed, 1491 insertions, 868 deletions
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index f32f346f4b0..5a9f5534d57 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -1,6 +1,8 @@
config JBD2
tristate
select CRC32
+ select CRYPTO
+ select CRYPTO_CRC32C
help
This is a generic journaling layer for block devices that support
both 32-bit and 64-bit block numbers. It is currently used by
@@ -18,7 +20,7 @@ config JBD2
config JBD2_DEBUG
bool "JBD2 (ext4) debugging support"
- depends on JBD2 && DEBUG_FS
+ depends on JBD2
help
If you are using the ext4 journaled file system (or
potentially any other filesystem/device using JBD2), this option
@@ -27,7 +29,7 @@ config JBD2_DEBUG
By default, the debugging output will be turned off.
If you select Y here, then you will be able to turn on debugging
- with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+ with "echo N > /sys/module/jbd2/parameters/jbd2_debug", where N is a
number between 1 and 5. The higher the number, the more debugging
output is generated. To turn debugging off again, do
- "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+ "echo 0 > /sys/module/jbd2/parameters/jbd2_debug".
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index d49d202903f..7f34f471616 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -88,14 +88,13 @@ static inline void __buffer_relink_io(struct journal_head *jh)
* whole transaction.
*
* Requires j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __try_to_free_cp_buf(struct journal_head *jh)
{
int ret = 0;
struct buffer_head *bh = jh2bh(jh);
- if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+ if (jh->b_transaction == NULL && !buffer_locked(bh) &&
!buffer_dirty(bh) && !buffer_write_io_error(bh)) {
/*
* Get our reference so that bh cannot be freed before
@@ -104,11 +103,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
get_bh(bh);
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
- jbd_unlock_bh_state(bh);
BUFFER_TRACE(bh, "release");
__brelse(bh);
- } else {
- jbd_unlock_bh_state(bh);
}
return ret;
}
@@ -124,8 +120,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
int nblocks, space_left;
/* assert_spin_locked(&journal->j_state_lock); */
- nblocks = jbd_space_needed(journal);
- while (__jbd2_log_space_left(journal) < nblocks) {
+ nblocks = jbd2_space_needed(journal);
+ while (jbd2_log_space_left(journal) < nblocks) {
if (journal->j_flags & JBD2_ABORT)
return;
write_unlock(&journal->j_state_lock);
@@ -144,8 +140,8 @@ void __jbd2_log_wait_for_space(journal_t *journal)
*/
write_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
- nblocks = jbd_space_needed(journal);
- space_left = __jbd2_log_space_left(journal);
+ nblocks = jbd2_space_needed(journal);
+ space_left = jbd2_log_space_left(journal);
if (space_left < nblocks) {
int chkpt = journal->j_checkpoint_transactions != NULL;
tid_t tid = 0;
@@ -160,7 +156,15 @@ void __jbd2_log_wait_for_space(journal_t *journal)
/* We were able to recover space; yay! */
;
} else if (tid) {
+ /*
+ * jbd2_journal_commit_transaction() may want
+ * to take the checkpoint_mutex if JBD2_FLUSHED
+ * is set. So we need to temporarily drop it.
+ */
+ mutex_unlock(&journal->j_checkpoint_mutex);
jbd2_log_wait_commit(journal, tid);
+ write_lock(&journal->j_state_lock);
+ continue;
} else {
printk(KERN_ERR "%s: needed %d blocks and "
"only had %d space available\n",
@@ -180,21 +184,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
}
/*
- * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
- * The caller must restart a list walk. Wait for someone else to run
- * jbd_unlock_bh_state().
- */
-static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
- __releases(journal->j_list_lock)
-{
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_lock_bh_state(bh);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
-}
-
-/*
* Clean up transaction's list of buffers submitted for io.
* We wait for any pending IO to complete and remove any clean
* buffers. Note that we take the buffers in the opposite ordering
@@ -222,15 +211,9 @@ restart:
while (!released && transaction->t_checkpoint_io_list) {
jh = transaction->t_checkpoint_io_list;
bh = jh2bh(jh);
- if (!jbd_trylock_bh_state(bh)) {
- jbd_sync_bh(journal, bh);
- spin_lock(&journal->j_list_lock);
- goto restart;
- }
get_bh(bh);
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
@@ -246,7 +229,6 @@ restart:
* it has been written out and so we can drop it from the list
*/
released = __jbd2_journal_remove_checkpoint(jh);
- jbd_unlock_bh_state(bh);
__brelse(bh);
}
@@ -266,7 +248,6 @@ __flush_batch(journal_t *journal, int *batch_count)
for (i = 0; i < *batch_count; i++) {
struct buffer_head *bh = journal->j_chkpt_bhs[i];
- clear_buffer_jwrite(bh);
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
}
@@ -281,7 +262,6 @@ __flush_batch(journal_t *journal, int *batch_count)
* be written out.
*
* Called with j_list_lock held and drops it if 1 is returned
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
*/
static int __process_buffer(journal_t *journal, struct journal_head *jh,
int *batch_count, transaction_t *transaction)
@@ -292,7 +272,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
if (buffer_locked(bh)) {
get_bh(bh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
wait_on_buffer(bh);
/* the journal_head may have gone by now */
BUFFER_TRACE(bh, "brelse");
@@ -304,7 +283,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
transaction->t_chp_stats.cs_forced_to_close++;
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
if (unlikely(journal->j_flags & JBD2_UNMOUNT))
/*
* The journal thread is dead; so starting and
@@ -323,11 +301,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
if (unlikely(buffer_write_io_error(bh)))
ret = -EIO;
get_bh(bh);
- J_ASSERT_JH(jh, !buffer_jbddirty(bh));
BUFFER_TRACE(bh, "remove from checkpoint");
__jbd2_journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
__brelse(bh);
} else {
/*
@@ -340,10 +316,8 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
BUFFER_TRACE(bh, "queue");
get_bh(bh);
J_ASSERT_BH(bh, !buffer_jwrite(bh));
- set_buffer_jwrite(bh);
journal->j_chkpt_bhs[*batch_count] = bh;
__buffer_relink_io(jh);
- jbd_unlock_bh_state(bh);
transaction->t_chp_stats.cs_written++;
(*batch_count)++;
if (*batch_count == JBD2_NR_BATCH) {
@@ -407,15 +381,7 @@ restart:
int retry = 0, err;
while (!retry && transaction->t_checkpoint_list) {
- struct buffer_head *bh;
-
jh = transaction->t_checkpoint_list;
- bh = jh2bh(jh);
- if (!jbd_trylock_bh_state(bh)) {
- jbd_sync_bh(journal, bh);
- retry = 1;
- break;
- }
retry = __process_buffer(journal, jh, &batch_count,
transaction);
if (retry < 0 && !result)
@@ -478,79 +444,28 @@ out:
int jbd2_cleanup_journal_tail(journal_t *journal)
{
- transaction_t * transaction;
tid_t first_tid;
- unsigned long blocknr, freed;
+ unsigned long blocknr;
if (is_journal_aborted(journal))
return 1;
- /* OK, work out the oldest transaction remaining in the log, and
- * the log block it starts at.
- *
- * If the log is now empty, we need to work out which is the
- * next transaction ID we will write, and where it will
- * start. */
-
- write_lock(&journal->j_state_lock);
- spin_lock(&journal->j_list_lock);
- transaction = journal->j_checkpoint_transactions;
- if (transaction) {
- first_tid = transaction->t_tid;
- blocknr = transaction->t_log_start;
- } else if ((transaction = journal->j_committing_transaction) != NULL) {
- first_tid = transaction->t_tid;
- blocknr = transaction->t_log_start;
- } else if ((transaction = journal->j_running_transaction) != NULL) {
- first_tid = transaction->t_tid;
- blocknr = journal->j_head;
- } else {
- first_tid = journal->j_transaction_sequence;
- blocknr = journal->j_head;
- }
- spin_unlock(&journal->j_list_lock);
- J_ASSERT(blocknr != 0);
-
- /* If the oldest pinned transaction is at the tail of the log
- already then there's not much we can do right now. */
- if (journal->j_tail_sequence == first_tid) {
- write_unlock(&journal->j_state_lock);
+ if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
return 1;
- }
-
- /* OK, update the superblock to recover the freed space.
- * Physical blocks come first: have we wrapped beyond the end of
- * the log? */
- freed = blocknr - journal->j_tail;
- if (blocknr < journal->j_tail)
- freed = freed + journal->j_last - journal->j_first;
-
- trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
- jbd_debug(1,
- "Cleaning journal tail from %d to %d (offset %lu), "
- "freeing %lu\n",
- journal->j_tail_sequence, first_tid, blocknr, freed);
-
- journal->j_free += freed;
- journal->j_tail_sequence = first_tid;
- journal->j_tail = blocknr;
- write_unlock(&journal->j_state_lock);
+ J_ASSERT(blocknr != 0);
/*
- * If there is an external journal, we need to make sure that
- * any data blocks that were recently written out --- perhaps
- * by jbd2_log_do_checkpoint() --- are flushed out before we
- * drop the transactions from the external journal. It's
- * unlikely this will be necessary, especially with a
- * appropriately sized journal, but we need this to guarantee
- * correctness. Fortunately jbd2_cleanup_journal_tail()
- * doesn't get called all that often.
+ * We need to make sure that any blocks that were recently written out
+ * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
+ * we drop the transactions from the journal. It's unlikely this will
+ * be necessary, especially with an appropriately sized journal, but we
+ * need this to guarantee correctness. Fortunately
+ * jbd2_cleanup_journal_tail() doesn't get called all that often.
*/
- if ((journal->j_fs_dev != journal->j_dev) &&
- (journal->j_flags & JBD2_BARRIER))
+ if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
- if (!(journal->j_flags & JBD2_ABORT))
- jbd2_journal_update_superblock(journal, 1);
+
+ __jbd2_update_log_tail(journal, first_tid, blocknr);
return 0;
}
@@ -582,15 +497,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
do {
jh = next_jh;
next_jh = jh->b_cpnext;
- /* Use trylock because of the ranking */
- if (jbd_trylock_bh_state(jh2bh(jh))) {
- ret = __try_to_free_cp_buf(jh);
- if (ret) {
- freed++;
- if (ret == 2) {
- *released = 1;
- return freed;
- }
+ ret = __try_to_free_cp_buf(jh);
+ if (ret) {
+ freed++;
+ if (ret == 2) {
+ *released = 1;
+ return freed;
}
}
/*
@@ -673,9 +585,7 @@ out:
* The function can free jh and bh.
*
* This function is called with j_list_lock held.
- * This function is called with jbd_lock_bh_state(jh2bh(jh))
*/
-
int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
{
struct transaction_chp_stats_s *stats;
@@ -722,11 +632,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
transaction->t_tid, stats);
__jbd2_journal_drop_transaction(journal, transaction);
- kfree(transaction);
-
- /* Just in case anybody was waiting for more transactions to be
- checkpointed... */
- wake_up(&journal->j_wait_logspace);
+ jbd2_journal_free_transaction(transaction);
ret = 1;
out:
return ret;
@@ -788,14 +694,14 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
J_ASSERT(transaction->t_forget == NULL);
- J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
- J_ASSERT(transaction->t_log_list == NULL);
J_ASSERT(transaction->t_checkpoint_list == NULL);
J_ASSERT(transaction->t_checkpoint_io_list == NULL);
J_ASSERT(atomic_read(&transaction->t_updates) == 0);
J_ASSERT(journal->j_committing_transaction != transaction);
J_ASSERT(journal->j_running_transaction != transaction);
+ trace_jbd2_drop_transaction(journal, transaction);
+
jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 5069b847515..6fac7434985 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -28,18 +28,24 @@
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>
-#include <asm/system.h>
/*
- * Default IO end handler for temporary BJ_IO buffer_heads.
+ * IO end handler for temporary buffer_heads handling writes to the journal.
*/
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
+ struct buffer_head *orig_bh = bh->b_private;
+
BUFFER_TRACE(bh, "");
if (uptodate)
set_buffer_uptodate(bh);
else
clear_buffer_uptodate(bh);
+ if (orig_bh) {
+ clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
+ smp_mb__after_atomic();
+ wake_up_bit(&orig_bh->b_state, BH_Shadow);
+ }
unlock_buffer(bh);
}
@@ -86,6 +92,22 @@ nope:
__brelse(bh);
}
+static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
+{
+ struct commit_header *h;
+ __u32 csum;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return;
+
+ h = (struct commit_header *)(bh->b_data);
+ h->h_chksum_type = 0;
+ h->h_chksum_size = 0;
+ h->h_chksum[0] = 0;
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+ h->h_chksum[0] = cpu_to_be32(csum);
+}
+
/*
* Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
@@ -99,7 +121,6 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head **cbh,
__u32 crc32_sum)
{
- struct journal_head *descriptor;
struct commit_header *tmp;
struct buffer_head *bh;
int ret;
@@ -110,12 +131,10 @@ static int journal_submit_commit_record(journal_t *journal,
if (is_journal_aborted(journal))
return 0;
- descriptor = jbd2_journal_get_descriptor_buffer(journal);
- if (!descriptor)
+ bh = jbd2_journal_get_descriptor_buffer(journal);
+ if (!bh)
return 1;
- bh = jh2bh(descriptor);
-
tmp = (struct commit_header *)bh->b_data;
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
@@ -129,8 +148,9 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
}
+ jbd2_commit_block_csum_set(journal, bh);
- JBUFFER_TRACE(descriptor, "submit commit block");
+ BUFFER_TRACE(bh, "submit commit block");
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
@@ -162,7 +182,6 @@ static int journal_wait_on_commit_record(journal_t *journal,
if (unlikely(!buffer_uptodate(bh)))
ret = -EIO;
put_bh(bh); /* One for getblk() */
- jbd2_journal_put_journal_head(bh2jh(bh));
return ret;
}
@@ -220,7 +239,7 @@ static int journal_submit_data_buffers(journal_t *journal,
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
spin_unlock(&journal->j_list_lock);
@@ -258,7 +277,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
}
spin_lock(&journal->j_list_lock);
clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
@@ -286,10 +305,10 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
char *addr;
__u32 checksum;
- addr = kmap_atomic(page, KM_USER0);
+ addr = kmap_atomic(page);
checksum = crc32_be(crc32_sum,
(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
- kunmap_atomic(addr, KM_USER0);
+ kunmap_atomic(addr);
return checksum;
}
@@ -302,6 +321,43 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
+static void jbd2_descr_block_csum_set(journal_t *j,
+ struct buffer_head *bh)
+{
+ struct jbd2_journal_block_tail *tail;
+ __u32 csum;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return;
+
+ tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
+ sizeof(struct jbd2_journal_block_tail));
+ tail->t_checksum = 0;
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+ tail->t_checksum = cpu_to_be32(csum);
+}
+
+static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
+ struct buffer_head *bh, __u32 sequence)
+{
+ struct page *page = bh->b_page;
+ __u8 *addr;
+ __u32 csum32;
+ __be32 seq;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return;
+
+ seq = cpu_to_be32(sequence);
+ addr = kmap_atomic(page);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+ csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
+ bh->b_size);
+ kunmap_atomic(addr);
+
+ /* We only have space to store the lower 16 bits of the crc32c. */
+ tag->t_checksum = cpu_to_be16(csum32);
+}
/*
* jbd2_journal_commit_transaction
*
@@ -312,7 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
{
struct transaction_stats_s stats;
transaction_t *commit_transaction;
- struct journal_head *jh, *new_jh, *descriptor;
+ struct journal_head *jh;
+ struct buffer_head *descriptor;
struct buffer_head **wbuf = journal->j_wbuf;
int bufs;
int flags;
@@ -326,11 +383,21 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int space_left = 0;
int first_tag = 0;
int tag_flag;
- int i, to_free = 0;
+ int i;
int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
struct blk_plug plug;
+ /* Tail of the journal */
+ unsigned long first_block;
+ tid_t first_tid;
+ int update_tail;
+ int csum_size = 0;
+ LIST_HEAD(io_bufs);
+ LIST_HEAD(log_bufs);
+
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ csum_size = sizeof(struct jbd2_journal_block_tail);
/*
* First job: lock down the current transaction and wait for
@@ -340,7 +407,18 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/* Do we need to erase the effects of a prior jbd2_journal_flush? */
if (journal->j_flags & JBD2_FLUSHED) {
jbd_debug(3, "super block updated\n");
- jbd2_journal_update_superblock(journal, 1);
+ mutex_lock(&journal->j_checkpoint_mutex);
+ /*
+ * We hold j_checkpoint_mutex so tail cannot change under us.
+ * We don't need any special data guarantees for writing sb
+ * since journal is empty and it is ok for write to be
+ * flushed only with transaction commit.
+ */
+ jbd2_journal_update_sb_log_tail(journal,
+ journal->j_tail_sequence,
+ journal->j_tail,
+ WRITE_SYNC);
+ mutex_unlock(&journal->j_checkpoint_mutex);
} else {
jbd_debug(3, "superblock not updated\n");
}
@@ -349,18 +427,23 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(journal->j_committing_transaction == NULL);
commit_transaction = journal->j_running_transaction;
- J_ASSERT(commit_transaction->t_state == T_RUNNING);
trace_jbd2_start_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: starting commit of transaction %d\n",
commit_transaction->t_tid);
write_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_RUNNING);
commit_transaction->t_state = T_LOCKED;
trace_jbd2_commit_locking(journal, commit_transaction);
stats.run.rs_wait = commit_transaction->t_max_wait;
+ stats.run.rs_request_delay = 0;
stats.run.rs_locked = jiffies;
+ if (commit_transaction->t_requested)
+ stats.run.rs_request_delay =
+ jbd2_time_diff(commit_transaction->t_requested,
+ stats.run.rs_locked);
stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
stats.run.rs_locked);
@@ -440,6 +523,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_switch_revoke_table(journal);
+ /*
+ * Reserved credits cannot be claimed anymore, free them
+ */
+ atomic_sub(atomic_read(&journal->j_reserved_credits),
+ &commit_transaction->t_outstanding_credits);
+
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -453,7 +542,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
wake_up(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
- jbd_debug(3, "JBD2: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2a\n");
/*
* Now start flushing things to disk, in the order they appear
@@ -465,10 +554,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug);
jbd2_journal_write_revoke_records(journal, commit_transaction,
- WRITE_SYNC);
- blk_finish_plug(&plug);
+ &log_bufs, WRITE_SYNC);
- jbd_debug(3, "JBD2: commit phase 2\n");
+ jbd_debug(3, "JBD2: commit phase 2b\n");
/*
* Way to go: we have now written out all of the data for a
@@ -491,9 +579,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
atomic_read(&commit_transaction->t_outstanding_credits));
err = 0;
- descriptor = NULL;
bufs = 0;
- blk_start_plug(&plug);
+ descriptor = NULL;
while (commit_transaction->t_buffers) {
/* Find the next buffer to be journaled... */
@@ -524,8 +611,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
record the metadata buffer. */
if (!descriptor) {
- struct buffer_head *bh;
-
J_ASSERT (bufs == 0);
jbd_debug(4, "JBD2: get descriptor\n");
@@ -536,26 +621,26 @@ void jbd2_journal_commit_transaction(journal_t *journal)
continue;
}
- bh = jh2bh(descriptor);
jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
- (unsigned long long)bh->b_blocknr, bh->b_data);
- header = (journal_header_t *)&bh->b_data[0];
+ (unsigned long long)descriptor->b_blocknr,
+ descriptor->b_data);
+ header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
- tagp = &bh->b_data[sizeof(journal_header_t)];
- space_left = bh->b_size - sizeof(journal_header_t);
+ tagp = &descriptor->b_data[sizeof(journal_header_t)];
+ space_left = descriptor->b_size -
+ sizeof(journal_header_t);
first_tag = 1;
- set_buffer_jwrite(bh);
- set_buffer_dirty(bh);
- wbuf[bufs++] = bh;
+ set_buffer_jwrite(descriptor);
+ set_buffer_dirty(descriptor);
+ wbuf[bufs++] = descriptor;
/* Record it so that we can wait for IO
completion later */
- BUFFER_TRACE(bh, "ph3: file as descriptor");
- jbd2_journal_file_buffer(descriptor, commit_transaction,
- BJ_LogCtl);
+ BUFFER_TRACE(descriptor, "ph3: file as descriptor");
+ jbd2_file_log_bh(&log_bufs, descriptor);
}
/* Where is the buffer to be written? */
@@ -578,29 +663,22 @@ void jbd2_journal_commit_transaction(journal_t *journal)
/* Bump b_count to prevent truncate from stumbling over
the shadowed buffer! @@@ This can go if we ever get
- rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+ rid of the shadow pairing of buffers. */
atomic_inc(&jh2bh(jh)->b_count);
- /* Make a temporary IO buffer with which to write it out
- (this will requeue both the metadata buffer and the
- temporary IO buffer). new_bh goes on BJ_IO*/
-
- set_bit(BH_JWrite, &jh2bh(jh)->b_state);
/*
- * akpm: jbd2_journal_write_metadata_buffer() sets
- * new_bh->b_transaction to commit_transaction.
- * We need to clean this up before we release new_bh
- * (which is of type BJ_IO)
+ * Make a temporary IO buffer with which to write it out
+ * (this will requeue the metadata buffer to BJ_Shadow).
*/
+ set_bit(BH_JWrite, &jh2bh(jh)->b_state);
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
- jh, &new_jh, blocknr);
+ jh, &wbuf[bufs], blocknr);
if (flags < 0) {
jbd2_journal_abort(journal, flags);
continue;
}
- set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
- wbuf[bufs++] = jh2bh(new_jh);
+ jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
/* Record the new block's tag in the current descriptor
buffer */
@@ -613,9 +691,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag = (journal_block_tag_t *) tagp;
write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
- tag->t_flags = cpu_to_be32(tag_flag);
+ tag->t_flags = cpu_to_be16(tag_flag);
+ jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
+ commit_transaction->t_tid);
tagp += tag_bytes;
space_left -= tag_bytes;
+ bufs++;
if (first_tag) {
memcpy (tagp, journal->j_uuid, 16);
@@ -629,7 +710,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (bufs == journal->j_wbufsize ||
commit_transaction->t_buffers == NULL ||
- space_left < tag_bytes + 16) {
+ space_left < tag_bytes + 16 + csum_size) {
jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
@@ -637,8 +718,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
submitting the IOs. "tag" still points to
the last tag we set up. */
- tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
+ tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
+ jbd2_descr_block_csum_set(journal, descriptor);
start_journal_io:
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
@@ -677,10 +759,30 @@ start_journal_io:
err = 0;
}
+ /*
+ * Get current oldest transaction in the log before we issue flush
+ * to the filesystem device. After the flush we can be sure that
+ * blocks of all older transactions are checkpointed to persistent
+ * storage and we will be safe to update journal start in the
+ * superblock with the numbers we get here.
+ */
+ update_tail =
+ jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
+
write_lock(&journal->j_state_lock);
+ if (update_tail) {
+ long freed = first_block - journal->j_tail;
+
+ if (first_block < journal->j_tail)
+ freed += journal->j_last - journal->j_first;
+ /* Update tail only if we free significant amount of space */
+ if (freed < journal->j_maxlen / 4)
+ update_tail = 0;
+ }
J_ASSERT(commit_transaction->t_state == T_COMMIT);
commit_transaction->t_state = T_COMMIT_DFLUSH;
write_unlock(&journal->j_state_lock);
+
/*
* If the journal is not located on the file system device,
* then we must flush the file system device before we issue
@@ -689,7 +791,7 @@ start_journal_io:
if (commit_transaction->t_need_data_flush &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+ blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
/* Done it all: now write the commit record asynchronously. */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -706,7 +808,7 @@ start_journal_io:
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
transaction's t_log_list queue, and metadata buffers are on
- the t_iobuf_list queue.
+ the io_bufs list.
Wait for the buffers in reverse order. That way we are
less likely to be woken up until all IOs have completed, and
@@ -715,47 +817,33 @@ start_journal_io:
jbd_debug(3, "JBD2: commit phase 3\n");
- /*
- * akpm: these are BJ_IO, and j_list_lock is not needed.
- * See __journal_try_to_free_buffer.
- */
-wait_for_iobuf:
- while (commit_transaction->t_iobuf_list != NULL) {
- struct buffer_head *bh;
+ while (!list_empty(&io_bufs)) {
+ struct buffer_head *bh = list_entry(io_bufs.prev,
+ struct buffer_head,
+ b_assoc_buffers);
- jh = commit_transaction->t_iobuf_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_iobuf;
- }
- if (cond_resched())
- goto wait_for_iobuf;
+ wait_on_buffer(bh);
+ cond_resched();
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
-
- clear_buffer_jwrite(bh);
-
- JBUFFER_TRACE(jh, "ph4: unfile after journal write");
- jbd2_journal_unfile_buffer(journal, jh);
+ jbd2_unfile_log_bh(bh);
/*
- * ->t_iobuf_list should contain only dummy buffer_heads
- * which were created by jbd2_journal_write_metadata_buffer().
+ * The list contains temporary buffer heads created by
+ * jbd2_journal_write_metadata_buffer().
*/
BUFFER_TRACE(bh, "dumping temporary bh");
- jbd2_journal_put_journal_head(jh);
__brelse(bh);
J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
free_buffer_head(bh);
- /* We also have to unlock and free the corresponding
- shadowed buffer */
+ /* We also have to refile the corresponding shadowed buffer */
jh = commit_transaction->t_shadow_list->b_tprev;
bh = jh2bh(jh);
- clear_bit(BH_JWrite, &bh->b_state);
+ clear_buffer_jwrite(bh);
J_ASSERT_BH(bh, buffer_jbddirty(bh));
+ J_ASSERT_BH(bh, !buffer_shadow(bh));
/* The metadata is now released for reuse, but we need
to remember it against this transaction so that when
@@ -763,14 +851,6 @@ wait_for_iobuf:
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
- /*
- * Wake up any transactions which were waiting for this IO to
- * complete. The barrier must be here so that changes by
- * jbd2_journal_file_buffer() take effect before wake_up_bit()
- * does the waitqueue check.
- */
- smp_mb();
- wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
}
@@ -780,26 +860,19 @@ wait_for_iobuf:
jbd_debug(3, "JBD2: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
- wait_for_ctlbuf:
- while (commit_transaction->t_log_list != NULL) {
+ while (!list_empty(&log_bufs)) {
struct buffer_head *bh;
- jh = commit_transaction->t_log_list->b_tprev;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- wait_on_buffer(bh);
- goto wait_for_ctlbuf;
- }
- if (cond_resched())
- goto wait_for_ctlbuf;
+ bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
+ wait_on_buffer(bh);
+ cond_resched();
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
- jbd2_journal_unfile_buffer(journal, jh);
- jbd2_journal_put_journal_head(jh);
+ jbd2_unfile_log_bh(bh);
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
@@ -825,12 +898,20 @@ wait_for_iobuf:
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
journal->j_flags & JBD2_BARRIER) {
- blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
+ blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
}
if (err)
jbd2_journal_abort(journal, err);
+ /*
+ * Now disk caches for filesystem device are flushed so we are safe to
+ * erase checkpointed transactions from the log by updating journal
+ * superblock.
+ */
+ if (update_tail)
+ jbd2_update_log_tail(journal, first_tid, first_block);
+
/* End of a transaction! Finally, we can do checkpoint
processing: any buffers committed as a result of this
transaction can be removed from any checkpoint list it was on
@@ -841,9 +922,7 @@ wait_for_iobuf:
J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
- J_ASSERT(commit_transaction->t_iobuf_list == NULL);
J_ASSERT(commit_transaction->t_shadow_list == NULL);
- J_ASSERT(commit_transaction->t_log_list == NULL);
restart_loop:
/*
@@ -908,17 +987,35 @@ restart_loop:
* there's no point in keeping a checkpoint record for
* it. */
- /* A buffer which has been freed while still being
- * journaled by a previous transaction may end up still
- * being dirty here, but we want to avoid writing back
- * that buffer in the future after the "add to orphan"
- * operation been committed, That's not only a performance
- * gain, it also stops aliasing problems if the buffer is
- * left behind for writeback and gets reallocated for another
- * use in a different page. */
- if (buffer_freed(bh) && !jh->b_next_transaction) {
- clear_buffer_freed(bh);
- clear_buffer_jbddirty(bh);
+ /*
+ * A buffer which has been freed while still being journaled by
+ * a previous transaction.
+ */
+ if (buffer_freed(bh)) {
+ /*
+ * If the running transaction is the one containing
+ * "add to orphan" operation (b_next_transaction !=
+ * NULL), we have to wait for that transaction to
+ * commit before we can really get rid of the buffer.
+ * So just clear b_modified to not confuse transaction
+ * credit accounting and refile the buffer to
+ * BJ_Forget of the running transaction. If the just
+ * committed transaction contains "add to orphan"
+ * operation, we can completely invalidate the buffer
+ * now. We are rather through in that since the
+ * buffer may be still accessible when blocksize <
+ * pagesize and it is attached to the last partial
+ * page.
+ */
+ jh->b_modified = 0;
+ if (!jh->b_next_transaction) {
+ clear_buffer_freed(bh);
+ clear_buffer_jbddirty(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_req(bh);
+ bh->b_bdev = NULL;
+ }
}
if (buffer_jbddirty(bh)) {
@@ -968,6 +1065,25 @@ restart_loop:
goto restart_loop;
}
+ /* Add the transaction to the checkpoint list
+ * __journal_remove_checkpoint() can not destroy transaction
+ * under us because it is not marked as T_FINISHED yet */
+ if (journal->j_checkpoint_transactions == NULL) {
+ journal->j_checkpoint_transactions = commit_transaction;
+ commit_transaction->t_cpnext = commit_transaction;
+ commit_transaction->t_cpprev = commit_transaction;
+ } else {
+ commit_transaction->t_cpnext =
+ journal->j_checkpoint_transactions;
+ commit_transaction->t_cpprev =
+ commit_transaction->t_cpnext->t_cpprev;
+ commit_transaction->t_cpnext->t_cpprev =
+ commit_transaction;
+ commit_transaction->t_cpprev->t_cpnext =
+ commit_transaction;
+ }
+ spin_unlock(&journal->j_list_lock);
+
/* Done with this transaction! */
jbd_debug(3, "JBD2: commit phase 7\n");
@@ -986,23 +1102,9 @@ restart_loop:
atomic_read(&commit_transaction->t_handle_count);
trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
commit_transaction->t_tid, &stats.run);
+ stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
- /*
- * Calculate overall stats
- */
- spin_lock(&journal->j_history_lock);
- journal->j_stats.ts_tid++;
- journal->j_stats.run.rs_wait += stats.run.rs_wait;
- journal->j_stats.run.rs_running += stats.run.rs_running;
- journal->j_stats.run.rs_locked += stats.run.rs_locked;
- journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
- journal->j_stats.run.rs_logging += stats.run.rs_logging;
- journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
- journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
- journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
- spin_unlock(&journal->j_history_lock);
-
- commit_transaction->t_state = T_FINISHED;
+ commit_transaction->t_state = T_COMMIT_CALLBACK;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
@@ -1017,29 +1119,8 @@ restart_loop:
journal->j_average_commit_time*3) / 4;
else
journal->j_average_commit_time = commit_time;
- write_unlock(&journal->j_state_lock);
- if (commit_transaction->t_checkpoint_list == NULL &&
- commit_transaction->t_checkpoint_io_list == NULL) {
- __jbd2_journal_drop_transaction(journal, commit_transaction);
- to_free = 1;
- } else {
- if (journal->j_checkpoint_transactions == NULL) {
- journal->j_checkpoint_transactions = commit_transaction;
- commit_transaction->t_cpnext = commit_transaction;
- commit_transaction->t_cpprev = commit_transaction;
- } else {
- commit_transaction->t_cpnext =
- journal->j_checkpoint_transactions;
- commit_transaction->t_cpprev =
- commit_transaction->t_cpnext->t_cpprev;
- commit_transaction->t_cpnext->t_cpprev =
- commit_transaction;
- commit_transaction->t_cpprev->t_cpnext =
- commit_transaction;
- }
- }
- spin_unlock(&journal->j_list_lock);
+ write_unlock(&journal->j_state_lock);
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
@@ -1047,8 +1128,34 @@ restart_loop:
trace_jbd2_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
- if (to_free)
- kfree(commit_transaction);
+ write_lock(&journal->j_state_lock);
+ spin_lock(&journal->j_list_lock);
+ commit_transaction->t_state = T_FINISHED;
+ /* Check if the transaction can be dropped now that we are finished */
+ if (commit_transaction->t_checkpoint_list == NULL &&
+ commit_transaction->t_checkpoint_io_list == NULL) {
+ __jbd2_journal_drop_transaction(journal, commit_transaction);
+ jbd2_journal_free_transaction(commit_transaction);
+ }
+ spin_unlock(&journal->j_list_lock);
+ write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_done_commit);
+
+ /*
+ * Calculate overall stats
+ */
+ spin_lock(&journal->j_history_lock);
+ journal->j_stats.ts_tid++;
+ journal->j_stats.ts_requested += stats.ts_requested;
+ journal->j_stats.run.rs_wait += stats.run.rs_wait;
+ journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
+ journal->j_stats.run.rs_running += stats.run.rs_running;
+ journal->j_stats.run.rs_locked += stats.run.rs_locked;
+ journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
+ journal->j_stats.run.rs_logging += stats.run.rs_logging;
+ journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
+ journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
+ journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
+ spin_unlock(&journal->j_history_lock);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index c0a5f9f1b12..67b8e303946 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -35,7 +35,6 @@
#include <linux/kthread.h>
#include <linux/poison.h>
#include <linux/proc_fs.h>
-#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/math64.h>
#include <linux/hash.h>
@@ -50,7 +49,14 @@
#include <asm/uaccess.h>
#include <asm/page.h>
-#include <asm/system.h>
+
+#ifdef CONFIG_JBD2_DEBUG
+ushort jbd2_journal_enable_debug __read_mostly;
+EXPORT_SYMBOL(jbd2_journal_enable_debug);
+
+module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
+MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
+#endif
EXPORT_SYMBOL(jbd2_journal_extend);
EXPORT_SYMBOL(jbd2_journal_stop);
@@ -61,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
-EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
#if 0
EXPORT_SYMBOL(journal_sync_buffer);
@@ -71,7 +76,6 @@ EXPORT_SYMBOL(jbd2_journal_revoke);
EXPORT_SYMBOL(jbd2_journal_init_dev);
EXPORT_SYMBOL(jbd2_journal_init_inode);
-EXPORT_SYMBOL(jbd2_journal_update_format);
EXPORT_SYMBOL(jbd2_journal_check_used_features);
EXPORT_SYMBOL(jbd2_journal_check_available_features);
EXPORT_SYMBOL(jbd2_journal_set_features);
@@ -96,10 +100,65 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
EXPORT_SYMBOL(jbd2_inode_cache);
-static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
static int jbd2_journal_create_slab(size_t slab_size);
+#ifdef CONFIG_JBD2_DEBUG
+void __jbd2_debug(int level, const char *file, const char *func,
+ unsigned int line, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ if (level > jbd2_journal_enable_debug)
+ return;
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+ va_end(args);
+}
+EXPORT_SYMBOL(__jbd2_debug);
+#endif
+
+/* Checksumming functions */
+static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
+{
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
+}
+
+static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
+{
+ __u32 csum;
+ __be32 old_csum;
+
+ old_csum = sb->s_checksum;
+ sb->s_checksum = 0;
+ csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
+ sb->s_checksum = old_csum;
+
+ return cpu_to_be32(csum);
+}
+
+static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
+{
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ return sb->s_checksum == jbd2_superblock_csum(j, sb);
+}
+
+static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
+{
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return;
+
+ sb->s_checksum = jbd2_superblock_csum(j, sb);
+}
+
/*
* Helper function used to manage commit timeouts
*/
@@ -139,6 +198,8 @@ static int kjournald2(void *arg)
setup_timer(&journal->j_commit_timer, commit_timeout,
(unsigned long)current);
+ set_freezable();
+
/* Record that the journal thread is running */
journal->j_task = current;
wake_up(&journal->j_wait_done_commit);
@@ -241,8 +302,8 @@ static void journal_kill_thread(journal_t *journal)
journal->j_flags |= JBD2_UNMOUNT;
while (journal->j_task) {
- wake_up(&journal->j_wait_commit);
write_unlock(&journal->j_state_lock);
+ wake_up(&journal->j_wait_commit);
wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
write_lock(&journal->j_state_lock);
}
@@ -268,14 +329,12 @@ static void journal_kill_thread(journal_t *journal)
*
* If the source buffer has already been modified by a new transaction
* since we took the last commit snapshot, we use the frozen copy of
- * that data for IO. If we end up using the existing buffer_head's data
- * for the write, then we *have* to lock the buffer to prevent anyone
- * else from using and possibly modifying it while the IO is in
- * progress.
+ * that data for IO. If we end up using the existing buffer_head's data
+ * for the write, then we have to make sure nobody modifies it while the
+ * IO is in progress. do_get_write_access() handles this.
*
- * The function returns a pointer to the buffer_heads to be used for IO.
- *
- * We assume that the journal has already been locked in this function.
+ * The function returns a pointer to the buffer_head to be used for IO.
+ *
*
* Return value:
* <0: Error
@@ -288,15 +347,14 @@ static void journal_kill_thread(journal_t *journal)
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
- struct journal_head **jh_out,
- unsigned long long blocknr)
+ struct buffer_head **bh_out,
+ sector_t blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
- struct journal_head *new_jh;
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
@@ -325,17 +383,14 @@ retry_alloc:
}
/* keep subsequent assertions sane */
- new_bh->b_state = 0;
- init_buffer(new_bh, NULL, NULL);
atomic_set(&new_bh->b_count, 1);
- new_jh = jbd2_journal_add_journal_head(new_bh); /* This sleeps */
+ jbd_lock_bh_state(bh_in);
+repeat:
/*
* If a new transaction has already done a buffer copy-out, then
* we use that version of the data for the commit.
*/
- jbd_lock_bh_state(bh_in);
-repeat:
if (jh_in->b_frozen_data) {
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
@@ -345,7 +400,7 @@ repeat:
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
}
- mapped_data = kmap_atomic(new_page, KM_USER0);
+ mapped_data = kmap_atomic(new_page);
/*
* Fire data frozen trigger if data already wasn't frozen. Do this
* before checking for escaping, as the trigger may modify the magic
@@ -364,7 +419,7 @@ repeat:
need_copy_out = 1;
do_escape = 1;
}
- kunmap_atomic(mapped_data, KM_USER0);
+ kunmap_atomic(mapped_data);
/*
* Do we need to do a data copy?
@@ -375,7 +430,7 @@ repeat:
jbd_unlock_bh_state(bh_in);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
- jbd2_journal_put_journal_head(new_jh);
+ brelse(new_bh);
return -ENOMEM;
}
jbd_lock_bh_state(bh_in);
@@ -385,9 +440,9 @@ repeat:
}
jh_in->b_frozen_data = tmp;
- mapped_data = kmap_atomic(new_page, KM_USER0);
- memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
- kunmap_atomic(mapped_data, KM_USER0);
+ mapped_data = kmap_atomic(new_page);
+ memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
+ kunmap_atomic(mapped_data);
new_page = virt_to_page(tmp);
new_offset = offset_in_page(tmp);
@@ -406,20 +461,20 @@ repeat:
* copying, we can finally do so.
*/
if (do_escape) {
- mapped_data = kmap_atomic(new_page, KM_USER0);
+ mapped_data = kmap_atomic(new_page);
*((unsigned int *)(mapped_data + new_offset)) = 0;
- kunmap_atomic(mapped_data, KM_USER0);
+ kunmap_atomic(mapped_data);
}
set_bh_page(new_bh, new_page, new_offset);
- new_jh->b_transaction = NULL;
- new_bh->b_size = jh2bh(jh_in)->b_size;
- new_bh->b_bdev = transaction->t_journal->j_dev;
+ new_bh->b_size = bh_in->b_size;
+ new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
+ new_bh->b_private = bh_in;
set_buffer_mapped(new_bh);
set_buffer_dirty(new_bh);
- *jh_out = new_jh;
+ *bh_out = new_bh;
/*
* The to-be-written buffer needs to get moved to the io queue,
@@ -430,11 +485,9 @@ repeat:
spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
+ set_buffer_shadow(bh_in);
jbd_unlock_bh_state(bh_in);
- JBUFFER_TRACE(new_jh, "file as BJ_IO");
- jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
-
return do_escape | (done_copy_out << 1);
}
@@ -444,40 +497,15 @@ repeat:
*/
/*
- * __jbd2_log_space_left: Return the number of free blocks left in the journal.
- *
- * Called with the journal already locked.
- *
- * Called under j_state_lock
- */
-
-int __jbd2_log_space_left(journal_t *journal)
-{
- int left = journal->j_free;
-
- /* assert_spin_locked(&journal->j_state_lock); */
-
- /*
- * Be pessimistic here about the number of those free blocks which
- * might be required for log descriptor control blocks.
- */
-
-#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
-
- left -= MIN_LOG_RESERVED_BLOCKS;
-
- if (left <= 0)
- return 0;
- left -= (left >> 3);
- return left;
-}
-
-/*
* Called with j_state_lock locked for writing.
* Returns true if a transaction commit was started.
*/
int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
+ /* Return if the txn has already requested to be committed */
+ if (journal->j_commit_request == target)
+ return 0;
+
/*
* The only transaction we can possibly wait upon is the
* currently running transaction (if it exists). Otherwise,
@@ -494,6 +522,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
jbd_debug(1, "JBD2: requesting commit %d/%d\n",
journal->j_commit_request,
journal->j_commit_sequence);
+ journal->j_running_transaction->t_requested = jiffies;
wake_up(&journal->j_wait_commit);
return 1;
} else if (!tid_geq(journal->j_commit_request, target))
@@ -519,20 +548,17 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
}
/*
- * Force and wait upon a commit if the calling process is not within
- * transaction. This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
- *
- * We can only force the running transaction if we don't have an active handle;
- * otherwise, we will deadlock.
- *
- * Returns true if a transaction was started.
+ * Force and wait any uncommitted transactions. We can only force the running
+ * transaction if we don't have an active handle, otherwise, we will deadlock.
+ * Returns: <0 in case of error,
+ * 0 if nothing to commit,
+ * 1 if transaction was successfully committed.
*/
-int jbd2_journal_force_commit_nested(journal_t *journal)
+static int __jbd2_journal_force_commit(journal_t *journal)
{
transaction_t *transaction = NULL;
tid_t tid;
- int need_to_start = 0;
+ int need_to_start = 0, ret = 0;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction && !current->journal_info) {
@@ -543,16 +569,53 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
transaction = journal->j_committing_transaction;
if (!transaction) {
+ /* Nothing to commit */
read_unlock(&journal->j_state_lock);
- return 0; /* Nothing to retry */
+ return 0;
}
-
tid = transaction->t_tid;
read_unlock(&journal->j_state_lock);
if (need_to_start)
jbd2_log_start_commit(journal, tid);
- jbd2_log_wait_commit(journal, tid);
- return 1;
+ ret = jbd2_log_wait_commit(journal, tid);
+ if (!ret)
+ ret = 1;
+
+ return ret;
+}
+
+/**
+ * Force and wait upon a commit if the calling process is not within
+ * transaction. This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * @journal: journal to force
+ * Returns true if progress was made.
+ */
+int jbd2_journal_force_commit_nested(journal_t *journal)
+{
+ int ret;
+
+ ret = __jbd2_journal_force_commit(journal);
+ return ret > 0;
+}
+
+/**
+ * int journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * Caller want unconditional commit. We can only force the running transaction
+ * if we don't have an active handle, otherwise, we will deadlock.
+ */
+int jbd2_journal_force_commit(journal_t *journal)
+{
+ int ret;
+
+ J_ASSERT(!current->journal_info);
+ ret = __jbd2_journal_force_commit(journal);
+ if (ret > 0)
+ ret = 0;
+ return ret;
}
/*
@@ -576,8 +639,8 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
ret = 1;
} else if (journal->j_committing_transaction) {
/*
- * If ext3_write_super() recently started a commit, then we
- * have to wait for completion of that transaction
+ * If commit has been started, then we have to wait for
+ * completion of that transaction.
*/
if (ptid)
*ptid = journal->j_committing_transaction->t_tid;
@@ -639,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
read_lock(&journal->j_state_lock);
#ifdef CONFIG_JBD2_DEBUG
if (!tid_geq(journal->j_commit_request, tid)) {
- printk(KERN_EMERG
+ printk(KERN_ERR
"%s: error: j_commit_request=%d, tid=%d\n",
__func__, journal->j_commit_request, tid);
}
@@ -647,22 +710,51 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
while (tid_gt(tid, journal->j_commit_sequence)) {
jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
tid, journal->j_commit_sequence);
- wake_up(&journal->j_wait_commit);
read_unlock(&journal->j_state_lock);
+ wake_up(&journal->j_wait_commit);
wait_event(journal->j_wait_done_commit,
!tid_gt(tid, journal->j_commit_sequence));
read_lock(&journal->j_state_lock);
}
read_unlock(&journal->j_state_lock);
- if (unlikely(is_journal_aborted(journal))) {
- printk(KERN_EMERG "journal commit I/O error\n");
+ if (unlikely(is_journal_aborted(journal)))
err = -EIO;
- }
return err;
}
/*
+ * When this function returns the transaction corresponding to tid
+ * will be completed. If the transaction has currently running, start
+ * committing that transaction before waiting for it to complete. If
+ * the transaction id is stale, it is by definition already completed,
+ * so just return SUCCESS.
+ */
+int jbd2_complete_transaction(journal_t *journal, tid_t tid)
+{
+ int need_to_wait = 1;
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction &&
+ journal->j_running_transaction->t_tid == tid) {
+ if (journal->j_commit_request != tid) {
+ /* transaction not yet started, so request it */
+ read_unlock(&journal->j_state_lock);
+ jbd2_log_start_commit(journal, tid);
+ goto wait_commit;
+ }
+ } else if (!(journal->j_committing_transaction &&
+ journal->j_committing_transaction->t_tid == tid))
+ need_to_wait = 0;
+ read_unlock(&journal->j_state_lock);
+ if (!need_to_wait)
+ return 0;
+wait_commit:
+ return jbd2_log_wait_commit(journal, tid);
+}
+EXPORT_SYMBOL(jbd2_complete_transaction);
+
+/*
* Log buffer allocation routines:
*/
@@ -722,7 +814,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
-struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
{
struct buffer_head *bh;
unsigned long long blocknr;
@@ -741,7 +833,99 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
set_buffer_uptodate(bh);
unlock_buffer(bh);
BUFFER_TRACE(bh, "return this buffer");
- return jbd2_journal_add_journal_head(bh);
+ return bh;
+}
+
+/*
+ * Return tid of the oldest transaction in the journal and block in the journal
+ * where the transaction starts.
+ *
+ * If the journal is now empty, return which will be the next transaction ID
+ * we will write and where will that transaction start.
+ *
+ * The return value is 0 if journal tail cannot be pushed any further, 1 if
+ * it can.
+ */
+int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
+ unsigned long *block)
+{
+ transaction_t *transaction;
+ int ret;
+
+ read_lock(&journal->j_state_lock);
+ spin_lock(&journal->j_list_lock);
+ transaction = journal->j_checkpoint_transactions;
+ if (transaction) {
+ *tid = transaction->t_tid;
+ *block = transaction->t_log_start;
+ } else if ((transaction = journal->j_committing_transaction) != NULL) {
+ *tid = transaction->t_tid;
+ *block = transaction->t_log_start;
+ } else if ((transaction = journal->j_running_transaction) != NULL) {
+ *tid = transaction->t_tid;
+ *block = journal->j_head;
+ } else {
+ *tid = journal->j_transaction_sequence;
+ *block = journal->j_head;
+ }
+ ret = tid_gt(*tid, journal->j_tail_sequence);
+ spin_unlock(&journal->j_list_lock);
+ read_unlock(&journal->j_state_lock);
+
+ return ret;
+}
+
+/*
+ * Update information in journal structure and in on disk journal superblock
+ * about log tail. This function does not check whether information passed in
+ * really pushes log tail further. It's responsibility of the caller to make
+ * sure provided log tail information is valid (e.g. by holding
+ * j_checkpoint_mutex all the time between computing log tail and calling this
+ * function as is the case with jbd2_cleanup_journal_tail()).
+ *
+ * Requires j_checkpoint_mutex
+ */
+void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+{
+ unsigned long freed;
+
+ BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+
+ /*
+ * We cannot afford for write to remain in drive's caches since as
+ * soon as we update j_tail, next transaction can start reusing journal
+ * space and if we lose sb update during power failure we'd replay
+ * old transaction with possibly newly overwritten data.
+ */
+ jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+ write_lock(&journal->j_state_lock);
+ freed = block - journal->j_tail;
+ if (block < journal->j_tail)
+ freed += journal->j_last - journal->j_first;
+
+ trace_jbd2_update_log_tail(journal, tid, block, freed);
+ jbd_debug(1,
+ "Cleaning journal tail from %d to %d (offset %lu), "
+ "freeing %lu\n",
+ journal->j_tail_sequence, tid, block, freed);
+
+ journal->j_free += freed;
+ journal->j_tail_sequence = tid;
+ journal->j_tail = block;
+ write_unlock(&journal->j_state_lock);
+}
+
+/*
+ * This is a variaon of __jbd2_update_log_tail which checks for validity of
+ * provided log tail and locks j_checkpoint_mutex. So it is safe against races
+ * with other threads updating log tail.
+ */
+void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+{
+ mutex_lock(&journal->j_checkpoint_mutex);
+ if (tid_gt(tid, journal->j_tail_sequence))
+ __jbd2_update_log_tail(journal, tid, block);
+ mutex_unlock(&journal->j_checkpoint_mutex);
}
struct jbd2_stats_proc_session {
@@ -767,13 +951,18 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
if (v != SEQ_START_TOKEN)
return 0;
- seq_printf(seq, "%lu transaction, each up to %u blocks\n",
- s->stats->ts_tid,
- s->journal->j_max_transaction_buffers);
+ seq_printf(seq, "%lu transactions (%lu requested), "
+ "each up to %u blocks\n",
+ s->stats->ts_tid, s->stats->ts_requested,
+ s->journal->j_max_transaction_buffers);
if (s->stats->ts_tid == 0)
return 0;
seq_printf(seq, "average: \n %ums waiting for transaction\n",
jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
+ seq_printf(seq, " %ums request delay\n",
+ (s->stats->ts_requested == 0) ? 0 :
+ jiffies_to_msecs(s->stats->run.rs_request_delay /
+ s->stats->ts_requested));
seq_printf(seq, " %ums running transaction\n",
jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
seq_printf(seq, " %ums transaction was being locked\n",
@@ -806,7 +995,7 @@ static const struct seq_operations jbd2_seq_info_ops = {
static int jbd2_seq_info_open(struct inode *inode, struct file *file)
{
- journal_t *journal = PDE(inode)->data;
+ journal_t *journal = PDE_DATA(inode);
struct jbd2_stats_proc_session *s;
int rc, size;
@@ -889,11 +1078,10 @@ static journal_t * journal_init_common (void)
return NULL;
init_waitqueue_head(&journal->j_wait_transaction_locked);
- init_waitqueue_head(&journal->j_wait_logspace);
init_waitqueue_head(&journal->j_wait_done_commit);
- init_waitqueue_head(&journal->j_wait_checkpoint);
init_waitqueue_head(&journal->j_wait_commit);
init_waitqueue_head(&journal->j_wait_updates);
+ init_waitqueue_head(&journal->j_wait_reserved);
mutex_init(&journal->j_barrier);
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
@@ -903,6 +1091,7 @@ static journal_t * journal_init_common (void)
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
+ atomic_set(&journal->j_reserved_credits, 0);
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;
@@ -1112,40 +1301,46 @@ static int journal_reset(journal_t *journal)
journal->j_max_transaction_buffers = journal->j_maxlen / 4;
- /* Add the dynamic fields and write it to disk. */
- jbd2_journal_update_superblock(journal, 1);
- return jbd2_journal_start_thread(journal);
-}
-
-/**
- * void jbd2_journal_update_superblock() - Update journal sb on disk.
- * @journal: The journal to update.
- * @wait: Set to '0' if you don't want to wait for IO completion.
- *
- * Update a journal's dynamic superblock fields and write it to disk,
- * optionally waiting for the IO to complete.
- */
-void jbd2_journal_update_superblock(journal_t *journal, int wait)
-{
- journal_superblock_t *sb = journal->j_superblock;
- struct buffer_head *bh = journal->j_sb_buffer;
-
/*
* As a special case, if the on-disk copy is already marked as needing
- * no recovery (s_start == 0) and there are no outstanding transactions
- * in the filesystem, then we can safely defer the superblock update
- * until the next commit by setting JBD2_FLUSHED. This avoids
+ * no recovery (s_start == 0), then we can safely defer the superblock
+ * update until the next commit by setting JBD2_FLUSHED. This avoids
 * attempting a write to a potentially read-only device.
*/
- if (sb->s_start == 0 && journal->j_tail_sequence ==
- journal->j_transaction_sequence) {
+ if (sb->s_start == 0) {
jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
"(start %ld, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence,
journal->j_errno);
- goto out;
+ journal->j_flags |= JBD2_FLUSHED;
+ } else {
+ /* Lock here to make assertions happy... */
+ mutex_lock(&journal->j_checkpoint_mutex);
+ /*
+ * Update log tail information. We use WRITE_FUA since new
+ * transaction will start reusing journal space and so we
+ * must make sure information about current log tail is on
+ * disk before that.
+ */
+ jbd2_journal_update_sb_log_tail(journal,
+ journal->j_tail_sequence,
+ journal->j_tail,
+ WRITE_FUA);
+ mutex_unlock(&journal->j_checkpoint_mutex);
}
+ return jbd2_journal_start_thread(journal);
+}
+
+static void jbd2_write_superblock(journal_t *journal, int write_op)
+{
+ struct buffer_head *bh = journal->j_sb_buffer;
+ journal_superblock_t *sb = journal->j_superblock;
+ int ret;
+ trace_jbd2_write_superblock(journal, write_op);
+ if (!(journal->j_flags & JBD2_BARRIER))
+ write_op &= ~(REQ_FUA | REQ_FLUSH);
+ lock_buffer(bh);
if (buffer_write_io_error(bh)) {
/*
* Oh, dear. A previous attempt to write the journal
@@ -1161,48 +1356,113 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
clear_buffer_write_io_error(bh);
set_buffer_uptodate(bh);
}
+ jbd2_superblock_csum_set(journal, sb);
+ get_bh(bh);
+ bh->b_end_io = end_buffer_write_sync;
+ ret = submit_bh(write_op, bh);
+ wait_on_buffer(bh);
+ if (buffer_write_io_error(bh)) {
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ ret = -EIO;
+ }
+ if (ret) {
+ printk(KERN_ERR "JBD2: Error %d detected when updating "
+ "journal superblock for %s.\n", ret,
+ journal->j_devname);
+ }
+}
+
+/**
+ * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
+ * @journal: The journal to update.
+ * @tail_tid: TID of the new transaction at the tail of the log
+ * @tail_block: The first block of the transaction at the tail of the log
+ * @write_op: With which operation should we write the journal sb
+ *
+ * Update a journal's superblock information about log tail and write it to
+ * disk, waiting for the IO to complete.
+ */
+void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
+ unsigned long tail_block, int write_op)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+
+ BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+ jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
+ tail_block, tail_tid);
+ sb->s_sequence = cpu_to_be32(tail_tid);
+ sb->s_start = cpu_to_be32(tail_block);
+
+ jbd2_write_superblock(journal, write_op);
+
+ /* Log is no longer empty */
+ write_lock(&journal->j_state_lock);
+ WARN_ON(!sb->s_sequence);
+ journal->j_flags &= ~JBD2_FLUSHED;
+ write_unlock(&journal->j_state_lock);
+}
+
+/**
+ * jbd2_mark_journal_empty() - Mark the on-disk journal as empty.
+ * @journal: The journal to update.
+ *
+ * Update a journal's dynamic superblock fields to show that the journal is
+ * empty. Write the updated superblock to disk, waiting for the IO to
+ * complete.
+ */
+static void jbd2_mark_journal_empty(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+
+ BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
read_lock(&journal->j_state_lock);
- jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n",
- journal->j_tail, journal->j_tail_sequence, journal->j_errno);
+ /* Is it already empty? */
+ if (sb->s_start == 0) {
+ read_unlock(&journal->j_state_lock);
+ return;
+ }
+ jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
+ journal->j_tail_sequence);
sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
- sb->s_start = cpu_to_be32(journal->j_tail);
- sb->s_errno = cpu_to_be32(journal->j_errno);
+ sb->s_start = cpu_to_be32(0);
read_unlock(&journal->j_state_lock);
- BUFFER_TRACE(bh, "marking dirty");
- mark_buffer_dirty(bh);
- if (wait) {
- sync_dirty_buffer(bh);
- if (buffer_write_io_error(bh)) {
- printk(KERN_ERR "JBD2: I/O error detected "
- "when updating journal superblock for %s.\n",
- journal->j_devname);
- clear_buffer_write_io_error(bh);
- set_buffer_uptodate(bh);
- }
- } else
- write_dirty_buffer(bh, WRITE);
-
-out:
- /* If we have just flushed the log (by marking s_start==0), then
- * any future commit will have to be careful to update the
- * superblock again to re-record the true start of the log. */
+ jbd2_write_superblock(journal, WRITE_FUA);
+	/* Log is empty */
write_lock(&journal->j_state_lock);
- if (sb->s_start)
- journal->j_flags &= ~JBD2_FLUSHED;
- else
- journal->j_flags |= JBD2_FLUSHED;
+ journal->j_flags |= JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
}
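
Taken together with jbd2_journal_update_sb_log_tail() above, this gives
JBD2_FLUSHED a simple lifecycle; a summary sketch in comment form (drawn from
the two functions, not patch code):

	/*
	 * jbd2_mark_journal_empty():         sb->s_start = 0,  sets   JBD2_FLUSHED
	 * jbd2_journal_update_sb_log_tail(): sb->s_start != 0, clears JBD2_FLUSHED
	 *
	 * A commit that finds JBD2_FLUSHED set knows the on-disk superblock
	 * claims an empty log, so it must record a fresh log tail (under
	 * j_checkpoint_mutex) before it starts reusing journal space.
	 */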
+
+/**
+ * jbd2_journal_update_sb_errno() - Update error in the journal.
+ * @journal: The journal to update.
+ *
+ * Update a journal's errno. Write the updated superblock to disk, waiting
+ * for the IO to complete.
+ */
+void jbd2_journal_update_sb_errno(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+
+ read_lock(&journal->j_state_lock);
+ jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
+ journal->j_errno);
+ sb->s_errno = cpu_to_be32(journal->j_errno);
+ read_unlock(&journal->j_state_lock);
+
+ jbd2_write_superblock(journal, WRITE_SYNC);
+}
+EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
+
/*
* Read the superblock for a given journal, performing initial
* validation of the format.
*/
-
static int journal_get_superblock(journal_t *journal)
{
struct buffer_head *bh;
@@ -1222,6 +1482,9 @@ static int journal_get_superblock(journal_t *journal)
}
}
+ if (buffer_verified(bh))
+ return 0;
+
sb = journal->j_superblock;
err = -EINVAL;
@@ -1259,6 +1522,43 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
+ if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
+ JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ /* Can't have checksum v1 and v2 on at the same time! */
+ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
+ "at the same time!\n");
+ goto out;
+ }
+
+ if (!jbd2_verify_csum_type(journal, sb)) {
+ printk(KERN_ERR "JBD2: Unknown checksum type\n");
+ goto out;
+ }
+
+ /* Load the checksum driver */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(journal->j_chksum_driver)) {
+ printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
+ err = PTR_ERR(journal->j_chksum_driver);
+ journal->j_chksum_driver = NULL;
+ goto out;
+ }
+ }
+
+ /* Check superblock checksum */
+ if (!jbd2_superblock_csum_verify(journal, sb)) {
+ printk(KERN_ERR "JBD2: journal checksum error\n");
+ goto out;
+ }
+
+ /* Precompute checksum seed for all metadata */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ sizeof(sb->s_uuid));
+
+ set_buffer_verified(bh);
+
return 0;
out:
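
The jbd2_chksum() helper used for the seed above wraps the crc32c shash
allocated here. A minimal sketch of what it does, assuming the 3.x-era
definition in include/linux/jbd2.h (the demo_ name is illustrative):

	static inline u32 demo_jbd2_chksum(journal_t *journal, u32 crc,
					   const void *address, unsigned int length)
	{
		struct {
			struct shash_desc shash;
			char ctx[JBD2_MAX_CHECKSUM_SIZE];
		} desc;
		int err;

		desc.shash.tfm = journal->j_chksum_driver;
		desc.shash.flags = 0;
		*(u32 *)desc.ctx = crc;		/* seed the running crc32c */

		err = crypto_shash_update(&desc.shash, address, length);
		BUG_ON(err);

		return *(u32 *)desc.ctx;	/* updated crc returned via ctx */
	}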
@@ -1396,14 +1696,11 @@ int jbd2_journal_destroy(journal_t *journal)
if (journal->j_sb_buffer) {
if (!is_journal_aborted(journal)) {
- /* We can now mark the journal as empty. */
- journal->j_tail = 0;
- journal->j_tail_sequence =
- ++journal->j_transaction_sequence;
- jbd2_journal_update_superblock(journal, 1);
- } else {
+ mutex_lock(&journal->j_checkpoint_mutex);
+ jbd2_mark_journal_empty(journal);
+ mutex_unlock(&journal->j_checkpoint_mutex);
+ } else
err = -EIO;
- }
brelse(journal->j_sb_buffer);
}
@@ -1413,6 +1710,8 @@ int jbd2_journal_destroy(journal_t *journal)
iput(journal->j_inode);
if (journal->j_revoke)
jbd2_journal_destroy_revoke(journal);
+ if (journal->j_chksum_driver)
+ crypto_free_shash(journal->j_chksum_driver);
kfree(journal->j_wbuf);
kfree(journal);
@@ -1502,6 +1801,10 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
+#define INCOMPAT_FEATURE_ON(f) \
+ ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
+#define COMPAT_FEATURE_ON(f) \
+ ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
journal_superblock_t *sb;
if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
@@ -1510,16 +1813,54 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
return 0;
+ /* Asking for checksumming v2 and v1? Only give them v2. */
+ if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 &&
+ compat & JBD2_FEATURE_COMPAT_CHECKSUM)
+ compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
+
jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
compat, ro, incompat);
sb = journal->j_superblock;
+ /* If enabling v2 checksums, update superblock */
+ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+ sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
+ sb->s_feature_compat &=
+ ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
+
+ /* Load the checksum driver */
+ if (journal->j_chksum_driver == NULL) {
+ journal->j_chksum_driver = crypto_alloc_shash("crc32c",
+ 0, 0);
+ if (IS_ERR(journal->j_chksum_driver)) {
+ printk(KERN_ERR "JBD2: Cannot load crc32c "
+ "driver.\n");
+ journal->j_chksum_driver = NULL;
+ return 0;
+ }
+ }
+
+ /* Precompute checksum seed for all metadata */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ journal->j_csum_seed = jbd2_chksum(journal, ~0,
+ sb->s_uuid,
+ sizeof(sb->s_uuid));
+ }
+
+ /* If enabling v1 checksums, downgrade superblock */
+ if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
+ sb->s_feature_incompat &=
+ ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2);
+
sb->s_feature_compat |= cpu_to_be32(compat);
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
return 1;
+#undef COMPAT_FEATURE_ON
+#undef INCOMPAT_FEATURE_ON
}
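
A hypothetical caller enabling v2 checksums (the filesystem-side code is
illustrative; per the function above, jbd2_journal_set_features() returns 1 on
success and 0 on failure):

	if (!jbd2_journal_set_features(journal, 0, 0,
				       JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
		pr_warn("JBD2: could not enable journal checksumming\n");
		return -EOPNOTSUPP;
	}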
/*
@@ -1550,61 +1891,6 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
EXPORT_SYMBOL(jbd2_journal_clear_features);
/**
- * int jbd2_journal_update_format () - Update on-disk journal structure.
- * @journal: Journal to act on.
- *
- * Given an initialised but unloaded journal struct, poke about in the
- * on-disk structure to update it to the most recent supported version.
- */
-int jbd2_journal_update_format (journal_t *journal)
-{
- journal_superblock_t *sb;
- int err;
-
- err = journal_get_superblock(journal);
- if (err)
- return err;
-
- sb = journal->j_superblock;
-
- switch (be32_to_cpu(sb->s_header.h_blocktype)) {
- case JBD2_SUPERBLOCK_V2:
- return 0;
- case JBD2_SUPERBLOCK_V1:
- return journal_convert_superblock_v1(journal, sb);
- default:
- break;
- }
- return -EINVAL;
-}
-
-static int journal_convert_superblock_v1(journal_t *journal,
- journal_superblock_t *sb)
-{
- int offset, blocksize;
- struct buffer_head *bh;
-
- printk(KERN_WARNING
- "JBD2: Converting superblock from version 1 to 2.\n");
-
- /* Pre-initialise new fields to zero */
- offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
- blocksize = be32_to_cpu(sb->s_blocksize);
- memset(&sb->s_feature_compat, 0, blocksize-offset);
-
- sb->s_nr_users = cpu_to_be32(1);
- sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
- journal->j_format_version = 2;
-
- bh = journal->j_sb_buffer;
- BUFFER_TRACE(bh, "marking dirty");
- mark_buffer_dirty(bh);
- sync_dirty_buffer(bh);
- return 0;
-}
-
-
-/**
* int jbd2_journal_flush () - Flush journal
* @journal: Journal to act on.
*
@@ -1617,7 +1903,6 @@ int jbd2_journal_flush(journal_t *journal)
{
int err = 0;
transaction_t *transaction = NULL;
- unsigned long old_tail;
write_lock(&journal->j_state_lock);
@@ -1652,6 +1937,7 @@ int jbd2_journal_flush(journal_t *journal)
if (is_journal_aborted(journal))
return -EIO;
+ mutex_lock(&journal->j_checkpoint_mutex);
jbd2_cleanup_journal_tail(journal);
/* Finally, mark the journal as really needing no recovery.
@@ -1659,14 +1945,9 @@ int jbd2_journal_flush(journal_t *journal)
* the magic code for a fully-recovered superblock. Any future
* commits of data to the journal will restore the current
* s_start value. */
+ jbd2_mark_journal_empty(journal);
+ mutex_unlock(&journal->j_checkpoint_mutex);
write_lock(&journal->j_state_lock);
- old_tail = journal->j_tail;
- journal->j_tail = 0;
- write_unlock(&journal->j_state_lock);
- jbd2_journal_update_superblock(journal, 1);
- write_lock(&journal->j_state_lock);
- journal->j_tail = old_tail;
-
J_ASSERT(!journal->j_running_transaction);
J_ASSERT(!journal->j_committing_transaction);
J_ASSERT(!journal->j_checkpoint_transactions);
@@ -1706,8 +1987,12 @@ int jbd2_journal_wipe(journal_t *journal, int write)
write ? "Clearing" : "Ignoring");
err = jbd2_journal_skip_recovery(journal);
- if (write)
- jbd2_journal_update_superblock(journal, 1);
+ if (write) {
+ /* Lock to make assertions happy... */
+ mutex_lock(&journal->j_checkpoint_mutex);
+ jbd2_mark_journal_empty(journal);
+ mutex_unlock(&journal->j_checkpoint_mutex);
+ }
no_recovery:
return err;
@@ -1757,7 +2042,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
__jbd2_journal_abort_hard(journal);
if (errno)
- jbd2_journal_update_superblock(journal, 1);
+ jbd2_journal_update_sb_errno(journal);
}
/**
@@ -1880,10 +2165,16 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
*/
size_t journal_tag_bytes(journal_t *journal)
{
+ journal_block_tag_t tag;
+ size_t x = 0;
+
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ x += sizeof(tag.t_checksum);
+
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
- return JBD2_TAG_SIZE64;
+ return x + JBD2_TAG_SIZE64;
else
- return JBD2_TAG_SIZE32;
+ return x + JBD2_TAG_SIZE32;
}
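
Worked sizes, assuming the tag layout used by this series (t_checksum is a
__be16, JBD2_TAG_SIZE32 == 8, JBD2_TAG_SIZE64 == 12):

	/*
	 * feature combination     bytes per tag
	 * (none)                   8
	 * CSUM_V2                 10
	 * 64BIT                   12
	 * CSUM_V2 + 64BIT         14
	 */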
/*
@@ -2015,7 +2306,7 @@ static struct kmem_cache *jbd2_journal_head_cache;
static atomic_t nr_journal_heads = ATOMIC_INIT(0);
#endif
-static int journal_init_jbd2_journal_head_cache(void)
+static int jbd2_journal_init_journal_head_cache(void)
{
int retval;
@@ -2033,7 +2324,7 @@ static int journal_init_jbd2_journal_head_cache(void)
return retval;
}
-static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
+static void jbd2_journal_destroy_journal_head_cache(void)
{
if (jbd2_journal_head_cache) {
kmem_cache_destroy(jbd2_journal_head_cache);
@@ -2051,13 +2342,13 @@ static struct journal_head *journal_alloc_journal_head(void)
#ifdef CONFIG_JBD2_DEBUG
atomic_inc(&nr_journal_heads);
#endif
- ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
if (!ret) {
jbd_debug(1, "out of memory for journal_head\n");
pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
while (!ret) {
yield();
- ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+ ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
}
}
return ret;
@@ -2119,10 +2410,8 @@ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
struct journal_head *new_jh = NULL;
repeat:
- if (!buffer_jbd(bh)) {
+ if (!buffer_jbd(bh))
new_jh = journal_alloc_journal_head();
- memset(new_jh, 0, sizeof(*new_jh));
- }
jbd_lock_bh_journal_head(bh);
if (buffer_jbd(bh)) {
@@ -2257,45 +2546,6 @@ restart:
spin_unlock(&journal->j_list_lock);
}
-/*
- * debugfs tunables
- */
-#ifdef CONFIG_JBD2_DEBUG
-u8 jbd2_journal_enable_debug __read_mostly;
-EXPORT_SYMBOL(jbd2_journal_enable_debug);
-
-#define JBD2_DEBUG_NAME "jbd2-debug"
-
-static struct dentry *jbd2_debugfs_dir;
-static struct dentry *jbd2_debug;
-
-static void __init jbd2_create_debugfs_entry(void)
-{
- jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL);
- if (jbd2_debugfs_dir)
- jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME,
- S_IRUGO | S_IWUSR,
- jbd2_debugfs_dir,
- &jbd2_journal_enable_debug);
-}
-
-static void __exit jbd2_remove_debugfs_entry(void)
-{
- debugfs_remove(jbd2_debug);
- debugfs_remove(jbd2_debugfs_dir);
-}
-
-#else
-
-static void __init jbd2_create_debugfs_entry(void)
-{
-}
-
-static void __exit jbd2_remove_debugfs_entry(void)
-{
-}
-
-#endif
#ifdef CONFIG_PROC_FS
@@ -2321,7 +2571,7 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
-static int __init journal_init_handle_cache(void)
+static int __init jbd2_journal_init_handle_cache(void)
{
jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
if (jbd2_handle_cache == NULL) {
@@ -2356,17 +2606,20 @@ static int __init journal_init_caches(void)
ret = jbd2_journal_init_revoke_caches();
if (ret == 0)
- ret = journal_init_jbd2_journal_head_cache();
+ ret = jbd2_journal_init_journal_head_cache();
+ if (ret == 0)
+ ret = jbd2_journal_init_handle_cache();
if (ret == 0)
- ret = journal_init_handle_cache();
+ ret = jbd2_journal_init_transaction_cache();
return ret;
}
static void jbd2_journal_destroy_caches(void)
{
jbd2_journal_destroy_revoke_caches();
- jbd2_journal_destroy_jbd2_journal_head_cache();
+ jbd2_journal_destroy_journal_head_cache();
jbd2_journal_destroy_handle_cache();
+ jbd2_journal_destroy_transaction_cache();
jbd2_journal_destroy_slabs();
}
@@ -2378,7 +2631,6 @@ static int __init journal_init(void)
ret = journal_init_caches();
if (ret == 0) {
- jbd2_create_debugfs_entry();
jbd2_create_jbd_stats_proc_entry();
} else {
jbd2_journal_destroy_caches();
@@ -2391,9 +2643,8 @@ static void __exit journal_exit(void)
#ifdef CONFIG_JBD2_DEBUG
int n = atomic_read(&nr_journal_heads);
if (n)
- printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);
+ printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
#endif
- jbd2_remove_debugfs_entry();
jbd2_remove_jbd_stats_proc_entry();
jbd2_journal_destroy_caches();
}
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index da6d7baf139..3b6bb19d60b 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/crc32.h>
+#include <linux/blkdev.h>
#endif
/*
@@ -173,6 +174,25 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
return 0;
}
+static int jbd2_descr_block_csum_verify(journal_t *j,
+ void *buf)
+{
+ struct jbd2_journal_block_tail *tail;
+ __be32 provided;
+ __u32 calculated;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
+ sizeof(struct jbd2_journal_block_tail));
+ provided = tail->t_checksum;
+ tail->t_checksum = 0;
+ calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ tail->t_checksum = provided;
+
+ return provided == cpu_to_be32(calculated);
+}
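
The descriptor, commit, and revoke block verifiers in this file all follow the
same idiom, spelled out once here (a descriptive note, not patch code):

	/*
	 * 1. save the on-disk checksum field and zero it,
	 * 2. crc32c the whole block with the UUID-derived seed (j_csum_seed),
	 * 3. restore the saved value and compare it with the recomputed one.
	 *
	 * Zeroing the field first guarantees that the writer and the verifier
	 * checksum exactly the same bytes.
	 */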
/*
* Count the number of in-use tags in a journal descriptor block.
@@ -185,6 +205,9 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
int nr = 0, size = journal->j_blocksize;
int tag_bytes = journal_tag_bytes(journal);
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ size -= sizeof(struct jbd2_journal_block_tail);
+
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + tag_bytes) <= size) {
@@ -192,10 +215,10 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
nr++;
tagp += tag_bytes;
- if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
+ if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
tagp += 16;
- if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
+ if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
break;
}
@@ -265,7 +288,12 @@ int jbd2_journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
-
+ /* Make sure all replayed data is on permanent storage */
+ if (journal->j_flags & JBD2_BARRIER) {
+ err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+ if (!err)
+ err = err2;
+ }
return err;
}
@@ -350,6 +378,40 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
return 0;
}
+static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
+{
+ struct commit_header *h;
+ __be32 provided;
+ __u32 calculated;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ h = buf;
+ provided = h->h_chksum[0];
+ h->h_chksum[0] = 0;
+ calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ h->h_chksum[0] = provided;
+
+ return provided == cpu_to_be32(calculated);
+}
+
+static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
+ void *buf, __u32 sequence)
+{
+ __u32 csum32;
+ __be32 seq;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ seq = cpu_to_be32(sequence);
+ csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+ csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
+
+ return tag->t_checksum == cpu_to_be16(csum32);
+}
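
For reference, the write side computes the same two-step checksum; a hedged
sketch of the counterpart (the real helper, jbd2_block_tag_csum_set(), lives
in fs/jbd2/commit.c):

	static void demo_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
					    struct buffer_head *bh, __u32 sequence)
	{
		__u32 csum32;
		__be32 seq;

		if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
			return;

		/* Seed with the big-endian sequence number, then fold in the
		 * whole data block, exactly mirroring the verifier above. */
		seq = cpu_to_be32(sequence);
		csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
		csum32 = jbd2_chksum(j, csum32, bh->b_data, bh->b_size);

		tag->t_checksum = cpu_to_be16(csum32);	/* truncated to 16 bits */
	}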
+
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
@@ -363,6 +425,7 @@ static int do_one_pass(journal_t *journal,
int blocktype;
int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
+ int descr_csum_size = 0;
/*
* First thing is to establish what we expect to find in the log
@@ -448,6 +511,18 @@ static int do_one_pass(journal_t *journal,
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
+ /* Verify checksum first */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ descr_csum_size =
+ sizeof(struct jbd2_journal_block_tail);
+ if (descr_csum_size > 0 &&
+ !jbd2_descr_block_csum_verify(journal,
+ bh->b_data)) {
+ err = -EIO;
+ goto failed;
+ }
+
/* If it is a valid descriptor block, replay it
* in pass REPLAY; if journal_checksums enabled, then
* calculate checksums in PASS_SCAN, otherwise,
@@ -478,11 +553,11 @@ static int do_one_pass(journal_t *journal,
tagp = &bh->b_data[sizeof(journal_header_t)];
while ((tagp - bh->b_data + tag_bytes)
- <= journal->j_blocksize) {
+ <= journal->j_blocksize - descr_csum_size) {
unsigned long io_block;
tag = (journal_block_tag_t *) tagp;
- flags = be32_to_cpu(tag->t_flags);
+ flags = be16_to_cpu(tag->t_flags);
io_block = next_log_block++;
wrap(journal, next_log_block);
@@ -513,6 +588,19 @@ static int do_one_pass(journal_t *journal,
goto skip_write;
}
+ /* Look for block corruption */
+ if (!jbd2_block_tag_csum_verify(
+ journal, tag, obh->b_data,
+ be32_to_cpu(tmp->h_sequence))) {
+ brelse(obh);
+ success = -EIO;
+ printk(KERN_ERR "JBD2: Invalid "
+ "checksum recovering "
+ "block %llu in log\n",
+ blocknr);
+ continue;
+ }
+
/* Find a buffer for the new
* data being restored */
nbh = __getblk(journal->j_fs_dev,
@@ -647,6 +735,19 @@ static int do_one_pass(journal_t *journal,
}
crc32_sum = ~0;
}
+ if (pass == PASS_SCAN &&
+ !jbd2_commit_block_csum_verify(journal,
+ bh->b_data)) {
+ info->end_transaction = next_commit_ID;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ journal->j_failed_commit =
+ next_commit_ID;
+ brelse(bh);
+ break;
+ }
+ }
brelse(bh);
next_commit_ID++;
continue;
@@ -703,6 +804,25 @@ static int do_one_pass(journal_t *journal,
return err;
}
+static int jbd2_revoke_block_csum_verify(journal_t *j,
+ void *buf)
+{
+ struct jbd2_journal_revoke_tail *tail;
+ __be32 provided;
+ __u32 calculated;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return 1;
+
+ tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
+ sizeof(struct jbd2_journal_revoke_tail));
+ provided = tail->r_checksum;
+ tail->r_checksum = 0;
+ calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ tail->r_checksum = provided;
+
+ return provided == cpu_to_be32(calculated);
+}
/* Scan a revoke record, marking all blocks mentioned as revoked. */
@@ -717,6 +837,9 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
offset = sizeof(jbd2_journal_revoke_header_t);
max = be32_to_cpu(header->r_count);
+ if (!jbd2_revoke_block_csum_verify(journal, header))
+ return -EINVAL;
+
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
record_len = 8;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 30b2867d6cc..198c9c10276 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,9 +122,10 @@ struct jbd2_revoke_table_s
#ifdef __KERNEL__
static void write_one_revoke_record(journal_t *, transaction_t *,
- struct journal_head **, int *,
+ struct list_head *,
+ struct buffer_head **, int *,
struct jbd2_revoke_record_s *, int);
-static void flush_descriptor(journal_t *, struct journal_head *, int, int);
+static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
#endif
/* Utility functions to maintain the revoke table */
@@ -208,17 +209,13 @@ int __init jbd2_journal_init_revoke_caches(void)
J_ASSERT(!jbd2_revoke_record_cache);
J_ASSERT(!jbd2_revoke_table_cache);
- jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
- sizeof(struct jbd2_revoke_record_s),
- 0,
- SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
- NULL);
+ jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
+ SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
if (!jbd2_revoke_record_cache)
goto record_cache_failure;
- jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
- sizeof(struct jbd2_revoke_table_s),
- 0, SLAB_TEMPORARY, NULL);
+ jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
+ SLAB_TEMPORARY);
if (!jbd2_revoke_table_cache)
goto table_cache_failure;
return 0;
@@ -535,9 +532,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
*/
void jbd2_journal_write_revoke_records(journal_t *journal,
transaction_t *transaction,
+ struct list_head *log_bufs,
int write_op)
{
- struct journal_head *descriptor;
+ struct buffer_head *descriptor;
struct jbd2_revoke_record_s *record;
struct jbd2_revoke_table_s *revoke;
struct list_head *hash_list;
@@ -557,7 +555,7 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
while (!list_empty(hash_list)) {
record = (struct jbd2_revoke_record_s *)
hash_list->next;
- write_one_revoke_record(journal, transaction,
+ write_one_revoke_record(journal, transaction, log_bufs,
&descriptor, &offset,
record, write_op);
count++;
@@ -577,12 +575,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
static void write_one_revoke_record(journal_t *journal,
transaction_t *transaction,
- struct journal_head **descriptorp,
+ struct list_head *log_bufs,
+ struct buffer_head **descriptorp,
int *offsetp,
struct jbd2_revoke_record_s *record,
int write_op)
{
- struct journal_head *descriptor;
+ int csum_size = 0;
+ struct buffer_head *descriptor;
int offset;
journal_header_t *header;
@@ -596,9 +596,13 @@ static void write_one_revoke_record(journal_t *journal,
descriptor = *descriptorp;
offset = *offsetp;
+ /* Do we need to leave space at the end for a checksum? */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ csum_size = sizeof(struct jbd2_journal_revoke_tail);
+
/* Make sure we have a descriptor with space left for the record */
if (descriptor) {
- if (offset == journal->j_blocksize) {
+ if (offset >= journal->j_blocksize - csum_size) {
flush_descriptor(journal, descriptor, offset, write_op);
descriptor = NULL;
}
@@ -608,26 +612,26 @@ static void write_one_revoke_record(journal_t *journal,
descriptor = jbd2_journal_get_descriptor_buffer(journal);
if (!descriptor)
return;
- header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+ header = (journal_header_t *)descriptor->b_data;
header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
header->h_sequence = cpu_to_be32(transaction->t_tid);
/* Record it so that we can wait for IO completion later */
- JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
- jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+ BUFFER_TRACE(descriptor, "file in log_bufs");
+ jbd2_file_log_bh(log_bufs, descriptor);
offset = sizeof(jbd2_journal_revoke_header_t);
*descriptorp = descriptor;
}
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
- * ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
+ * ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
offset += 8;
} else {
- * ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
+ * ((__be32 *)(&descriptor->b_data[offset])) =
cpu_to_be32(record->blocknr);
offset += 4;
}
@@ -635,6 +639,21 @@ static void write_one_revoke_record(journal_t *journal,
*offsetp = offset;
}
+static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
+{
+ struct jbd2_journal_revoke_tail *tail;
+ __u32 csum;
+
+ if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ return;
+
+ tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
+ sizeof(struct jbd2_journal_revoke_tail));
+ tail->r_checksum = 0;
+ csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+ tail->r_checksum = cpu_to_be32(csum);
+}
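
Back-of-the-envelope capacity, assuming 4096-byte blocks, the 16-byte revoke
header, and the 4-byte checksum tail reserved above (illustrative arithmetic):

	/*
	 * usable bytes   = 4096 - 16 (header) - 4 (tail) = 4076
	 * 32-bit records: 4076 / 4 = 1019 revocations per descriptor block
	 * 64-bit records: 4076 / 8 =  509 revocations per descriptor block
	 */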
+
/*
* Flush a revoke descriptor out to the journal. If we are aborting,
* this is a noop; otherwise we are generating a buffer which needs to
@@ -643,23 +662,24 @@ static void write_one_revoke_record(journal_t *journal,
*/
static void flush_descriptor(journal_t *journal,
- struct journal_head *descriptor,
+ struct buffer_head *descriptor,
int offset, int write_op)
{
jbd2_journal_revoke_header_t *header;
- struct buffer_head *bh = jh2bh(descriptor);
if (is_journal_aborted(journal)) {
- put_bh(bh);
+ put_bh(descriptor);
return;
}
- header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+ header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
header->r_count = cpu_to_be32(offset);
- set_buffer_jwrite(bh);
- BUFFER_TRACE(bh, "write");
- set_buffer_dirty(bh);
- write_dirty_buffer(bh, write_op);
+ jbd2_revoke_csum_set(journal, descriptor);
+
+ set_buffer_jwrite(descriptor);
+ BUFFER_TRACE(descriptor, "write");
+ set_buffer_dirty(descriptor);
+ write_dirty_buffer(descriptor, write_op);
}
#endif
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 35ae096bed5..6f0f590cc5a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -30,9 +30,40 @@
#include <linux/bug.h>
#include <linux/module.h>
+#include <trace/events/jbd2.h>
+
static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
+static struct kmem_cache *transaction_cache;
+int __init jbd2_journal_init_transaction_cache(void)
+{
+ J_ASSERT(!transaction_cache);
+ transaction_cache = kmem_cache_create("jbd2_transaction_s",
+ sizeof(transaction_t),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
+ NULL);
+ if (transaction_cache)
+ return 0;
+ return -ENOMEM;
+}
+
+void jbd2_journal_destroy_transaction_cache(void)
+{
+ if (transaction_cache) {
+ kmem_cache_destroy(transaction_cache);
+ transaction_cache = NULL;
+ }
+}
+
+void jbd2_journal_free_transaction(transaction_t *transaction)
+{
+ if (unlikely(ZERO_OR_NULL_PTR(transaction)))
+ return;
+ kmem_cache_free(transaction_cache, transaction);
+}
+
/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
@@ -58,7 +89,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
- atomic_set(&transaction->t_outstanding_credits, 0);
+ atomic_set(&transaction->t_outstanding_credits,
+ atomic_read(&journal->j_reserved_credits));
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
INIT_LIST_HEAD(&transaction->t_private_list);
@@ -71,6 +103,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
journal->j_running_transaction = transaction;
transaction->t_max_wait = 0;
transaction->t_start = jiffies;
+ transaction->t_requested = 0;
return transaction;
}
@@ -109,6 +142,112 @@ static inline void update_t_max_wait(transaction_t *transaction,
}
/*
+ * Wait until the running transaction passes the T_LOCKED state. Also start
+ * the commit if needed. The function expects the running transaction to
+ * exist and releases j_state_lock.
+ */
+static void wait_transaction_locked(journal_t *journal)
+ __releases(journal->j_state_lock)
+{
+ DEFINE_WAIT(wait);
+ int need_to_start;
+ tid_t tid = journal->j_running_transaction->t_tid;
+
+ prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ TASK_UNINTERRUPTIBLE);
+ need_to_start = !tid_geq(journal->j_commit_request, tid);
+ read_unlock(&journal->j_state_lock);
+ if (need_to_start)
+ jbd2_log_start_commit(journal, tid);
+ schedule();
+ finish_wait(&journal->j_wait_transaction_locked, &wait);
+}
+
+static void sub_reserved_credits(journal_t *journal, int blocks)
+{
+ atomic_sub(blocks, &journal->j_reserved_credits);
+ wake_up(&journal->j_wait_reserved);
+}
+
+/*
+ * Wait until we can add credits for the handle to the running transaction.
+ * Called with j_state_lock held for reading. Returns 0 if the handle joined
+ * the running transaction. Returns 1 if we had to wait; in that case
+ * j_state_lock has been dropped and the caller must retry.
+ */
+static int add_transaction_credits(journal_t *journal, int blocks,
+ int rsv_blocks)
+{
+ transaction_t *t = journal->j_running_transaction;
+ int needed;
+ int total = blocks + rsv_blocks;
+
+ /*
+ * If the current transaction is locked down for commit, wait
+ * for the lock to be released.
+ */
+ if (t->t_state == T_LOCKED) {
+ wait_transaction_locked(journal);
+ return 1;
+ }
+
+ /*
+ * If there is not enough space left in the log to write all
+ * potential buffers requested by this operation, we need to
+ * stall pending a log checkpoint to free some more log space.
+ */
+ needed = atomic_add_return(total, &t->t_outstanding_credits);
+ if (needed > journal->j_max_transaction_buffers) {
+ /*
+ * If the current transaction is already too large,
+ * then start to commit it: we can then go back and
+ * attach this handle to a new transaction.
+ */
+ atomic_sub(total, &t->t_outstanding_credits);
+ wait_transaction_locked(journal);
+ return 1;
+ }
+
+ /*
+ * The commit code assumes that it can get enough log space
+ * without forcing a checkpoint. This is *critical* for
+ * correctness: a checkpoint of a buffer which is also
+ * associated with a committing transaction creates a deadlock,
+ * so commit simply cannot force through checkpoints.
+ *
+ * We must therefore ensure the necessary space in the journal
+ * *before* starting to dirty potentially checkpointed buffers
+ * in the new transaction.
+ */
+ if (jbd2_log_space_left(journal) < jbd2_space_needed(journal)) {
+ atomic_sub(total, &t->t_outstanding_credits);
+ read_unlock(&journal->j_state_lock);
+ write_lock(&journal->j_state_lock);
+ if (jbd2_log_space_left(journal) < jbd2_space_needed(journal))
+ __jbd2_log_wait_for_space(journal);
+ write_unlock(&journal->j_state_lock);
+ return 1;
+ }
+
+ /* No reservation? We are done... */
+ if (!rsv_blocks)
+ return 0;
+
+ needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
+ /* We allow at most half of a transaction to be reserved */
+ if (needed > journal->j_max_transaction_buffers / 2) {
+ sub_reserved_credits(journal, rsv_blocks);
+ atomic_sub(total, &t->t_outstanding_credits);
+ read_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) + rsv_blocks
+ <= journal->j_max_transaction_buffers / 2);
+ return 1;
+ }
+ return 0;
+}
+
+/*
* start_this_handle: Given a handle, deal with any locking or stalling
* needed to make sure that there is enough journal space for the handle
* to begin. Attach the handle to a transaction and set up the
@@ -119,21 +258,28 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
gfp_t gfp_mask)
{
transaction_t *transaction, *new_transaction = NULL;
- tid_t tid;
- int needed, need_to_start;
- int nblocks = handle->h_buffer_credits;
+ int blocks = handle->h_buffer_credits;
+ int rsv_blocks = 0;
unsigned long ts = jiffies;
- if (nblocks > journal->j_max_transaction_buffers) {
+ /*
+	 * Since at most 1/2 of a transaction can be reserved, we can in
+	 * practice handle only 1/2 of the maximum transaction size per
+	 * operation.
+ */
+ if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
- current->comm, nblocks,
- journal->j_max_transaction_buffers);
+ current->comm, blocks,
+ journal->j_max_transaction_buffers / 2);
return -ENOSPC;
}
+ if (handle->h_rsv_handle)
+ rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
alloc_transaction:
if (!journal->j_running_transaction) {
- new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
+ new_transaction = kmem_cache_zalloc(transaction_cache,
+ gfp_mask);
if (!new_transaction) {
/*
* If __GFP_FS is not present, then we may be
@@ -162,12 +308,16 @@ repeat:
if (is_journal_aborted(journal) ||
(journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
read_unlock(&journal->j_state_lock);
- kfree(new_transaction);
+ jbd2_journal_free_transaction(new_transaction);
return -EROFS;
}
- /* Wait on the journal's transaction barrier if necessary */
- if (journal->j_barrier_count) {
+ /*
+ * Wait on the journal's transaction barrier if necessary. Specifically
+ * we allow reserved handles to proceed because otherwise commit could
+ * deadlock on page writeback not being able to complete.
+ */
+ if (!handle->h_reserved && journal->j_barrier_count) {
read_unlock(&journal->j_state_lock);
wait_event(journal->j_wait_transaction_locked,
journal->j_barrier_count == 0);
@@ -179,7 +329,8 @@ repeat:
if (!new_transaction)
goto alloc_transaction;
write_lock(&journal->j_state_lock);
- if (!journal->j_running_transaction) {
+ if (!journal->j_running_transaction &&
+ (handle->h_reserved || !journal->j_barrier_count)) {
jbd2_get_transaction(journal, new_transaction);
new_transaction = NULL;
}
@@ -189,85 +340,18 @@ repeat:
transaction = journal->j_running_transaction;
- /*
- * If the current transaction is locked down for commit, wait for the
- * lock to be released.
- */
- if (transaction->t_state == T_LOCKED) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&journal->j_wait_transaction_locked,
- &wait, TASK_UNINTERRUPTIBLE);
- read_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * If there is not enough space left in the log to write all potential
- * buffers requested by this operation, we need to stall pending a log
- * checkpoint to free some more log space.
- */
- needed = atomic_add_return(nblocks,
- &transaction->t_outstanding_credits);
-
- if (needed > journal->j_max_transaction_buffers) {
+ if (!handle->h_reserved) {
+ /* We may have dropped j_state_lock - restart in that case */
+ if (add_transaction_credits(journal, blocks, rsv_blocks))
+ goto repeat;
+ } else {
/*
- * If the current transaction is already too large, then start
- * to commit it: we can then go back and attach this handle to
- * a new transaction.
+		 * We have the handle reserved, so we are allowed to join the
+		 * T_LOCKED transaction and we don't have to check for
+		 * transaction size or journal space.
*/
- DEFINE_WAIT(wait);
-
- jbd_debug(2, "Handle %p starting new commit...\n", handle);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
- TASK_UNINTERRUPTIBLE);
- tid = transaction->t_tid;
- need_to_start = !tid_geq(journal->j_commit_request, tid);
- read_unlock(&journal->j_state_lock);
- if (need_to_start)
- jbd2_log_start_commit(journal, tid);
- schedule();
- finish_wait(&journal->j_wait_transaction_locked, &wait);
- goto repeat;
- }
-
- /*
- * The commit code assumes that it can get enough log space
- * without forcing a checkpoint. This is *critical* for
- * correctness: a checkpoint of a buffer which is also
- * associated with a committing transaction creates a deadlock,
- * so commit simply cannot force through checkpoints.
- *
- * We must therefore ensure the necessary space in the journal
- * *before* starting to dirty potentially checkpointed buffers
- * in the new transaction.
- *
- * The worst part is, any transaction currently committing can
- * reduce the free space arbitrarily. Be careful to account for
- * those buffers when checkpointing.
- */
-
- /*
- * @@@ AKPM: This seems rather over-defensive. We're giving commit
- * a _lot_ of headroom: 1/4 of the journal plus the size of
- * the committing transaction. Really, we only need to give it
- * committing_transaction->t_outstanding_credits plus "enough" for
- * the log control blocks.
- * Also, this test is inconsistent with the matching one in
- * jbd2_journal_extend().
- */
- if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
- jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
- atomic_sub(nblocks, &transaction->t_outstanding_credits);
- read_unlock(&journal->j_state_lock);
- write_lock(&journal->j_state_lock);
- if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
- __jbd2_log_wait_for_space(journal);
- write_unlock(&journal->j_state_lock);
- goto repeat;
+ sub_reserved_credits(journal, blocks);
+ handle->h_reserved = 0;
}
/* OK, account for the buffers that this operation expects to
@@ -275,16 +359,19 @@ repeat:
*/
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
+ handle->h_requested_credits = blocks;
+ handle->h_start_jiffies = jiffies;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
- jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
- handle, nblocks,
+ jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
+ handle, blocks,
atomic_read(&transaction->t_outstanding_credits),
- __jbd2_log_space_left(journal));
+ jbd2_log_space_left(journal));
read_unlock(&journal->j_state_lock);
+ current->journal_info = handle;
lock_map_acquire(&handle->h_lockdep_map);
- kfree(new_transaction);
+ jbd2_journal_free_transaction(new_transaction);
return 0;
}
@@ -296,7 +383,6 @@ static handle_t *new_handle(int nblocks)
handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
- memset(handle, 0, sizeof(*handle));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
@@ -313,15 +399,21 @@ static handle_t *new_handle(int nblocks)
*
* We make sure that the transaction can guarantee at least nblocks of
* modified buffers in the log. We block until the log can guarantee
- * that much space.
- *
- * This function is visible to journal users (like ext3fs), so is not
- * called with the journal already locked.
+ * that much space. Additionally, if rsv_blocks > 0, we also create another
+ * handle with rsv_blocks reserved blocks in the journal. This handle is
+ * stored in h_rsv_handle. It is not attached to any particular transaction
+ * and thus doesn't block transaction commit. If the caller uses this reserved
+ * handle, it has to set h_rsv_handle to NULL as otherwise jbd2_journal_stop()
+ * on the parent handle will dispose of the reserved one. The reserved handle
+ * has to be converted to a normal handle using jbd2_journal_start_reserved()
+ * before it can be used.
*
* Return a pointer to a newly allocated handle, or an ERR_PTR() value
* on failure.
*/
-handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
+ gfp_t gfp_mask, unsigned int type,
+ unsigned int line_no)
{
handle_t *handle = journal_current_handle();
int err;
@@ -338,15 +430,31 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask)
handle = new_handle(nblocks);
if (!handle)
return ERR_PTR(-ENOMEM);
+ if (rsv_blocks) {
+ handle_t *rsv_handle;
- current->journal_info = handle;
+ rsv_handle = new_handle(rsv_blocks);
+ if (!rsv_handle) {
+ jbd2_free_handle(handle);
+ return ERR_PTR(-ENOMEM);
+ }
+ rsv_handle->h_reserved = 1;
+ rsv_handle->h_journal = journal;
+ handle->h_rsv_handle = rsv_handle;
+ }
err = start_this_handle(journal, handle, gfp_mask);
if (err < 0) {
+ if (handle->h_rsv_handle)
+ jbd2_free_handle(handle->h_rsv_handle);
jbd2_free_handle(handle);
- current->journal_info = NULL;
- handle = ERR_PTR(err);
+ return ERR_PTR(err);
}
+ handle->h_type = type;
+ handle->h_line_no = line_no;
+ trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
+ handle->h_transaction->t_tid, type,
+ line_no, nblocks);
return handle;
}
EXPORT_SYMBOL(jbd2__journal_start);
@@ -354,10 +462,67 @@ EXPORT_SYMBOL(jbd2__journal_start);
handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
{
- return jbd2__journal_start(journal, nblocks, GFP_NOFS);
+ return jbd2__journal_start(journal, nblocks, 0, GFP_NOFS, 0, 0);
}
EXPORT_SYMBOL(jbd2_journal_start);
+void jbd2_journal_free_reserved(handle_t *handle)
+{
+ journal_t *journal = handle->h_journal;
+
+ WARN_ON(!handle->h_reserved);
+ sub_reserved_credits(journal, handle->h_buffer_credits);
+ jbd2_free_handle(handle);
+}
+EXPORT_SYMBOL(jbd2_journal_free_reserved);
+
+/**
+ * int jbd2_journal_start_reserved(handle_t *handle) - start reserved handle
+ * @handle: handle to start
+ *
+ * Start a handle that has been previously reserved by passing rsv_blocks to
+ * jbd2__journal_start(). This attaches @handle to the running transaction
+ * (or creates one if there's no transaction running). Unlike
+ * jbd2_journal_start() this function cannot block on journal commit,
+ * checkpointing, or similar stuff. It can block on memory allocation or on a
+ * frozen journal though.
+ *
+ * Return 0 on success, non-zero on error - handle is freed in that case.
+ */
+int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
+ unsigned int line_no)
+{
+ journal_t *journal = handle->h_journal;
+ int ret = -EIO;
+
+ if (WARN_ON(!handle->h_reserved)) {
+ /* Someone passed in normal handle? Just stop it. */
+ jbd2_journal_stop(handle);
+ return ret;
+ }
+ /*
+	 * The usefulness of mixing reserved and unreserved handles is
+	 * questionable. So far nobody seems to need it, so just error out.
+ */
+ if (WARN_ON(current->journal_info)) {
+ jbd2_journal_free_reserved(handle);
+ return ret;
+ }
+
+ handle->h_journal = NULL;
+ /*
+	 * GFP_NOFS is used here because callers are likely to come from
+	 * writeback or similarly constrained call sites.
+ */
+ ret = start_this_handle(journal, handle, GFP_NOFS);
+ if (ret < 0) {
+ jbd2_journal_free_reserved(handle);
+ return ret;
+ }
+ handle->h_type = type;
+ handle->h_line_no = line_no;
+ return 0;
+}
+EXPORT_SYMBOL(jbd2_journal_start_reserved);
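
A hedged end-to-end sketch of the reserved-handle API documented above (the
caller-side code and error values are illustrative):

	handle_t *handle, *rsv;

	handle = jbd2__journal_start(journal, 10 /* blocks */,
				     5 /* rsv_blocks */, GFP_NOFS, 0, 0);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rsv = handle->h_rsv_handle;
	/* Detach so jbd2_journal_stop() on the parent keeps the reservation. */
	handle->h_rsv_handle = NULL;
	jbd2_journal_stop(handle);

	/* ...later, e.g. from writeback, where blocking on commit is unsafe... */
	if (jbd2_journal_start_reserved(rsv, 0, 0))
		return -EIO;	/* rsv has already been freed on failure */
	/* ...modify up to 5 buffers under rsv... */
	jbd2_journal_stop(rsv);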
/**
* int jbd2_journal_extend() - extend buffer credits.
@@ -382,42 +547,53 @@ EXPORT_SYMBOL(jbd2_journal_start);
int jbd2_journal_extend(handle_t *handle, int nblocks)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
int result;
int wanted;
- result = -EIO;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- goto out;
+ return -EROFS;
+ journal = transaction->t_journal;
result = 1;
read_lock(&journal->j_state_lock);
/* Don't extend a locked-down transaction! */
- if (handle->h_transaction->t_state != T_RUNNING) {
+ if (transaction->t_state != T_RUNNING) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction not running\n", handle, nblocks);
goto error_out;
}
spin_lock(&transaction->t_handle_lock);
- wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
+ wanted = atomic_add_return(nblocks,
+ &transaction->t_outstanding_credits);
if (wanted > journal->j_max_transaction_buffers) {
jbd_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
+ atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto unlock;
}
- if (wanted > __jbd2_log_space_left(journal)) {
+ if (wanted + (wanted >> JBD2_CONTROL_BLOCKS_SHIFT) >
+ jbd2_log_space_left(journal)) {
jbd_debug(3, "denied handle %p %d blocks: "
"insufficient log space\n", handle, nblocks);
+ atomic_sub(nblocks, &transaction->t_outstanding_credits);
goto unlock;
}
+ trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
+ transaction->t_tid,
+ handle->h_type, handle->h_line_no,
+ handle->h_buffer_credits,
+ nblocks);
+
handle->h_buffer_credits += nblocks;
- atomic_add(nblocks, &transaction->t_outstanding_credits);
+ handle->h_requested_credits += nblocks;
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
@@ -425,7 +601,6 @@ unlock:
spin_unlock(&transaction->t_handle_lock);
error_out:
read_unlock(&journal->j_state_lock);
-out:
return result;
}
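
The extra "wanted >> JBD2_CONTROL_BLOCKS_SHIFT" term budgets log space for
descriptor/commit/revoke control blocks. A worked example, assuming
JBD2_CONTROL_BLOCKS_SHIFT == 5 as defined alongside jbd2_log_space_left():

	/*
	 * wanted             = 1024 data credits
	 * control overhead   = 1024 >> 5 = 32 blocks (~3%)
	 * required log space = 1024 + 32 = 1056 blocks
	 */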
@@ -442,19 +617,22 @@ out:
* to a running handle, a call to jbd2_journal_restart will commit the
* handle's transaction so far and reattach the handle to a new
 * transaction capable of guaranteeing the requested number of
- * credits.
+ * credits. We preserve the reserved handle if one is attached to the
+ * passed-in handle.
*/
int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
tid_t tid;
int need_to_start, ret;
+ WARN_ON(!transaction);
/* If we've had an abort of any type, don't even think about
* actually doing the restart! */
if (is_handle_aborted(handle))
return 0;
+ journal = transaction->t_journal;
/*
* First unlink the handle from its current transaction, and start the
@@ -467,12 +645,18 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask)
spin_lock(&transaction->t_handle_lock);
atomic_sub(handle->h_buffer_credits,
&transaction->t_outstanding_credits);
+ if (handle->h_rsv_handle) {
+ sub_reserved_credits(journal,
+ handle->h_rsv_handle->h_buffer_credits);
+ }
if (atomic_dec_and_test(&transaction->t_updates))
wake_up(&journal->j_wait_updates);
+ tid = transaction->t_tid;
spin_unlock(&transaction->t_handle_lock);
+ handle->h_transaction = NULL;
+ current->journal_info = NULL;
jbd_debug(2, "restarting handle %p\n", handle);
- tid = transaction->t_tid;
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
if (need_to_start)
@@ -509,6 +693,14 @@ void jbd2_journal_lock_updates(journal_t *journal)
write_lock(&journal->j_state_lock);
++journal->j_barrier_count;
+ /* Wait until there are no reserved handles */
+ if (atomic_read(&journal->j_reserved_credits)) {
+ write_unlock(&journal->j_state_lock);
+ wait_event(journal->j_wait_reserved,
+ atomic_read(&journal->j_reserved_credits) == 0);
+ write_lock(&journal->j_state_lock);
+ }
+
/* Wait until there are no running updates */
while (1) {
transaction_t *transaction = journal->j_running_transaction;
@@ -571,6 +763,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
}
+static int sleep_on_shadow_bh(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
/*
* If the buffer is already part of the current transaction, then there
* is nothing we need to do. If it is already part of a prior
@@ -586,16 +784,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
int force_copy)
{
struct buffer_head *bh;
- transaction_t *transaction;
+ transaction_t *transaction = handle->h_transaction;
journal_t *journal;
int error;
char *frozen_buffer = NULL;
int need_copy = 0;
+ unsigned long start_lock, time_lock;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
return -EROFS;
-
- transaction = handle->h_transaction;
journal = transaction->t_journal;
jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -606,9 +804,16 @@ repeat:
/* @@@ Need to check for errors here at some point. */
+ start_lock = jiffies;
lock_buffer(bh);
jbd_lock_bh_state(bh);
+ /* If it takes too long to lock the buffer, trace it */
+ time_lock = jbd2_time_diff(start_lock, jiffies);
+ if (time_lock > HZ/10)
+ trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
+ jiffies_to_msecs(time_lock));
+
/* We now hold the buffer lock so it is safe to query the buffer
* state. Is the buffer dirty?
*
@@ -698,41 +903,29 @@ repeat:
* journaled. If the primary copy is already going to
* disk then we cannot do copy-out here. */
- if (jh->b_jlist == BJ_Shadow) {
- DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
- wait_queue_head_t *wqh;
-
- wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
-
+ if (buffer_shadow(bh)) {
JBUFFER_TRACE(jh, "on shadow: sleep");
jbd_unlock_bh_state(bh);
- /* commit wakes up all shadow buffers after IO */
- for ( ; ; ) {
- prepare_to_wait(wqh, &wait.wait,
- TASK_UNINTERRUPTIBLE);
- if (jh->b_jlist != BJ_Shadow)
- break;
- schedule();
- }
- finish_wait(wqh, &wait.wait);
+ wait_on_bit(&bh->b_state, BH_Shadow,
+ sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
goto repeat;
}
- /* Only do the copy if the currently-owning transaction
- * still needs it. If it is on the Forget list, the
- * committing transaction is past that stage. The
- * buffer had better remain locked during the kmalloc,
- * but that should be true --- we hold the journal lock
- * still and the buffer is already on the BUF_JOURNAL
- * list so won't be flushed.
+ /*
+ * Only do the copy if the currently-owning transaction still
+ * needs it. If buffer isn't on BJ_Metadata list, the
+ * committing transaction is past that stage (here we use the
+ * fact that BH_Shadow is set under bh_state lock together with
+ * refiling to BJ_Shadow list and at this point we know the
+ * buffer doesn't have BH_Shadow set).
*
* Subtle point, though: if this is a get_undo_access,
* then we will be relying on the frozen_data to contain
* the new value of the committed_data record after the
* transaction, so we HAVE to force the frozen_data copy
- * in that case. */
-
- if (jh->b_jlist != BJ_Forget || force_copy) {
+ * in that case.
+ */
+ if (jh->b_jlist == BJ_Metadata || force_copy) {
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
@@ -741,7 +934,7 @@ repeat:
jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS);
if (!frozen_buffer) {
- printk(KERN_EMERG
+ printk(KERN_ERR
"%s: OOM for frozen_buffer\n",
__func__);
JBUFFER_TRACE(jh, "oom!");
@@ -783,12 +976,12 @@ done:
"Possible IO failure.\n");
page = jh2bh(jh)->b_page;
offset = offset_in_page(jh2bh(jh)->b_data);
- source = kmap_atomic(page, KM_USER0);
+ source = kmap_atomic(page);
/* Fire data frozen trigger just before we copy the data */
jbd2_buffer_frozen_trigger(jh, source + offset,
jh->b_triggers);
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
- kunmap_atomic(source, KM_USER0);
+ kunmap_atomic(source);
/*
* Now that the frozen data is saved off, we need to store
@@ -859,14 +1052,16 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh = jbd2_journal_add_journal_head(bh);
int err;
jbd_debug(5, "journal_head %p\n", jh);
+ WARN_ON(!transaction);
err = -EROFS;
if (is_handle_aborted(handle))
goto out;
+ journal = transaction->t_journal;
err = 0;
JBUFFER_TRACE(jh, "entry");
@@ -878,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
* reused here.
*/
jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
jh->b_transaction == NULL ||
(jh->b_transaction == journal->j_committing_transaction &&
@@ -901,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
jh->b_modified = 0;
JBUFFER_TRACE(jh, "file as BJ_Reserved");
+ spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
} else if (jh->b_transaction == journal->j_committing_transaction) {
/* first access by this transaction */
jh->b_modified = 0;
JBUFFER_TRACE(jh, "set next transaction");
+ spin_lock(&journal->j_list_lock);
jh->b_next_transaction = transaction;
}
spin_unlock(&journal->j_list_lock);
@@ -973,7 +1169,7 @@ repeat:
if (!jh->b_committed_data) {
committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
if (!committed_data) {
- printk(KERN_EMERG "%s: No memory for committed data\n",
+ printk(KERN_ERR "%s: No memory for committed data\n",
__func__);
err = -ENOMEM;
goto out;
@@ -1016,9 +1212,12 @@ out:
void jbd2_journal_set_triggers(struct buffer_head *bh,
struct jbd2_buffer_trigger_type *type)
{
- struct journal_head *jh = bh2jh(bh);
+ struct journal_head *jh = jbd2_journal_grab_journal_head(bh);
+ if (WARN_ON(!jh))
+ return;
jh->b_triggers = type;
+ jbd2_journal_put_journal_head(jh);
}
void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
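[Editor's note: jbd2_journal_set_triggers() now takes its own temporary reference via jbd2_journal_grab_journal_head() instead of assuming the journal_head exists, so a racing detach can no longer hand it a stale pointer. The grab/put discipline, sketched with an illustrative update callback:]

    #include <linux/jbd2.h>

    /* Take a temporary reference on the buffer's journal_head, bail out
     * if the buffer is not (or no longer) journaled, and drop the
     * reference when done. update() stands in for the actual field
     * write (jh->b_triggers in the hunk above). */
    static int with_journal_head(struct buffer_head *bh,
                                 void (*update)(struct journal_head *jh))
    {
            struct journal_head *jh = jbd2_journal_grab_journal_head(bh);

            if (!jh)
                    return -EINVAL;
            update(jh);
            jbd2_journal_put_journal_head(jh);
            return 0;
    }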
@@ -1069,18 +1268,21 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- struct journal_head *jh = bh2jh(bh);
+ journal_t *journal;
+ struct journal_head *jh;
int ret = 0;
- jbd_debug(5, "journal_head %p\n", jh);
- JBUFFER_TRACE(jh, "entry");
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- goto out;
- if (!buffer_jbd(bh)) {
+ return -EROFS;
+ journal = transaction->t_journal;
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh) {
ret = -EUCLEAN;
goto out;
}
+ jbd_debug(5, "journal_head %p\n", jh);
+ JBUFFER_TRACE(jh, "entry");
jbd_lock_bh_state(bh);
@@ -1091,7 +1293,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
* once a transaction -bzzz
*/
jh->b_modified = 1;
- J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+ if (handle->h_buffer_credits <= 0) {
+ ret = -ENOSPC;
+ goto out_unlock_bh;
+ }
handle->h_buffer_credits--;
}
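[Editor's note: exhausted buffer credits are now reported as -ENOSPC instead of tripping J_ASSERT_JH(), which makes the condition recoverable but obliges callers to check the return value. A hedged caller sketch; the retry policy shown is illustrative, while jbd2_journal_extend() is the real API:]

    #include <linux/jbd2.h>

    /* Mark a metadata buffer dirty; on credit exhaustion try to extend
     * the handle once before reporting failure. jbd2_journal_extend()
     * returns 0 on success, >0 if the transaction cannot be extended
     * (caller must restart the handle), <0 on error. */
    static int dirty_metadata_checked(handle_t *handle,
                                      struct buffer_head *bh)
    {
            int err = jbd2_journal_dirty_metadata(handle, bh);

            if (err == -ENOSPC) {
                    err = jbd2_journal_extend(handle, 8);
                    if (err == 0)
                            err = jbd2_journal_dirty_metadata(handle, bh);
            }
            return err;
    }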
@@ -1106,9 +1311,9 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "fastpath");
if (unlikely(jh->b_transaction !=
journal->j_running_transaction)) {
- printk(KERN_EMERG "JBD: %s: "
+ printk(KERN_ERR "JBD2: %s: "
"jh->b_transaction (%llu, %p, %u) != "
- "journal->j_running_transaction (%p, %u)",
+ "journal->j_running_transaction (%p, %u)\n",
journal->j_devname,
(unsigned long long) bh->b_blocknr,
jh->b_transaction,
@@ -1131,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
*/
if (jh->b_transaction != transaction) {
JBUFFER_TRACE(jh, "already on other transaction");
- if (unlikely(jh->b_transaction !=
- journal->j_committing_transaction)) {
- printk(KERN_EMERG "JBD: %s: "
- "jh->b_transaction (%llu, %p, %u) != "
- "journal->j_committing_transaction (%p, %u)",
+ if (unlikely(((jh->b_transaction !=
+ journal->j_committing_transaction)) ||
+ (jh->b_next_transaction != transaction))) {
+ printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
+ "bad jh for block %llu: "
+ "transaction (%p, %u), "
+ "jh->b_transaction (%p, %u), "
+ "jh->b_next_transaction (%p, %u), jlist %u\n",
journal->j_devname,
(unsigned long long) bh->b_blocknr,
+ transaction, transaction->t_tid,
jh->b_transaction,
- jh->b_transaction ? jh->b_transaction->t_tid : 0,
- journal->j_committing_transaction,
- journal->j_committing_transaction ?
- journal->j_committing_transaction->t_tid : 0);
- ret = -EINVAL;
- }
- if (unlikely(jh->b_next_transaction != transaction)) {
- printk(KERN_EMERG "JBD: %s: "
- "jh->b_next_transaction (%llu, %p, %u) != "
- "transaction (%p, %u)",
- journal->j_devname,
- (unsigned long long) bh->b_blocknr,
+ jh->b_transaction ?
+ jh->b_transaction->t_tid : 0,
jh->b_next_transaction,
jh->b_next_transaction ?
jh->b_next_transaction->t_tid : 0,
- transaction, transaction->t_tid);
+ jh->b_jlist);
+ WARN_ON(1);
ret = -EINVAL;
}
/* And this case is illegal: we can't reuse another
@@ -1167,27 +1367,16 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "file as BJ_Metadata");
spin_lock(&journal->j_list_lock);
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+ __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
jbd_unlock_bh_state(bh);
+ jbd2_journal_put_journal_head(jh);
out:
JBUFFER_TRACE(jh, "exit");
- WARN_ON(ret); /* All errors are bugs, so dump the stack */
return ret;
}
-/*
- * jbd2_journal_release_buffer: undo a get_write_access without any buffer
- * updates, if the update decided in the end that it didn't need access.
- *
- */
-void
-jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
-{
- BUFFER_TRACE(bh, "entry");
-}
-
/**
* void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
* @handle: transaction handle
@@ -1208,16 +1397,20 @@ jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
struct journal_head *jh;
int drop_reserve = 0;
int err = 0;
int was_modified = 0;
+ WARN_ON(!transaction);
+ if (is_handle_aborted(handle))
+ return -EROFS;
+ journal = transaction->t_journal;
+
BUFFER_TRACE(bh, "entry");
jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
if (!buffer_jbd(bh))
goto not_jbd;
@@ -1231,7 +1424,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
goto not_jbd;
}
- /* keep track of wether or not this transaction modified us */
+ /* keep track of whether or not this transaction modified us */
was_modified = jh->b_modified;
/*
@@ -1240,7 +1433,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
*/
jh->b_modified = 0;
- if (jh->b_transaction == handle->h_transaction) {
+ if (jh->b_transaction == transaction) {
J_ASSERT_JH(jh, !jh->b_frozen_data);
/* If we are forgetting a buffer which is already part
@@ -1270,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
* we know to remove the checkpoint after we commit.
*/
+ spin_lock(&journal->j_list_lock);
if (jh->b_cp_transaction) {
__jbd2_journal_temp_unlink_buffer(jh);
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
@@ -1282,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
goto drop;
}
}
+ spin_unlock(&journal->j_list_lock);
} else if (jh->b_transaction) {
J_ASSERT_JH(jh, (jh->b_transaction ==
journal->j_committing_transaction));
@@ -1293,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
if (jh->b_next_transaction) {
J_ASSERT(jh->b_next_transaction == transaction);
+ spin_lock(&journal->j_list_lock);
jh->b_next_transaction = NULL;
+ spin_unlock(&journal->j_list_lock);
/*
* only drop a reference if this transaction modified
@@ -1305,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
}
not_jbd:
- spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
__brelse(bh);
drop:
@@ -1335,19 +1531,21 @@ drop:
int jbd2_journal_stop(handle_t *handle)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
- int err, wait_for_commit = 0;
+ journal_t *journal;
+ int err = 0, wait_for_commit = 0;
tid_t tid;
pid_t pid;
+ if (!transaction)
+ goto free_and_exit;
+ journal = transaction->t_journal;
+
J_ASSERT(journal_current_handle() == handle);
if (is_handle_aborted(handle))
err = -EIO;
- else {
+ else
J_ASSERT(atomic_read(&transaction->t_updates) > 0);
- err = 0;
- }
if (--handle->h_ref > 0) {
jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1356,6 +1554,13 @@ int jbd2_journal_stop(handle_t *handle)
}
jbd_debug(4, "Handle %p going down\n", handle);
+ trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev,
+ transaction->t_tid,
+ handle->h_type, handle->h_line_no,
+ jiffies - handle->h_start_jiffies,
+ handle->h_sync, handle->h_requested_credits,
+ (handle->h_requested_credits -
+ handle->h_buffer_credits));
/*
* Implement synchronous transaction batching. If the handle
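[Editor's note: the new trace_jbd2_handle_stats call reports each handle's type, the source line that started it, its lifetime in jiffies, and how many requested credits went unused. Those fields are populated when the handle is started; a sketch assuming the extended jbd2__journal_start() signature this series introduces, with an illustrative type value:]

    #include <linux/jbd2.h>

    /* Start a handle annotated for per-handle statistics: type and
     * line number land in h_type/h_line_no and are emitted by
     * trace_jbd2_handle_stats() when the handle is stopped. */
    static handle_t *start_traced_handle(journal_t *journal, int credits)
    {
            return jbd2__journal_start(journal, credits,
                                       0,          /* no reserved credits */
                                       GFP_NOFS,
                                       1,          /* caller-defined handle type */
                                       __LINE__);  /* recorded in h_line_no */
    }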
@@ -1383,9 +1588,12 @@ int jbd2_journal_stop(handle_t *handle)
* to perform a synchronous write. We do this to detect the
* case where a single process is doing a stream of sync
* writes. No point in waiting for joiners in that case.
+ *
+ * Setting max_batch_time to 0 disables this completely.
*/
pid = current->pid;
- if (handle->h_sync && journal->j_last_sync_writer != pid) {
+ if (handle->h_sync && journal->j_last_sync_writer != pid &&
+ journal->j_max_batch_time) {
u64 commit_time, trans_time;
journal->j_last_sync_writer = pid;
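[Editor's note: with the extra journal->j_max_batch_time test, setting the tunable to zero now disables synchronous-transaction batching outright rather than merely shortening the sleep. When batching does happen, the adaptive sleep is bounded by the journal's min/max tunables; a sketch of that clamp, where the helper name is illustrative and the max_t/min_t bounds mirror the existing logic:]

    #include <linux/kernel.h>
    #include <linux/jbd2.h>

    /* Bound the adaptive batching sleep: commit_time is in ns, the
     * j_{min,max}_batch_time tunables are in microseconds. */
    static u64 clamp_batch_sleep(journal_t *journal, u64 commit_time)
    {
            commit_time = max_t(u64, commit_time,
                                1000 * journal->j_min_batch_time);
            commit_time = min_t(u64, commit_time,
                                1000 * journal->j_max_batch_time);
            return commit_time;
    }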
@@ -1461,33 +1669,13 @@ int jbd2_journal_stop(handle_t *handle)
lock_map_release(&handle->h_lockdep_map);
+ if (handle->h_rsv_handle)
+ jbd2_journal_free_reserved(handle->h_rsv_handle);
+free_and_exit:
jbd2_free_handle(handle);
return err;
}
-/**
- * int jbd2_journal_force_commit() - force any uncommitted transactions
- * @journal: journal to force
- *
- * For synchronous operations: force any uncommitted transactions
- * to disk. May seem kludgy, but it reuses all the handle batching
- * code in a very simple manner.
- */
-int jbd2_journal_force_commit(journal_t *journal)
-{
- handle_t *handle;
- int ret;
-
- handle = jbd2_journal_start(journal, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- } else {
- handle->h_sync = 1;
- ret = jbd2_journal_stop(handle);
- }
- return ret;
-}
-
/*
*
* List management code snippets: various functions for manipulating the
@@ -1544,14 +1732,14 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
- * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
- * of these pointers, it could go bad. Generally the caller needs to re-read
- * the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_shadow_list, t_log_list or
+ * t_reserved_list. If the caller is holding onto a copy of one of these
+ * pointers, it could go bad. Generally the caller needs to re-read the
+ * pointer from the transaction_t.
*
- * Called under j_list_lock. The journal may not be locked.
+ * Called under j_list_lock.
*/
-void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
+static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
{
struct journal_head **list = NULL;
transaction_t *transaction;
@@ -1577,15 +1765,9 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Forget:
list = &transaction->t_forget;
break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
@@ -1640,16 +1822,14 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
if (buffer_locked(bh) || buffer_dirty(bh))
goto out;
- if (jh->b_next_transaction != NULL)
+ if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+ if (jh->b_cp_transaction != NULL) {
/* written-back checkpointed metadata buffer */
- if (jh->b_jlist == BJ_None) {
- JBUFFER_TRACE(jh, "remove from checkpoint list");
- __jbd2_journal_remove_checkpoint(jh);
- }
+ JBUFFER_TRACE(jh, "remove from checkpoint list");
+ __jbd2_journal_remove_checkpoint(jh);
}
spin_unlock(&journal->j_list_lock);
out:
@@ -1813,12 +1993,12 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
* We're outside-transaction here. Either or both of j_running_transaction
* and j_committing_transaction may be NULL.
*/
-static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
+ int partial_page)
{
transaction_t *transaction;
struct journal_head *jh;
int may_free = 1;
- int ret;
BUFFER_TRACE(bh, "entry");
@@ -1850,10 +2030,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* clear the buffer dirty bit at latest at the moment when the
* transaction marking the buffer as freed in the filesystem
* structures is committed because from that moment on the
- * buffer can be reallocated and used by a different page.
+ * block can be reallocated and used by a different page.
* Since the block hasn't been freed yet but the inode has
* already been added to orphan list, it is safe for us to add
* the buffer to BJ_Forget list of the newest transaction.
+ *
+ * Also we have to clear buffer_mapped flag of a truncated buffer
+ * because the buffer_head may be attached to the page straddling
+ * i_size (can happen only when blocksize < pagesize) and thus the
+ * buffer_head can be reused when the file is extended again. So we end
+ * up keeping around invalidated buffers attached to transactions'
+ * BJ_Forget list just to stop checkpointing code from cleaning up
+ * the transaction this buffer was modified in.
*/
transaction = jh->b_transaction;
if (transaction == NULL) {
@@ -1880,13 +2068,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* committed, the buffer won't be needed any
* longer. */
JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
- ret = __dispose_buffer(jh,
+ may_free = __dispose_buffer(jh,
journal->j_running_transaction);
- jbd2_journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- write_unlock(&journal->j_state_lock);
- return ret;
+ goto zap_buffer;
} else {
/* There is no currently-running transaction. So the
* orphan record which we wrote for this file must have
@@ -1894,13 +2078,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* the committing transaction, if it exists. */
if (journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "give to committing trans");
- ret = __dispose_buffer(jh,
+ may_free = __dispose_buffer(jh,
journal->j_committing_transaction);
- jbd2_journal_put_journal_head(jh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- write_unlock(&journal->j_state_lock);
- return ret;
+ goto zap_buffer;
} else {
/* The orphan record's transaction has
* committed. We can cleanse this buffer */
@@ -1912,10 +2092,21 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
JBUFFER_TRACE(jh, "on committing transaction");
/*
* The buffer is committing, we simply cannot touch
- * it. So we just set j_next_transaction to the
- * running transaction (if there is one) and mark
- * buffer as freed so that commit code knows it should
- * clear dirty bits when it is done with the buffer.
+ * it. If the page is straddling i_size we have to wait
+ * for commit and try again.
+ */
+ if (partial_page) {
+ jbd2_journal_put_journal_head(jh);
+ spin_unlock(&journal->j_list_lock);
+ jbd_unlock_bh_state(bh);
+ write_unlock(&journal->j_state_lock);
+ return -EBUSY;
+ }
+ /*
+ * OK, buffer won't be reachable after truncate. We just set
+ * j_next_transaction to the running transaction (if there is
+ * one) and mark buffer as freed so that commit code knows it
+ * should clear dirty bits when it is done with the buffer.
*/
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
@@ -1938,6 +2129,15 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
zap_buffer:
+ /*
+ * This is tricky. Although the buffer is truncated, it may be reused
+ * if blocksize < pagesize and it is attached to the page straddling
+ * EOF. Since the buffer might have been added to BJ_Forget list of the
+ * running transaction, journal_get_write_access() won't clear
+ * b_modified and credit accounting gets confused. So clear b_modified
+ * here.
+ */
+ jh->b_modified = 0;
jbd2_journal_put_journal_head(jh);
zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
@@ -1949,6 +2149,8 @@ zap_buffer_unlocked:
clear_buffer_mapped(bh);
clear_buffer_req(bh);
clear_buffer_new(bh);
+ clear_buffer_delay(bh);
+ clear_buffer_unwritten(bh);
bh->b_bdev = NULL;
return may_free;
}
@@ -1957,23 +2159,32 @@ zap_buffer_unlocked:
- * void jbd2_journal_invalidatepage()
+ * int jbd2_journal_invalidatepage()
* @journal: journal to use for flush...
* @page: page to flush
- * @offset: length of page to invalidate.
- *
- * Reap page buffers containing data after offset in page.
+ * @offset: start of the range to invalidate
+ * @length: length of the range to invalidate
*
+ * Reap page buffers containing data in the specified range of the page.
+ * Can return -EBUSY if buffers are part of the committing transaction and
+ * the page is straddling i_size. Caller then has to wait for current commit
+ * and try again.
*/
-void jbd2_journal_invalidatepage(journal_t *journal,
- struct page *page,
- unsigned long offset)
+int jbd2_journal_invalidatepage(journal_t *journal,
+ struct page *page,
+ unsigned int offset,
+ unsigned int length)
{
struct buffer_head *head, *bh, *next;
+ unsigned int stop = offset + length;
unsigned int curr_off = 0;
+ int partial_page = (offset || length < PAGE_CACHE_SIZE);
int may_free = 1;
+ int ret = 0;
if (!PageLocked(page))
BUG();
if (!page_has_buffers(page))
- return;
+ return 0;
+
+ BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
@@ -1984,21 +2195,28 @@ void jbd2_journal_invalidatepage(journal_t *journal,
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
+ if (next_off > stop)
+ return 0;
+
if (offset <= curr_off) {
/* This block is wholly outside the truncation point */
lock_buffer(bh);
- may_free &= journal_unmap_buffer(journal, bh);
+ ret = journal_unmap_buffer(journal, bh, partial_page);
unlock_buffer(bh);
+ if (ret < 0)
+ return ret;
+ may_free &= ret;
}
curr_off = next_off;
bh = next;
} while (bh != head);
- if (!offset) {
+ if (!partial_page) {
if (may_free && try_to_free_buffers(page))
J_ASSERT(!page_has_buffers(page));
}
+ return 0;
}
/*
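[Editor's note: since jbd2_journal_invalidatepage() can now fail with -EBUSY for a committing buffer on a page straddling i_size, callers have to wait for the running commit and retry. A sketch of such a caller, modeled on the retry loop ext4 adopted around this change; the wrapper name is illustrative, and the page must be locked as usual:]

    #include <linux/jbd2.h>
    #include <linux/mm_types.h>

    /* Invalidate a range of a locked page, retrying while its buffers
     * are pinned by the committing transaction. */
    static int invalidatepage_retry(journal_t *journal, struct page *page,
                                    unsigned int offset, unsigned int length)
    {
            tid_t commit_tid;
            int ret;

            while (1) {
                    ret = jbd2_journal_invalidatepage(journal, page,
                                                      offset, length);
                    if (ret != -EBUSY)
                            return ret;
                    /* Find the committing transaction and wait it out. */
                    commit_tid = 0;
                    read_lock(&journal->j_state_lock);
                    if (journal->j_committing_transaction)
                            commit_tid =
                                    journal->j_committing_transaction->t_tid;
                    read_unlock(&journal->j_state_lock);
                    if (commit_tid)
                            jbd2_log_wait_commit(journal, commit_tid);
            }
    }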
@@ -2055,15 +2273,9 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Forget:
list = &transaction->t_forget;
break;
- case BJ_IO:
- list = &transaction->t_iobuf_list;
- break;
case BJ_Shadow:
list = &transaction->t_shadow_list;
break;
- case BJ_LogCtl:
- list = &transaction->t_log_list;
- break;
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
@@ -2165,10 +2377,12 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
{
transaction_t *transaction = handle->h_transaction;
- journal_t *journal = transaction->t_journal;
+ journal_t *journal;
+ WARN_ON(!transaction);
if (is_handle_aborted(handle))
- return -EIO;
+ return -EROFS;
+ journal = transaction->t_journal;
jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
transaction->t_tid);