diff options
Diffstat (limited to 'fs/jbd')
| -rw-r--r-- | fs/jbd/checkpoint.c | 23 | ||||
| -rw-r--r-- | fs/jbd/commit.c | 91 | ||||
| -rw-r--r-- | fs/jbd/journal.c | 279 | ||||
| -rw-r--r-- | fs/jbd/recovery.c | 7 | ||||
| -rw-r--r-- | fs/jbd/revoke.c | 12 | ||||
| -rw-r--r-- | fs/jbd/transaction.c | 104 |
6 files changed, 347 insertions, 169 deletions
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 05f0754f2b4..08c03044abd 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -508,20 +508,19 @@ int cleanup_journal_tail(journal_t *journal) /* * We need to make sure that any blocks that were recently written out * --- perhaps by log_do_checkpoint() --- are flushed out before we - * drop the transactions from the journal. It's unlikely this will be - * necessary, especially with an appropriately sized journal, but we - * need this to guarantee correctness. Fortunately - * cleanup_journal_tail() doesn't get called all that often. + * drop the transactions from the journal. Similarly we need to be sure + * superblock makes it to disk before next transaction starts reusing + * freed space (otherwise we could replay some blocks of the new + * transaction thinking they belong to the old one). So we use + * WRITE_FLUSH_FUA. It's unlikely this will be necessary, especially + * with an appropriately sized journal, but we need this to guarantee + * correctness. Fortunately cleanup_journal_tail() doesn't get called + * all that often. */ - if (journal->j_flags & JFS_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + journal_update_sb_log_tail(journal, first_tid, blocknr, + WRITE_FLUSH_FUA); spin_lock(&journal->j_state_lock); - if (!tid_gt(first_tid, journal->j_tail_sequence)) { - spin_unlock(&journal->j_state_lock); - /* Someone else cleaned up journal so return 0 */ - return 0; - } /* OK, update the superblock to recover the freed space. * Physical blocks come first: have we wrapped beyond the end of * the log? 
*/ @@ -539,8 +538,6 @@ int cleanup_journal_tail(journal_t *journal) journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); - if (!(journal->j_flags & JFS_ABORT)) - journal_update_superblock(journal, 1); return 0; } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index f2b9a571f4c..bb217dcb41a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -86,7 +86,12 @@ nope: static void release_data_buffer(struct buffer_head *bh) { if (buffer_freed(bh)) { + WARN_ON_ONCE(buffer_dirty(bh)); clear_buffer_freed(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; release_buffer_page(bh); } else put_bh(bh); @@ -157,8 +162,17 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs, for (i = 0; i < bufs; i++) { wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(write_op, wbuf[i]); + /* + * Here we write back pagecache data that may be mmaped. Since + * we cannot afford to clean the page and set PageWriteback + * here due to lock ordering (page lock ranks above transaction + * start), the data can change while IO is in flight. Tell the + * block layer it should bounce the bio pages if stable data + * during write is required. + * + * We use up our safety reference in submit_bh(). + */ + _submit_bh(write_op, wbuf[i], 1 << BIO_SNAP_STABLE); } } @@ -298,6 +312,7 @@ void journal_commit_transaction(journal_t *journal) int tag_flag; int i; struct blk_plug plug; + int write_op = WRITE; /* * First job: lock down the current transaction and wait for @@ -307,7 +322,16 @@ void journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); - journal_update_superblock(journal, 1); + mutex_lock(&journal->j_checkpoint_mutex); + /* + * We hold j_checkpoint_mutex so tail cannot change under us. 
+ * We don't need any special data guarantees for writing sb + * since journal is empty and it is ok for write to be + * flushed only with transaction commit. + */ + journal_update_sb_log_tail(journal, journal->j_tail_sequence, + journal->j_tail, WRITE_SYNC); + mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); } @@ -316,13 +340,13 @@ void journal_commit_transaction(journal_t *journal) J_ASSERT(journal->j_committing_transaction == NULL); commit_transaction = journal->j_running_transaction; - J_ASSERT(commit_transaction->t_state == T_RUNNING); trace_jbd_start_commit(journal, commit_transaction); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); spin_lock(&journal->j_state_lock); + J_ASSERT(commit_transaction->t_state == T_RUNNING); commit_transaction->t_state = T_LOCKED; trace_jbd_commit_locking(journal, commit_transaction); @@ -413,13 +437,16 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 2\n"); + if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid)) + write_op = WRITE_SYNC; + /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ blk_start_plug(&plug); err = journal_submit_data_buffers(journal, commit_transaction, - WRITE_SYNC); + write_op); blk_finish_plug(&plug); /* @@ -478,7 +505,7 @@ void journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); - journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC); + journal_write_revoke_records(journal, commit_transaction, write_op); /* * If we found any dirty or locked buffers, then we should have @@ -649,7 +676,17 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE_SYNC, bh); + /* + * In data=journal mode, here we can end up + * writing pagecache data that might be + * mmapped. 
Since we can't afford to clean the + * page and set PageWriteback (see the comment + * near the other use of _submit_bh()), the + * data can change while the write is in + * flight. Tell the block layer to bounce the + * bio pages if stable pages are required. + */ + _submit_bh(write_op, bh, 1 << BIO_SNAP_STABLE); } cond_resched(); @@ -853,17 +890,35 @@ restart_loop: * there's no point in keeping a checkpoint record for * it. */ - /* A buffer which has been freed while still being - * journaled by a previous transaction may end up still - * being dirty here, but we want to avoid writing back - * that buffer in the future after the "add to orphan" - * operation been committed, That's not only a performance - * gain, it also stops aliasing problems if the buffer is - * left behind for writeback and gets reallocated for another - * use in a different page. */ - if (buffer_freed(bh) && !jh->b_next_transaction) { - clear_buffer_freed(bh); - clear_buffer_jbddirty(bh); + /* + * A buffer which has been freed while still being journaled by + * a previous transaction. + */ + if (buffer_freed(bh)) { + /* + * If the running transaction is the one containing + * "add to orphan" operation (b_next_transaction != + * NULL), we have to wait for that transaction to + * commit before we can really get rid of the buffer. + * So just clear b_modified to not confuse transaction + * credit accounting and refile the buffer to + * BJ_Forget of the running transaction. If the just + * committed transaction contains "add to orphan" + * operation, we can completely invalidate the buffer + * now. We are rather thorough in that since the + * buffer may be still accessible when blocksize < + * pagesize and it is attached to the last partial + * page. 
+ */ + jh->b_modified = 0; + if (!jh->b_next_transaction) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_req(bh); + bh->b_bdev = NULL; + } } if (buffer_jbddirty(bh)) { diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 59c09f9541b..06fe11e0abf 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -90,6 +90,24 @@ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); static const char *journal_dev_name(journal_t *journal, char *buffer); +#ifdef CONFIG_JBD_DEBUG +void __jbd_debug(int level, const char *file, const char *func, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (level > journal_enable_debug) + return; + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + va_end(args); +} +EXPORT_SYMBOL(__jbd_debug); +#endif + /* * Helper function used to manage commit timeouts */ @@ -129,6 +147,8 @@ static int kjournald(void *arg) setup_timer(&journal->j_commit_timer, commit_timeout, (unsigned long)current); + set_freezable(); + /* Record that the journal thread is running */ journal->j_task = current; wake_up(&journal->j_wait_done_commit); @@ -308,8 +328,6 @@ int journal_write_metadata_buffer(transaction_t *transaction, new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); atomic_set(&new_bh->b_count, 1); new_jh = journal_add_journal_head(new_bh); /* This sleeps */ @@ -328,7 +346,7 @@ repeat: new_offset = offset_in_page(jh2bh(jh_in)->b_data); } - mapped_data = kmap_atomic(new_page, KM_USER0); + mapped_data = kmap_atomic(new_page); /* * Check for escaping */ @@ -337,7 +355,7 @@ repeat: need_copy_out = 1; do_escape = 1; } - kunmap_atomic(mapped_data, KM_USER0); + kunmap_atomic(mapped_data); /* * 
Do we need to do a data copy? @@ -354,9 +372,9 @@ repeat: } jh_in->b_frozen_data = tmp; - mapped_data = kmap_atomic(new_page, KM_USER0); + mapped_data = kmap_atomic(new_page); memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size); - kunmap_atomic(mapped_data, KM_USER0); + kunmap_atomic(mapped_data); new_page = virt_to_page(tmp); new_offset = offset_in_page(tmp); @@ -368,9 +386,9 @@ repeat: * copying, we can finally do so. */ if (do_escape) { - mapped_data = kmap_atomic(new_page, KM_USER0); + mapped_data = kmap_atomic(new_page); *((unsigned int *)(mapped_data + new_offset)) = 0; - kunmap_atomic(mapped_data, KM_USER0); + kunmap_atomic(mapped_data); } set_bh_page(new_bh, new_page, new_offset); @@ -444,7 +462,8 @@ int __log_start_commit(journal_t *journal, tid_t target) * currently running transaction (if it exists). Otherwise, * the target tid must be an old one. */ - if (journal->j_running_transaction && + if (journal->j_commit_request != target && + journal->j_running_transaction && journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the @@ -532,8 +551,8 @@ int journal_start_commit(journal_t *journal, tid_t *ptid) ret = 1; } else if (journal->j_committing_transaction) { /* - * If ext3_write_super() recently started a commit, then we - * have to wait for completion of that transaction + * If commit has been started, then we have to wait for + * completion of that transaction. */ if (ptid) *ptid = journal->j_committing_transaction->t_tid; @@ -554,13 +573,25 @@ int log_wait_commit(journal_t *journal, tid_t tid) #ifdef CONFIG_JBD_DEBUG spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { - printk(KERN_EMERG + printk(KERN_ERR "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } spin_unlock(&journal->j_state_lock); #endif spin_lock(&journal->j_state_lock); + /* + * Not running or committing trans? Must be already committed. 
This + * saves us from waiting for a *long* time when tid overflows. + */ + if (!((journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) || + (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid))) + goto out_unlock; + + if (!tid_geq(journal->j_commit_waited, tid)) + journal->j_commit_waited = tid; while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); @@ -570,12 +601,11 @@ int log_wait_commit(journal_t *journal, tid_t tid) !tid_gt(tid, journal->j_commit_sequence)); spin_lock(&journal->j_state_lock); } +out_unlock: spin_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) { - printk(KERN_EMERG "journal commit I/O error\n"); + if (unlikely(is_journal_aborted(journal))) err = -EIO; - } return err; } @@ -919,8 +949,33 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; - /* Add the dynamic fields and write it to disk. */ - journal_update_superblock(journal, 1); + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + journal->j_flags |= JFS_FLUSHED; + } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. 
+ */ + journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); + mutex_unlock(&journal->j_checkpoint_mutex); + } return journal_start_thread(journal); } @@ -997,35 +1052,15 @@ int journal_create(journal_t *journal) return journal_reset(journal); } -/** - * void journal_update_superblock() - Update journal sb on disk. - * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. - * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. - */ -void journal_update_superblock(journal_t *journal, int wait) +static void journal_write_superblock(journal_t *journal, int write_op) { - journal_superblock_t *sb = journal->j_superblock; struct buffer_head *bh = journal->j_sb_buffer; + int ret; - /* - * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0) and there are no outstanding transactions - * in the filesystem, then we can safely defer the superblock update - * until the next commit by setting JFS_FLUSHED. This avoids - * attempting a write to a potential-readonly device. 
- */ - if (sb->s_start == 0 && journal->j_tail_sequence == - journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, - journal->j_errno); - goto out; - } - + trace_journal_write_superblock(journal, write_op); + if (!(journal->j_flags & JFS_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); if (buffer_write_io_error(bh)) { char b[BDEVNAME_SIZE]; /* @@ -1043,42 +1078,105 @@ void journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: Error %d detected " + "when updating journal superblock for %s.\n", + ret, journal_dev_name(journal, b)); + } +} + +/** + * journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. 
+ */ +void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned int tail_block, int write_op) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", + tail_block, tail_tid); + + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); + + journal_write_superblock(journal, write_op); + + /* Log is no longer empty */ spin_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, journal->j_errno); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); + spin_lock(&journal->j_state_lock); + /* Is it already empty? 
*/ + if (sb->s_start == 0) { + spin_unlock(&journal->j_state_lock); + return; + } + jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - sb->s_errno = cpu_to_be32(journal->j_errno); + sb->s_start = cpu_to_be32(0); spin_unlock(&journal->j_state_lock); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_write_io_error(bh)) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "JBD: I/O error detected " - "when updating journal superblock for %s.\n", - journal_dev_name(journal, b)); - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - } else - write_dirty_buffer(bh, WRITE); + journal_write_superblock(journal, WRITE_FUA); - trace_jbd_update_superblock_end(journal, wait); -out: - /* If we have just flushed the log (by marking s_start==0), then - * any future commit will have to be careful to update the - * superblock again to re-record the true start of the log. */ + spin_lock(&journal->j_state_lock); + /* Log is empty */ + journal->j_flags |= JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. 
+ */ +static void journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; spin_lock(&journal->j_state_lock); - if (sb->s_start) - journal->j_flags &= ~JFS_FLUSHED; - else - journal->j_flags |= JFS_FLUSHED; + jbd_debug(1, "JBD: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal, WRITE_SYNC); } /* @@ -1249,6 +1347,8 @@ int journal_destroy(journal_t *journal) /* Force any old transactions to disk */ + /* We cannot race with anybody but must keep assertions happy */ + mutex_lock(&journal->j_checkpoint_mutex); /* Totally anal locking here... */ spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { @@ -1264,16 +1364,14 @@ int journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - /* We can now mark the journal as empty. */ - journal->j_tail = 0; journal->j_tail_sequence = ++journal->j_transaction_sequence; - journal_update_superblock(journal, 1); - } else { + mark_journal_empty(journal); + } else err = -EIO; - } brelse(journal->j_sb_buffer); } + mutex_unlock(&journal->j_checkpoint_mutex); if (journal->j_inode) iput(journal->j_inode); @@ -1453,7 +1551,6 @@ int journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned int old_tail; spin_lock(&journal->j_state_lock); @@ -1488,6 +1585,7 @@ int journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; + mutex_lock(&journal->j_checkpoint_mutex); cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1495,14 +1593,9 @@ int journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. 
*/ + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_state_lock); - old_tail = journal->j_tail; - journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); - journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); - journal->j_tail = old_tail; - J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); @@ -1542,8 +1635,12 @@ int journal_wipe(journal_t *journal, int write) write ? "Clearing" : "Ignoring"); err = journal_skip_recovery(journal); - if (write) - journal_update_superblock(journal, 1); + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); + mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } no_recovery: return err; @@ -1611,7 +1708,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) __journal_abort_hard(journal); if (errno) - journal_update_superblock(journal, 1); + journal_update_sb_errno(journal); } /** @@ -1773,7 +1870,7 @@ static struct journal_head *journal_alloc_journal_head(void) #ifdef CONFIG_JBD_DEBUG atomic_inc(&nr_journal_heads); #endif - ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); if (ret == NULL) { jbd_debug(1, "out of memory for journal_head\n"); printk_ratelimited(KERN_NOTICE "ENOMEM in %s, retrying.\n", @@ -1781,7 +1878,7 @@ static struct journal_head *journal_alloc_journal_head(void) while (ret == NULL) { yield(); - ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + ret = kmem_cache_zalloc(journal_head_cache, GFP_NOFS); } } return ret; @@ -1843,10 +1940,8 @@ struct journal_head *journal_add_journal_head(struct buffer_head *bh) struct journal_head *new_jh = NULL; repeat: - if (!buffer_jbd(bh)) { + if (!buffer_jbd(bh)) new_jh = journal_alloc_journal_head(); - memset(new_jh, 0, sizeof(*new_jh)); - } jbd_lock_bh_journal_head(bh); if 
(buffer_jbd(bh)) { @@ -2039,7 +2134,7 @@ static void __exit journal_exit(void) #ifdef CONFIG_JBD_DEBUG int n = atomic_read(&nr_journal_heads); if (n) - printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n); #endif jbd_remove_debugfs_entry(); journal_destroy_caches(); diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 008bf062fd2..a748fe21465 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -265,8 +265,11 @@ int journal_recover(journal_t *journal) if (!err) err = err2; /* Flush disk caches to get replayed data on the permanent storage */ - if (journal->j_flags & JFS_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (journal->j_flags & JFS_BARRIER) { + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + if (!err) + err = err2; + } return err; } diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 25c713e7071..8898bbd2b61 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -231,19 +231,15 @@ record_cache_failure: static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) { - int shift = 0; - int tmp = hash_size; + int i; struct jbd_revoke_table_s *table; table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL); if (!table) goto out; - while((tmp >>= 1UL) != 0UL) - shift++; - table->hash_size = hash_size; - table->hash_shift = shift; + table->hash_shift = ilog2(hash_size); table->hash_table = kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { @@ -252,8 +248,8 @@ static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size) goto out; } - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&table->hash_table[tmp]); + for (i = 0; i < hash_size; i++) + INIT_LIST_HEAD(&table->hash_table[i]); out: return table; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 7fce94b04bc..1695ba8334a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -26,7 +26,6 @@ #include <linux/mm.h> 
#include <linux/highmem.h> #include <linux/hrtimer.h> -#include <linux/backing-dev.h> static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -100,10 +99,11 @@ static int start_this_handle(journal_t *journal, handle_t *handle) alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS); + new_transaction = kzalloc(sizeof(*new_transaction), + GFP_NOFS|__GFP_NOFAIL); if (!new_transaction) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto alloc_transaction; + ret = -ENOMEM; + goto out; } } @@ -245,7 +245,6 @@ static handle_t *new_handle(int nblocks) handle_t *handle = jbd_alloc_handle(GFP_NOFS); if (!handle) return NULL; - memset(handle, 0, sizeof(*handle)); handle->h_buffer_credits = nblocks; handle->h_ref = 1; @@ -676,7 +675,7 @@ repeat: jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - printk(KERN_EMERG + printk(KERN_ERR "%s: OOM for frozen_buffer\n", __func__); JBUFFER_TRACE(jh, "oom!"); @@ -718,9 +717,9 @@ done: "Possible IO failure.\n"); page = jh2bh(jh)->b_page; offset = offset_in_page(jh2bh(jh)->b_data); - source = kmap_atomic(page, KM_USER0); + source = kmap_atomic(page); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); - kunmap_atomic(source, KM_USER0); + kunmap_atomic(source); } jbd_unlock_bh_state(bh); @@ -899,7 +898,7 @@ repeat: if (!jh->b_committed_data) { committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { - printk(KERN_EMERG "%s: No memory for committed data\n", + printk(KERN_ERR "%s: No memory for committed data\n", __func__); err = -ENOMEM; goto out; @@ -1259,7 +1258,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) goto not_jbd; } - /* keep track of wether or not this transaction modified us */ + /* keep track of whether or not this transaction modified us */ was_modified = jh->b_modified; /* @@ -1433,8 +1432,6 @@ int journal_stop(handle_t *handle) } } - if (handle->h_sync) - 
transaction->t_synchronous_commit = 1; current->journal_info = NULL; spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); @@ -1845,15 +1842,16 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) * We're outside-transaction here. Either or both of j_running_transaction * and j_committing_transaction may be NULL. */ -static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, + int partial_page) { transaction_t *transaction; struct journal_head *jh; int may_free = 1; - int ret; BUFFER_TRACE(bh, "entry"); +retry: /* * It is safe to proceed here without the j_list_lock because the * buffers cannot be stolen by try_to_free_buffers as long as we are @@ -1881,10 +1879,18 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * clear the buffer dirty bit at latest at the moment when the * transaction marking the buffer as freed in the filesystem * structures is committed because from that moment on the - * buffer can be reallocated and used by a different page. + * block can be reallocated and used by a different page. * Since the block hasn't been freed yet but the inode has * already been added to orphan list, it is safe for us to add * the buffer to BJ_Forget list of the newest transaction. + * + * Also we have to clear buffer_mapped flag of a truncated buffer + * because the buffer_head may be attached to the page straddling + * i_size (can happen only when blocksize < pagesize) and thus the + * buffer_head can be reused when the file is extended again. So we end + * up keeping around invalidated buffers attached to transactions' + * BJ_Forget list just to stop checkpointing code from cleaning up + * the transaction this buffer was modified in. 
*/ transaction = jh->b_transaction; if (transaction == NULL) { @@ -1911,13 +1917,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * committed, the buffer won't be needed any * longer. */ JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_running_transaction); - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* There is no currently-running transaction. So the * orphan record which we wrote for this file must have @@ -1925,13 +1927,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * the committing transaction, if it exists. */ if (journal->j_committing_transaction) { JBUFFER_TRACE(jh, "give to committing trans"); - ret = __dispose_buffer(jh, + may_free = __dispose_buffer(jh, journal->j_committing_transaction); - journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); - return ret; + goto zap_buffer; } else { /* The orphan record's transaction has * committed. We can cleanse this buffer */ @@ -1952,10 +1950,26 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } /* * The buffer is committing, we simply cannot touch - * it. So we just set j_next_transaction to the - * running transaction (if there is one) and mark - * buffer as freed so that commit code knows it should - * clear dirty bits when it is done with the buffer. + * it. If the page is straddling i_size we have to wait + * for commit and try again. 
+ */ + if (partial_page) { + tid_t tid = journal->j_committing_transaction->t_tid; + + journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + unlock_buffer(bh); + log_wait_commit(journal, tid); + lock_buffer(bh); + goto retry; + } + /* + * OK, buffer won't be reachable after truncate. We just set + * j_next_transaction to the running transaction (if there is + * one) and mark buffer as freed so that commit code knows it + * should clear dirty bits when it is done with the buffer. */ set_buffer_freed(bh); if (journal->j_running_transaction && buffer_jbddirty(bh)) @@ -1978,6 +1992,14 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } zap_buffer: + /* + * This is tricky. Although the buffer is truncated, it may be reused + * if blocksize < pagesize and it is attached to the page straddling + * EOF. Since the buffer might have been added to BJ_Forget list of the + * running transaction, journal_get_write_access() won't clear + * b_modified and credit accounting gets confused. So clear b_modified + * here. */ + jh->b_modified = 0; journal_put_journal_head(jh); zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); @@ -1997,16 +2019,20 @@ zap_buffer_unlocked: * void journal_invalidatepage() - invalidate a journal page * @journal: journal to use for flush * @page: page to flush - * @offset: length of page to invalidate. + * @offset: offset of the range to invalidate + * @length: length of the range to invalidate * - * Reap page buffers containing data after offset in page. + * Reap page buffers containing data in specified range in page. 
*/ void journal_invalidatepage(journal_t *journal, struct page *page, - unsigned long offset) + unsigned int offset, + unsigned int length) { struct buffer_head *head, *bh, *next; + unsigned int stop = offset + length; unsigned int curr_off = 0; + int partial_page = (offset || length < PAGE_CACHE_SIZE); int may_free = 1; if (!PageLocked(page)) @@ -2014,6 +2040,8 @@ void journal_invalidatepage(journal_t *journal, if (!page_has_buffers(page)) return; + BUG_ON(stop > PAGE_CACHE_SIZE || stop < length); + /* We will potentially be playing with lists other than just the * data lists (especially for journaled data mode), so be * cautious in our locking. */ @@ -2023,10 +2051,14 @@ void journal_invalidatepage(journal_t *journal, unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; + if (next_off > stop) + return; + if (offset <= curr_off) { /* This block is wholly outside the truncation point */ lock_buffer(bh); - may_free &= journal_unmap_buffer(journal, bh); + may_free &= journal_unmap_buffer(journal, bh, + partial_page); unlock_buffer(bh); } curr_off = next_off; @@ -2034,7 +2066,7 @@ void journal_invalidatepage(journal_t *journal, } while (bh != head); - if (!offset) { + if (!partial_page) { if (may_free && try_to_free_buffers(page)) J_ASSERT(!page_has_buffers(page)); } |
