Diffstat (limited to 'fs/ext3/inode.c')
-rw-r--r--   fs/ext3/inode.c   257
1 files changed, 115 insertions, 142 deletions
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2d0afeca0b4..2c6ccc49ba2 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -22,22 +22,13 @@
  * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
  */
 
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/ext3_jbd.h>
-#include <linux/jbd.h>
 #include <linux/highuid.h>
-#include <linux/pagemap.h>
 #include <linux/quotaops.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/mpage.h>
-#include <linux/uio.h>
-#include <linux/bio.h>
-#include <linux/fiemap.h>
 #include <linux/namei.h>
-#include <trace/events/ext3.h>
+#include <linux/aio.h>
+#include "ext3.h"
 #include "xattr.h"
 #include "acl.h"
@@ -228,7 +219,8 @@ void ext3_evict_inode (struct inode *inode)
          */
         if (inode->i_nlink && ext3_should_journal_data(inode) &&
             EXT3_SB(inode->i_sb)->s_journal &&
-            (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+            (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) &&
+            inode->i_ino != EXT3_JOURNAL_INO) {
                 tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
                 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
@@ -236,7 +228,7 @@ void ext3_evict_inode (struct inode *inode)
                         log_wait_commit(journal, commit_tid);
                 filemap_write_and_wait(&inode->i_data);
         }
-        truncate_inode_pages(&inode->i_data, 0);
+        truncate_inode_pages_final(&inode->i_data);
 
         ext3_discard_reservation(inode);
         rsv = ei->i_block_alloc_info;
@@ -282,18 +274,18 @@ void ext3_evict_inode (struct inode *inode)
         if (ext3_mark_inode_dirty(handle, inode)) {
                 /* If that failed, just dquot_drop() and be done with that */
                 dquot_drop(inode);
-                end_writeback(inode);
+                clear_inode(inode);
         } else {
                 ext3_xattr_delete_inode(handle, inode);
                 dquot_free_inode(inode);
                 dquot_drop(inode);
-                end_writeback(inode);
+                clear_inode(inode);
                 ext3_free_inode(handle, inode);
         }
         ext3_journal_stop(handle);
         return;
 no_delete:
-        end_writeback(inode);
+        clear_inode(inode);
         dquot_drop(inode);
 }
@@ -686,6 +678,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
                  * parent to disk.
                  */
                 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+                if (unlikely(!bh)) {
+                        err = -ENOMEM;
+                        goto failed;
+                }
                 branch[n].bh = bh;
                 lock_buffer(bh);
                 BUFFER_TRACE(bh, "call get_create_access");
@@ -727,7 +723,7 @@ failed:
                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
                 ext3_journal_forget(handle, branch[i].bh);
         }
-        for (i = 0; i <indirect_blks; i++)
+        for (i = 0; i < indirect_blks; i++)
                 ext3_free_blocks(handle, inode, new_blocks[i], 1);
 
         ext3_free_blocks(handle, inode, new_blocks[i], num);
@@ -756,6 +752,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
         struct ext3_block_alloc_info *block_i;
         ext3_fsblk_t current_block;
         struct ext3_inode_info *ei = EXT3_I(inode);
+        struct timespec now;
 
         block_i = ei->i_block_alloc_info;
         /*
@@ -795,9 +792,11 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
         }
 
         /* We are done with atomic stuff, now do the rest of housekeeping */
-
-        inode->i_ctime = CURRENT_TIME_SEC;
-        ext3_mark_inode_dirty(handle, inode);
+        now = CURRENT_TIME_SEC;
+        if (!timespec_equal(&inode->i_ctime, &now) || !where->bh) {
+                inode->i_ctime = now;
+                ext3_mark_inode_dirty(handle, inode);
+        }
 
         /* ext3_mark_inode_dirty already updated i_sync_tid */
         atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
@@ -1078,16 +1077,15 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
          * mapped. 0 in case of a HOLE.
          */
         if (err > 0) {
-                if (err > 1)
-                        WARN_ON(1);
+                WARN_ON(err > 1);
                 err = 0;
         }
         *errp = err;
         if (!err && buffer_mapped(&dummy)) {
                 struct buffer_head *bh;
                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
-                if (!bh) {
-                        *errp = -EIO;
+                if (unlikely(!bh)) {
+                        *errp = -ENOMEM;
                         goto err;
                 }
                 if (buffer_new(&dummy)) {
@@ -1561,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
 }
 
 /*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext3_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
+ * Note that whenever we need to map blocks we start a transaction even if
+ * we're not journalling data. This is to preserve ordering: any hole
+ * instantiation within __block_write_full_page -> ext3_get_block() should be
+ * journalled along with the data so we don't crash and then get metadata which
  * refers to old data.
 *
 * In all journalling modes block_write_full_page() will start the I/O.
 *
- * Problem:
- *
- * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- * ext3_writepage()
- *
- * Similar for:
- *
- * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext3_get_block(). We will deadlock on various things like
- * lock_journal and i_truncate_mutex.
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- * non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- * In journalled data mode, a data buffer may be metadata against the
- * current transaction. But the same file is part of a shared mapping
- * and someone does a writepage() on it.
- *
- * We will move the buffer onto the async_data list, but *after* it has
- * been dirtied. So there's a small window where we have dirty data on
- * BJ_Metadata.
- *
- * Note that this only applies to the last partial page in the file. The
- * bit which block_write_full_page() uses prepare/commit for. (That's
- * broken code anyway: it's wrong for msync()).
- *
- * It's a rare case: affects the final partial page, for journalled data
- * where the file is subject to bith write() and writepage() in the same
- * transction. To fix it we'll need a custom block_write_full_page().
- * We'll probably need that anyway for journalling writepage() output.
- *
 * We don't honour synchronous mounts for writepage(). That would be
 * disastrous. Any write() or metadata operation will sync the fs for
 * us.
- *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
  */
 static int ext3_ordered_writepage(struct page *page,
                                 struct writeback_control *wbc)
@@ -1675,12 +1634,9 @@ static int ext3_ordered_writepage(
          * block_write_full_page() succeeded. Otherwise they are unmapped,
          * and generally junk.
          */
-        if (ret == 0) {
-                err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+        if (ret == 0)
+                ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
                                         NULL, journal_dirty_data_fn);
-                if (!ret)
-                        ret = err;
-        }
         walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
                                 NULL, bput_one);
         err = ext3_journal_stop(handle);
@@ -1760,17 +1716,17 @@ static int ext3_journalled_writepage(struct page *page,
         WARN_ON_ONCE(IS_RDONLY(inode) &&
                      !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
 
-        if (ext3_journal_current_handle())
-                goto no_write;
-
         trace_ext3_journalled_writepage(page);
-        handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
-        if (IS_ERR(handle)) {
-                ret = PTR_ERR(handle);
-                goto no_write;
-        }
-
         if (!page_has_buffers(page) || PageChecked(page)) {
+                if (ext3_journal_current_handle())
+                        goto no_write;
+
+                handle = ext3_journal_start(inode,
+                                        ext3_writepage_trans_blocks(inode));
+                if (IS_ERR(handle)) {
+                        ret = PTR_ERR(handle);
+                        goto no_write;
+                }
                 /*
                  * It's mmapped pagecache. Add buffers and journal it. There
                  * doesn't seem much point in redirtying the page here.
@@ -1793,17 +1749,18 @@ static int ext3_journalled_writepage(struct page *page,
                 atomic_set(&EXT3_I(inode)->i_datasync_tid,
                            handle->h_transaction->t_tid);
                 unlock_page(page);
+                err = ext3_journal_stop(handle);
+                if (!ret)
+                        ret = err;
         } else {
                 /*
-                 * It may be a page full of checkpoint-mode buffers. We don't
-                 * really know unless we go poke around in the buffer_heads.
-                 * But block_write_full_page will do the right thing.
+                 * It is a page full of checkpoint-mode buffers. Go and write
+                 * them. They should have been already mapped when they went
+                 * to the journal so provide NULL get_block function to catch
+                 * errors.
                  */
-                ret = block_write_full_page(page, ext3_get_block, wbc);
+                ret = block_write_full_page(page, NULL, wbc);
         }
-        err = ext3_journal_stop(handle);
-        if (!ret)
-                ret = err;
 out:
         return ret;
@@ -1827,19 +1784,20 @@ ext3_readpages(struct file *file, struct address_space *mapping,
         return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
 }
 
-static void ext3_invalidatepage(struct page *page, unsigned long offset)
+static void ext3_invalidatepage(struct page *page, unsigned int offset,
+                                unsigned int length)
 {
         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
-        trace_ext3_invalidatepage(page, offset);
+        trace_ext3_invalidatepage(page, offset, length);
 
         /*
          * If it's a full truncate we just forget about the pending dirtying
          */
-        if (offset == 0)
+        if (offset == 0 && length == PAGE_CACHE_SIZE)
                 ClearPageChecked(page);
 
-        journal_invalidatepage(journal, page, offset);
+        journal_invalidatepage(journal, page, offset, length);
 }
 
 static int ext3_releasepage(struct page *page, gfp_t wait)
@@ -1863,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
  * VFS code falls back into buffered path in that case so we are safe.
  */
 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
-                        const struct iovec *iov, loff_t offset,
-                        unsigned long nr_segs)
+                        struct iov_iter *iter, loff_t offset)
 {
         struct file *file = iocb->ki_filp;
         struct inode *inode = file->f_mapping->host;
@@ -1872,10 +1829,10 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
         handle_t *handle;
         ssize_t ret;
         int orphan = 0;
-        size_t count = iov_length(iov, nr_segs);
+        size_t count = iov_iter_count(iter);
         int retries = 0;
 
-        trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+        trace_ext3_direct_IO_enter(inode, offset, count, rw);
 
         if (rw == WRITE) {
                 loff_t final_size = offset + count;
@@ -1899,15 +1856,14 @@
         }
 retry:
-        ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
-                                 ext3_get_block);
+        ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);
         /*
          * In case of error extending write may have instantiated a few
          * blocks outside i_size. Trim these off again.
          */
         if (unlikely((rw & WRITE) && ret < 0)) {
                 loff_t isize = i_size_read(inode);
-                loff_t end = offset + iov_length(iov, nr_segs);
+                loff_t end = offset + count;
 
                 if (end > isize)
                         ext3_truncate_failed_direct_write(inode);
@@ -1926,6 +1882,8 @@ retry:
                          * and pretend the write failed... */
                         ext3_truncate_failed_direct_write(inode);
                         ret = PTR_ERR(handle);
+                        if (inode->i_nlink)
+                                ext3_orphan_del(NULL, inode);
                         goto out;
                 }
                 if (inode->i_nlink)
@@ -1950,8 +1908,7 @@ retry:
                         ret = err;
         }
 out:
-        trace_ext3_direct_IO_exit(inode, offset,
-                                  iov_length(iov, nr_segs), rw, ret);
+        trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);
         return ret;
 }
@@ -1986,6 +1943,7 @@ static const struct address_space_operations ext3_ordered_aops = {
         .direct_IO = ext3_direct_IO,
         .migratepage = buffer_migrate_page,
         .is_partially_uptodate = block_is_partially_uptodate,
+        .is_dirty_writeback = buffer_check_dirty_writeback,
         .error_remove_page = generic_error_remove_page,
 };
@@ -2737,12 +2695,12 @@ static int __ext3_get_inode_loc(struct inode *inode,
                 return -EIO;
 
         bh = sb_getblk(inode->i_sb, block);
-        if (!bh) {
+        if (unlikely(!bh)) {
                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
                                 "unable to read inode block - "
                                 "inode=%lu, block="E3FSBLK,
                                 inode->i_ino, block);
-                return -EIO;
+                return -ENOMEM;
         }
         if (!buffer_uptodate(bh)) {
                 lock_buffer(bh);
@@ -2791,7 +2749,7 @@ static int __ext3_get_inode_loc(struct inode *inode,
 
                         bitmap_bh = sb_getblk(inode->i_sb,
                                         le32_to_cpu(desc->bg_inode_bitmap));
-                        if (!bitmap_bh)
+                        if (unlikely(!bitmap_bh))
                                 goto make_io;
 
                         /*
@@ -2898,6 +2856,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
         transaction_t *transaction;
         long ret;
         int block;
+        uid_t i_uid;
+        gid_t i_gid;
 
         inode = iget_locked(sb, ino);
         if (!inode)
@@ -2914,12 +2874,14 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
         bh = iloc.bh;
         raw_inode = ext3_raw_inode(&iloc);
         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
-        inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
-        inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+        i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+        i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
         if(!(test_opt (inode->i_sb, NO_UID32))) {
-                inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
-                inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+                i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+                i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
         }
+        i_uid_write(inode, i_uid);
+        i_gid_write(inode, i_gid);
         set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
         inode->i_size = le32_to_cpu(raw_inode->i_size);
         inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
@@ -3075,6 +3037,10 @@ static int ext3_do_update_inode(handle_t *handle,
         struct ext3_inode_info *ei = EXT3_I(inode);
         struct buffer_head *bh = iloc->bh;
         int err = 0, rc, block;
+        int need_datasync = 0;
+        __le32 disksize;
+        uid_t i_uid;
+        gid_t i_gid;
 
 again:
         /* we can't allow multiple procs in here at once, its a bit racey */
@@ -3087,32 +3053,38 @@ again:
         ext3_get_inode_flags(ei);
         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+        i_uid = i_uid_read(inode);
+        i_gid = i_gid_read(inode);
         if(!(test_opt(inode->i_sb, NO_UID32))) {
-                raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
-                raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+                raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
+                raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
 /*
 * Fix up interoperability with old kernels. Otherwise, old inodes get
 * re-used with the upper 16 bits of the uid/gid intact
 */
                 if(!ei->i_dtime) {
                         raw_inode->i_uid_high =
-                                cpu_to_le16(high_16_bits(inode->i_uid));
+                                cpu_to_le16(high_16_bits(i_uid));
                         raw_inode->i_gid_high =
-                                cpu_to_le16(high_16_bits(inode->i_gid));
+                                cpu_to_le16(high_16_bits(i_gid));
                 } else {
                         raw_inode->i_uid_high = 0;
                         raw_inode->i_gid_high = 0;
                 }
         } else {
                 raw_inode->i_uid_low =
-                        cpu_to_le16(fs_high2lowuid(inode->i_uid));
+                        cpu_to_le16(fs_high2lowuid(i_uid));
                 raw_inode->i_gid_low =
-                        cpu_to_le16(fs_high2lowgid(inode->i_gid));
+                        cpu_to_le16(fs_high2lowgid(i_gid));
                 raw_inode->i_uid_high = 0;
                 raw_inode->i_gid_high = 0;
         }
         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-        raw_inode->i_size = cpu_to_le32(ei->i_disksize);
+        disksize = cpu_to_le32(ei->i_disksize);
+        if (disksize != raw_inode->i_size) {
+                need_datasync = 1;
+                raw_inode->i_size = disksize;
+        }
         raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
         raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
         raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -3128,8 +3100,11 @@ again:
         if (!S_ISREG(inode->i_mode)) {
                 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
         } else {
-                raw_inode->i_size_high =
-                        cpu_to_le32(ei->i_disksize >> 32);
+                disksize = cpu_to_le32(ei->i_disksize >> 32);
+                if (disksize != raw_inode->i_size_high) {
+                        raw_inode->i_size_high = disksize;
+                        need_datasync = 1;
+                }
                 if (ei->i_disksize > 0x7fffffffULL) {
                         struct super_block *sb = inode->i_sb;
                         if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -3182,6 +3157,8 @@ again:
         ext3_clear_inode_state(inode, EXT3_STATE_NEW);
 
         atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
+        if (need_datasync)
+                atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
 out_brelse:
         brelse (bh);
         ext3_std_error(inode->i_sb, err);
@@ -3193,21 +3170,20 @@ out_brelse:
 *
 * We are called from a few places:
 *
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
 *   Here, there will be no transaction running. We wait for any running
- *   trasnaction to commit.
+ *   transaction to commit.
 *
- * - Within sys_sync(), kupdate and such.
- *   We wait on commit, if tol to.
+ * - Within flush work (for sys_sync(), kupdate and such).
+ *   We wait on commit, if told to.
 *
- * - Within prune_icache() (PF_MEMALLOC == true)
- *   Here we simply return. We can't afford to block kswapd on the
- *   journal commit.
+ * - Within iput_final() -> write_inode_now()
+ *   We wait on commit, if told to.
 *
 * In all cases it is actually safe for us to return without doing anything,
 * because the inode has been copied into a raw inode buffer in
-* ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
-* knfsd.
+* ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+* writeback.
 *
 * Note that we are absolutely dependent upon all inode dirtiers doing the
 * right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -3219,13 +3195,13 @@ out_brelse:
 *       stuff();
 *       inode->i_size = expr;
 *
-* is in error because a kswapd-driven write_inode() could occur while
-* `stuff()' is running, and the new i_size will be lost. Plus the inode
-* will no longer be on the superblock's dirty inode list.
+* is in error because write_inode() could occur while `stuff()' is running,
+* and the new i_size will be lost. Plus the inode will no longer be on the
+* superblock's dirty inode list.
 */
 int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-        if (current->flags & PF_MEMALLOC)
+        if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
                 return 0;
 
         if (ext3_journal_current_handle()) {
@@ -3234,7 +3210,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
                 return -EIO;
         }
 
-        if (wbc->sync_mode != WB_SYNC_ALL)
+        /*
+         * No need to force transaction in WB_SYNC_NONE mode. Also
+         * ext3_sync_fs() will force the commit after everything is
+         * written.
+         */
+        if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
                 return 0;
 
         return ext3_force_commit(inode->i_sb);
@@ -3269,8 +3250,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 
         if (is_quota_modification(inode, attr))
                 dquot_initialize(inode);
-        if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-                (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+        if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
+            (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
                 handle_t *handle;
 
                 /* (user+group)*(old+new) structure, inode write (sb,
@@ -3346,7 +3327,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
         mark_inode_dirty(inode);
 
         if (ia_valid & ATTR_MODE)
-                rc = ext3_acl_chmod(inode);
+                rc = posix_acl_chmod(inode, inode->i_mode);
 
 err_out:
         ext3_std_error(inode->i_sb, error);
@@ -3458,14 +3439,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
 * inode out, but prune_icache isn't a user-visible syncing function.
 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
 * we start and wait on commits.
- *
- * Is this efficient/effective? Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O. But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out. One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory. It has the desired
- * effect.
 */
 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
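For context on the direct I/O hunks above: the patch switches ext3's ->direct_IO callback from the (iov, nr_segs) form to the iov_iter form and derives the byte count with iov_iter_count(). The sketch below is not part of the patch; it is a minimal, hypothetical "myfs" example (myfs_get_block is a stand-in get_block_t) showing the same conversion against the mid-3.x in-kernel interfaces that this diff targets.

/*
 * Illustrative sketch only -- "myfs" and myfs_get_block() are hypothetical;
 * the signatures assume the mid-3.x kernel era of the diff above.
 */
#include <linux/fs.h>
#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/buffer_head.h>

static int myfs_get_block(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh_result, int create)
{
        /* A real filesystem would map iblock to an on-disk block here. */
        return -EIO;
}

static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
                              struct iov_iter *iter, loff_t offset)
{
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        /* The iterator carries the byte count that iov_length() used to provide. */
        size_t count = iov_iter_count(iter);
        ssize_t ret;

        ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, myfs_get_block);
        if (unlikely((rw & WRITE) && ret < 0 &&
                     offset + count > i_size_read(inode))) {
                /*
                 * As in the ext3 hunk above: a failed extending write may
                 * have instantiated blocks beyond i_size, which a real
                 * filesystem would trim here.
                 */
        }
        return ret;
}

A filesystem of that era wires such a helper into the ->direct_IO slot of its address_space_operations, just as the ext3_ordered_aops hunk above does for ext3_direct_IO.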
