diff options
Diffstat (limited to 'fs')
55 files changed, 2648 insertions, 1118 deletions
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 54f92379272..475f9c597cb 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -61,8 +61,6 @@ do { \ current->pid, __func__, ##args); \ } while (0) -extern spinlock_t autofs4_lock; - /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. It holds a reference to the dentry, so dentries are never diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 1442da4860e..509fe1eb66a 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -372,6 +372,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, return -EBUSY; } else { struct file *pipe = fget(pipefd); + if (!pipe) { + err = -EBADF; + goto out; + } if (!pipe->f_op || !pipe->f_op->write) { err = -EPIPE; fput(pipe); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index f43100b9662..450f529a4ea 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -87,18 +87,70 @@ done: } /* + * Calculate and dget next entry in the subdirs list under root. + */ +static struct dentry *get_next_positive_subdir(struct dentry *prev, + struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct list_head *next; + struct dentry *p, *q; + + spin_lock(&sbi->lookup_lock); + + if (prev == NULL) { + spin_lock(&root->d_lock); + prev = dget_dlock(root); + next = prev->d_subdirs.next; + p = prev; + goto start; + } + + p = prev; + spin_lock(&p->d_lock); +again: + next = p->d_u.d_child.next; +start: + if (next == &root->d_subdirs) { + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + dput(prev); + return NULL; + } + + q = list_entry(next, struct dentry, d_u.d_child); + + spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); + /* Negative dentry - try next */ + if (!simple_positive(q)) { + spin_unlock(&p->d_lock); + p = q; + goto again; + } + dget_dlock(q); + spin_unlock(&q->d_lock); + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + + dput(prev); + + return q; +} + +/* * Calculate and dget next entry in top down tree traversal. */ static struct dentry *get_next_positive_dentry(struct dentry *prev, struct dentry *root) { + struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); struct list_head *next; struct dentry *p, *ret; if (prev == NULL) return dget(root); - spin_lock(&autofs4_lock); + spin_lock(&sbi->lookup_lock); relock: p = prev; spin_lock(&p->d_lock); @@ -110,7 +162,7 @@ again: if (p == root) { spin_unlock(&p->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); dput(prev); return NULL; } @@ -140,7 +192,7 @@ again: dget_dlock(ret); spin_unlock(&ret->d_lock); spin_unlock(&p->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); dput(prev); @@ -290,11 +342,8 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(root); /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) { - spin_unlock(&sbi->fs_lock); - return NULL; - } - managed_dentry_set_transit(root); + if (ino->flags & AUTOFS_INF_PENDING) + goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { struct autofs_info *ino = autofs4_dentry_ino(root); ino->flags |= AUTOFS_INF_EXPIRING; @@ -302,7 +351,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, spin_unlock(&sbi->fs_lock); return root; } - managed_dentry_clear_transit(root); +out: spin_unlock(&sbi->fs_lock); dput(root); @@ -336,13 +385,12 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, timeout = sbi->exp_timeout; dentry = NULL; - while ((dentry = get_next_positive_dentry(dentry, root))) { + while ((dentry = get_next_positive_subdir(dentry, root))) { spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); /* No point expiring a pending mount */ if (ino->flags & AUTOFS_INF_PENDING) - goto cont; - managed_dentry_set_transit(dentry); + goto next; /* * Case 1: (i) indirect mount or top level pseudo direct mount @@ -402,8 +450,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, } } next: - managed_dentry_clear_transit(dentry); -cont: spin_unlock(&sbi->fs_lock); } return NULL; @@ -415,13 +461,13 @@ found: ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); - spin_lock(&autofs4_lock); + spin_lock(&sbi->lookup_lock); spin_lock(&expired->d_parent->d_lock); spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&expired->d_lock); spin_unlock(&expired->d_parent->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); return expired; } @@ -484,8 +530,6 @@ int autofs4_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); ino->flags &= ~AUTOFS_INF_EXPIRING; - if (!d_unhashed(dentry)) - managed_dentry_clear_transit(dentry); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -513,9 +557,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_EXPIRING; spin_lock(&dentry->d_lock); - if (ret) - __managed_dentry_clear_transit(dentry); - else { + if (!ret) { if ((IS_ROOT(dentry) || (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))) && diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index e6f84d26f4c..96804a17bbd 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -23,8 +23,6 @@ #include "autofs_i.h" -DEFINE_SPINLOCK(autofs4_lock); - static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); @@ -125,15 +123,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * autofs file system so just let the libfs routines handle * it. */ - spin_lock(&autofs4_lock); + spin_lock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); return -ENOENT; } spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); out: return dcache_dir_open(inode, file); @@ -171,7 +169,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->active_list; list_for_each(p, head) { @@ -204,14 +201,12 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) dget_dlock(active); spin_unlock(&active->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&autofs4_lock); return active; } next: spin_unlock(&active->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&autofs4_lock); return NULL; } @@ -226,7 +221,6 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) const unsigned char *str = name->name; struct list_head *p, *head; - spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); head = &sbi->expiring_list; list_for_each(p, head) { @@ -259,14 +253,12 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) dget_dlock(expiring); spin_unlock(&expiring->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&autofs4_lock); return expiring; } next: spin_unlock(&expiring->d_lock); } spin_unlock(&sbi->lookup_lock); - spin_unlock(&autofs4_lock); return NULL; } @@ -275,17 +267,16 @@ static int autofs4_mount_wait(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); - int status; + int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { DPRINTK("waiting for mount name=%.*s", dentry->d_name.len, dentry->d_name.name); status = autofs4_wait(sbi, dentry, NFY_MOUNT); DPRINTK("mount wait done status=%d", status); - ino->last_used = jiffies; - return status; } - return 0; + ino->last_used = jiffies; + return status; } static int do_expire_wait(struct dentry *dentry) @@ -319,9 +310,12 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; struct dentry *new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; dput(path->dentry); path->dentry = new; } @@ -338,18 +332,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path) DPRINTK("dentry=%p %.*s", dentry, dentry->d_name.len, dentry->d_name.name); - /* - * Someone may have manually umounted this or it was a submount - * that has gone away. - */ - spin_lock(&dentry->d_lock); - if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { - if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) && - (dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) - __managed_dentry_set_transit(path->dentry); - } - spin_unlock(&dentry->d_lock); - /* The daemon never triggers a mount. */ if (autofs4_oz_mode(sbi)) return NULL; @@ -418,18 +400,17 @@ static struct vfsmount *autofs4_d_automount(struct path *path) done: if (!(ino->flags & AUTOFS_INF_EXPIRING)) { /* - * Any needed mounting has been completed and the path updated - * so turn this into a normal dentry so we don't continually - * call ->d_automount() and ->d_manage(). - */ - spin_lock(&dentry->d_lock); - __managed_dentry_clear_transit(dentry); - /* + * Any needed mounting has been completed and the path + * updated so clear DCACHE_NEED_AUTOMOUNT so we don't + * call ->d_automount() on rootless multi-mounts since + * it can lead to an incorrect ELOOP error return. + * * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and * symlinks as in all other cases the dentry will be covered by * an actual mount so ->d_automount() won't be called during * the follow. */ + spin_lock(&dentry->d_lock); if ((!d_mountpoint(dentry) && !list_empty(&dentry->d_subdirs)) || (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) @@ -455,6 +436,8 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { + if (rcu_walk) + return 0; if (!d_mountpoint(dentry)) return -EISDIR; return 0; @@ -612,12 +595,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = CURRENT_TIME; - spin_lock(&autofs4_lock); - autofs4_add_expiring(dentry); + spin_lock(&sbi->lookup_lock); + __autofs4_add_expiring(dentry); spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); return 0; } @@ -686,20 +669,17 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) if (!autofs4_oz_mode(sbi)) return -EACCES; - spin_lock(&autofs4_lock); spin_lock(&sbi->lookup_lock); spin_lock(&dentry->d_lock); if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&dentry->d_lock); spin_unlock(&sbi->lookup_lock); - spin_unlock(&autofs4_lock); return -ENOTEMPTY; } __autofs4_add_expiring(dentry); - spin_unlock(&sbi->lookup_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->lookup_lock); if (sbi->version < 5) autofs_clear_leaf_automount_flags(dentry); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 56010056b2e..25435987d6a 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -197,12 +197,12 @@ rename_retry: seq = read_seqbegin(&rename_lock); rcu_read_lock(); - spin_lock(&autofs4_lock); + spin_lock(&sbi->fs_lock); for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) len += tmp->d_name.len + 1; if (!len || --len > NAME_MAX) { - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->fs_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; @@ -218,7 +218,7 @@ rename_retry: p -= tmp->d_name.len; strncpy(p, tmp->d_name.name, tmp->d_name.len); } - spin_unlock(&autofs4_lock); + spin_unlock(&sbi->fs_lock); rcu_read_unlock(); if (read_seqretry(&rename_lock, seq)) goto rename_retry; diff --git a/fs/block_dev.c b/fs/block_dev.c index 7d02afb2b7f..c1511c674f5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -55,11 +55,13 @@ EXPORT_SYMBOL(I_BDEV); static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); inode->i_data.backing_dev_info = dst; if (inode->i_state & I_DIRTY) list_move(&inode->i_wb_list, &dst->wb.b_dirty); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); } static sector_t max_block(struct block_device *bdev) diff --git a/fs/buffer.c b/fs/buffer.c index 2e6b1a387b7..a08bb8e61c6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1138,7 +1138,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, - * mapping->tree_lock and the global inode_lock. + * mapping->tree_lock and mapping->host->i_lock. */ void mark_buffer_dirty(struct buffer_head *bh) { diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 06d27a41807..af56ad56a89 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -61,4 +61,13 @@ void coda_sysctl_clean(void) fs_table_header = NULL; } } + +#else +void coda_sysctl_init(void) +{ +} + +void coda_sysctl_clean(void) +{ +} #endif diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 816f88e6b9c..98b77c89494 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -8,6 +8,7 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> +#include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches; @@ -16,20 +17,23 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) - continue; - if (inode->i_mapping->nrpages == 0) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(toput_inode); } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index adf96b82278..97b970e7dd1 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -21,6 +21,8 @@ #include "ext4_jbd2.h" #include "mballoc.h" +#include <trace/events/ext4.h> + /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -342,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) * We do it here so the bitmap uptodate bit * get set with buffer lock held. */ + trace_ext4_read_block_bitmap_load(sb, block_group); set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index d8b992e658c..e25e99bf7ee 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -202,13 +202,6 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) return 1; } -static inline void ext4_journal_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - if (ext4_handle_valid(handle)) - jbd2_journal_release_buffer(handle, bh); -} - static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) { return ext4_journal_start_sb(inode->i_sb, nblocks); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7516fb9c0bd..dd2cb5076ff 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -44,6 +44,8 @@ #include "ext4_jbd2.h" #include "ext4_extents.h" +#include <trace/events/ext4.h> + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -664,6 +666,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, if (unlikely(!bh)) goto err; if (!bh_uptodate_or_lock(bh)) { + trace_ext4_ext_load_extent(inode, block, + path[ppos].p_block); if (bh_submit_read(bh) < 0) { put_bh(bh); goto err; @@ -1034,7 +1038,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; - ext4_free_blocks(handle, inode, 0, ablocks[i], 1, + ext4_free_blocks(handle, inode, NULL, ablocks[i], 1, EXT4_FREE_BLOCKS_METADATA); } } @@ -2059,7 +2063,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, if (err) return err; ext_debug("index is empty, remove it, free block %llu\n", leaf); - ext4_free_blocks(handle, inode, 0, leaf, 1, + ext4_free_blocks(handle, inode, NULL, leaf, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return err; } @@ -2156,7 +2160,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, num = le32_to_cpu(ex->ee_block) + ee_len - from; start = ext4_ext_pblock(ex) + ee_len - num; ext_debug("free last %u blocks starting %llu\n", num, start); - ext4_free_blocks(handle, inode, 0, start, num, flags); + ext4_free_blocks(handle, inode, NULL, start, num, flags); } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", @@ -3108,14 +3112,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode, { int i, depth; struct ext4_extent_header *eh; - struct ext4_extent *ex, *last_ex; + struct ext4_extent *last_ex; if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) return 0; depth = ext_depth(inode); eh = path[depth].p_hdr; - ex = path[depth].p_ext; if (unlikely(!eh->eh_entries)) { EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " @@ -3295,9 +3298,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; - struct ext4_extent_header *eh; struct ext4_extent newex, *ex; - ext4_fsblk_t newblock; + ext4_fsblk_t newblock = 0; int err = 0, depth, ret; unsigned int allocated = 0; struct ext4_allocation_request ar; @@ -3305,6 +3307,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); + trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* check in cache */ if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { @@ -3352,7 +3355,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, err = -EIO; goto out2; } - eh = path[depth].p_hdr; ex = path[depth].p_ext; if (ex) { @@ -3485,7 +3487,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex), + ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), ext4_ext_get_actual_len(&newex), 0); goto out2; } @@ -3525,6 +3527,8 @@ out2: ext4_ext_drop_refs(path); kfree(path); } + trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, + newblock, map->m_len, err ? err : allocated); return err ? err : allocated; } @@ -3658,6 +3662,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* * We can't just convert len to max_blocks because @@ -3673,6 +3678,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ret = inode_newsize_ok(inode, (len + offset)); if (ret) { mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } retry: @@ -3717,6 +3723,8 @@ retry: goto retry; } mutex_unlock(&inode->i_mutex); + trace_ext4_fallocate_exit(inode, offset, max_blocks, + ret > 0 ? ret2 : ret); return ret > 0 ? ret2 : ret; } @@ -3775,6 +3783,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, } return ret > 0 ? ret2 : ret; } + /* * Callback function called for each extent to gather FIEMAP information. */ @@ -3782,38 +3791,162 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, struct ext4_ext_cache *newex, struct ext4_extent *ex, void *data) { - struct fiemap_extent_info *fieinfo = data; - unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; __u64 logical; __u64 physical; __u64 length; + loff_t size; __u32 flags = 0; - int error; + int ret = 0; + struct fiemap_extent_info *fieinfo = data; + unsigned char blksize_bits; - logical = (__u64)newex->ec_block << blksize_bits; + blksize_bits = inode->i_sb->s_blocksize_bits; + logical = (__u64)newex->ec_block << blksize_bits; if (newex->ec_start == 0) { - pgoff_t offset; - struct page *page; + /* + * No extent in extent-tree contains block @newex->ec_start, + * then the block may stay in 1)a hole or 2)delayed-extent. + * + * Holes or delayed-extents are processed as follows. + * 1. lookup dirty pages with specified range in pagecache. + * If no page is got, then there is no delayed-extent and + * return with EXT_CONTINUE. + * 2. find the 1st mapped buffer, + * 3. check if the mapped buffer is both in the request range + * and a delayed buffer. If not, there is no delayed-extent, + * then return. + * 4. a delayed-extent is found, the extent will be collected. + */ + ext4_lblk_t end = 0; + pgoff_t last_offset; + pgoff_t offset; + pgoff_t index; + struct page **pages = NULL; struct buffer_head *bh = NULL; + struct buffer_head *head = NULL; + unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); + + pages = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; offset = logical >> PAGE_SHIFT; - page = find_get_page(inode->i_mapping, offset); - if (!page || !page_has_buffers(page)) - return EXT_CONTINUE; +repeat: + last_offset = offset; + head = NULL; + ret = find_get_pages_tag(inode->i_mapping, &offset, + PAGECACHE_TAG_DIRTY, nr_pages, pages); + + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* First time, try to find a mapped buffer. */ + if (ret == 0) { +out: + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + /* just a hole. */ + kfree(pages); + return EXT_CONTINUE; + } - bh = page_buffers(page); + /* Try to find the 1st mapped buffer. */ + end = ((__u64)pages[0]->index << PAGE_SHIFT) >> + blksize_bits; + if (!page_has_buffers(pages[0])) + goto out; + head = page_buffers(pages[0]); + if (!head) + goto out; - if (!bh) - return EXT_CONTINUE; + bh = head; + do { + if (buffer_mapped(bh)) { + /* get the 1st mapped buffer. */ + if (end > newex->ec_block + + newex->ec_len) + /* The buffer is out of + * the request range. + */ + goto out; + goto found_mapped_buffer; + } + bh = bh->b_this_page; + end++; + } while (bh != head); - if (buffer_delay(bh)) { - flags |= FIEMAP_EXTENT_DELALLOC; - page_cache_release(page); + /* No mapped buffer found. */ + goto out; } else { - page_cache_release(page); - return EXT_CONTINUE; + /*Find contiguous delayed buffers. */ + if (ret > 0 && pages[0]->index == last_offset) + head = page_buffers(pages[0]); + bh = head; } + +found_mapped_buffer: + if (bh != NULL && buffer_delay(bh)) { + /* 1st or contiguous delayed buffer found. */ + if (!(flags & FIEMAP_EXTENT_DELALLOC)) { + /* + * 1st delayed buffer found, record + * the start of extent. + */ + flags |= FIEMAP_EXTENT_DELALLOC; + newex->ec_block = end; + logical = (__u64)end << blksize_bits; + } + /* Find contiguous delayed buffers. */ + do { + if (!buffer_delay(bh)) + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + + for (index = 1; index < ret; index++) { + if (!page_has_buffers(pages[index])) { + bh = NULL; + break; + } + head = page_buffers(pages[index]); + if (!head) { + bh = NULL; + break; + } + if (pages[index]->index != + pages[0]->index + index) { + /* Blocks are not contiguous. */ + bh = NULL; + break; + } + bh = head; + do { + if (!buffer_delay(bh)) + /* Delayed-extent ends. */ + goto found_delayed_extent; + bh = bh->b_this_page; + end++; + } while (bh != head); + } + } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) + /* a hole found. */ + goto out; + +found_delayed_extent: + newex->ec_len = min(end - newex->ec_block, + (ext4_lblk_t)EXT_INIT_MAX_LEN); + if (ret == nr_pages && bh != NULL && + newex->ec_len < EXT_INIT_MAX_LEN && + buffer_delay(bh)) { + /* Have not collected an extent and continue. */ + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + goto repeat; + } + + for (index = 0; index < ret; index++) + page_cache_release(pages[index]); + kfree(pages); } physical = (__u64)newex->ec_start << blksize_bits; @@ -3822,32 +3955,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, if (ex && ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; - /* - * If this extent reaches EXT_MAX_BLOCK, it must be last. - * - * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK, - * this also indicates no more allocated blocks. - * - * XXX this might miss a single-block extent at EXT_MAX_BLOCK - */ - if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || - newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { - loff_t size = i_size_read(inode); - loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); - + size = i_size_read(inode); + if (logical + length >= size) flags |= FIEMAP_EXTENT_LAST; - if ((flags & FIEMAP_EXTENT_DELALLOC) && - logical+length > size) - length = (size - logical + bs - 1) & ~(bs-1); - } - error = fiemap_fill_next_extent(fieinfo, logical, physical, + ret = fiemap_fill_next_extent(fieinfo, logical, physical, length, flags); - if (error < 0) - return error; - if (error == 1) + if (ret < 0) + return ret; + if (ret == 1) return EXT_BREAK; - return EXT_CONTINUE; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 7829b287822..7f74019d6d7 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -164,20 +164,20 @@ int ext4_sync_file(struct file *file, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); - trace_ext4_sync_file(file, datasync); + trace_ext4_sync_file_enter(file, datasync); if (inode->i_sb->s_flags & MS_RDONLY) return 0; ret = ext4_flush_completed_IO(inode); if (ret < 0) - return ret; + goto out; if (!journal) { ret = generic_file_fsync(file, datasync); if (!ret && !list_empty(&inode->i_dentry)) ext4_sync_parent(inode); - return ret; + goto out; } /* @@ -194,8 +194,10 @@ int ext4_sync_file(struct file *file, int datasync) * (they were dirtied by commit). But that's OK - the blocks are * safe in-journal, which is all fsync() needs to ensure. */ - if (ext4_should_journal_data(inode)) - return ext4_force_commit(inode->i_sb); + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + goto out; + } commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; if (jbd2_log_start_commit(journal, commit_tid)) { @@ -215,5 +217,7 @@ int ext4_sync_file(struct file *file, int datasync) ret = jbd2_log_wait_commit(journal, commit_tid); } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + out: + trace_ext4_sync_file_exit(inode, ret); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 78b79e1bd7e..21bb2f61e50 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -152,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) * We do it here so the bitmap uptodate bit * get set with buffer lock held. */ + trace_ext4_load_inode_bitmap(sb, block_group); set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); @@ -649,7 +650,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, *group = parent_group + flex_size; if (*group > ngroups) *group = 0; - return find_group_orlov(sb, parent, group, mode, 0); + return find_group_orlov(sb, parent, group, mode, NULL); } /* @@ -1054,6 +1055,11 @@ got: } } + if (ext4_handle_valid(handle)) { + ei->i_sync_tid = handle->h_transaction->t_tid; + ei->i_datasync_tid = handle->h_transaction->t_tid; + } + err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9297ad46c46..1a86282b902 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -173,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, BUG_ON(EXT4_JOURNAL(inode) == NULL); jbd_debug(2, "restarting handle %p\n", handle); up_write(&EXT4_I(inode)->i_data_sem); - ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); + ret = ext4_journal_restart(handle, nblocks); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); @@ -720,7 +720,7 @@ allocated: return ret; failed_out: for (i = 0; i < index; i++) - ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); return ret; } @@ -823,20 +823,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, return err; failed: /* Allocation failed, free what we already allocated */ - ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); + ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); for (i = 1; i <= n ; i++) { /* * branch[i].bh is newly allocated, so there is no * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. */ - ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, EXT4_FREE_BLOCKS_FORGET); } for (i = n+1; i < indirect_blks; i++) - ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); - ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); + ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); return err; } @@ -924,7 +924,7 @@ err_out: ext4_free_blocks(handle, inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } - ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), blks, 0); return err; @@ -973,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, int count = 0; ext4_fsblk_t first_block = 0; + trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, map->m_lblk, offsets, @@ -1058,6 +1059,8 @@ cleanup: partial--; } out: + trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, + map->m_pblk, map->m_len, err); return err; } @@ -2060,7 +2063,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { - int commit_write = 0, redirty_page = 0; + int commit_write = 0, skip_page = 0; struct page *page = pvec.pages[i]; index = page->index; @@ -2086,14 +2089,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, * If the page does not have buffers (for * whatever reason), try to create them using * __block_write_begin. If this fails, - * redirty the page and move on. + * skip the page and move on. */ if (!page_has_buffers(page)) { if (__block_write_begin(page, 0, len, noalloc_get_block_write)) { - redirty_page: - redirty_page_for_writepage(mpd->wbc, - page); + skip_page: unlock_page(page); continue; } @@ -2104,7 +2105,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, block_start = 0; do { if (!bh) - goto redirty_page; + goto skip_page; if (map && (cur_logical >= map->m_lblk) && (cur_logical <= (map->m_lblk + (map->m_len - 1)))) { @@ -2120,22 +2121,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_unwritten(bh); } - /* redirty page if block allocation undone */ + /* skip page if block allocation undone */ if (buffer_delay(bh) || buffer_unwritten(bh)) - redirty_page = 1; + skip_page = 1; bh = bh->b_this_page; block_start += bh->b_size; cur_logical++; pblock++; } while (bh != page_bufs); - if (redirty_page) - goto redirty_page; + if (skip_page) + goto skip_page; if (commit_write) /* mark the buffer_heads as dirty & uptodate */ block_commit_write(page, 0, len); + clear_page_dirty_for_io(page); /* * Delalloc doesn't support data journalling, * but eventually maybe we'll lift this @@ -2165,8 +2167,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, return ret; } -static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, - sector_t logical, long blk_cnt) +static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) { int nr_pages, i; pgoff_t index, end; @@ -2174,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; - index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (logical + blk_cnt - 1) >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); + index = mpd->first_page; + end = mpd->next_page - 1; while (index <= end) { nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) @@ -2279,9 +2279,8 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) err = blks; /* * If get block returns EAGAIN or ENOSPC and there - * appears to be free blocks we will call - * ext4_writepage() for all of the pages which will - * just redirty the pages. + * appears to be free blocks we will just let + * mpage_da_submit_io() unlock all of the pages. */ if (err == -EAGAIN) goto submit_io; @@ -2312,8 +2311,10 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) ext4_print_free_blocks(mpd->inode); } /* invalidate all the pages */ - ext4_da_block_invalidatepages(mpd, next, - mpd->b_size >> mpd->inode->i_blkbits); + ext4_da_block_invalidatepages(mpd); + + /* Mark this page range as having been completed */ + mpd->io_done = 1; return; } BUG_ON(blks == 0); @@ -2438,102 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) } /* - * __mpage_da_writepage - finds extent of pages and blocks - * - * @page: page to consider - * @wbc: not used, we just follow rules - * @data: context - * - * The function finds extents of pages and scan them for all blocks. - */ -static int __mpage_da_writepage(struct page *page, - struct writeback_control *wbc, - struct mpage_da_data *mpd) -{ - struct inode *inode = mpd->inode; - struct buffer_head *bh, *head; - sector_t logical; - - /* - * Can we merge this page to current extent? - */ - if (mpd->next_page != page->index) { - /* - * Nope, we can't. So, we map non-allocated blocks - * and start IO on them - */ - if (mpd->next_page != mpd->first_page) { - mpage_da_map_and_submit(mpd); - /* - * skip rest of the page in the page_vec - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return MPAGE_DA_EXTENT_TAIL; - } - - /* - * Start next extent of pages ... - */ - mpd->first_page = page->index; - - /* - * ... and blocks - */ - mpd->b_size = 0; - mpd->b_state = 0; - mpd->b_blocknr = 0; - } - - mpd->next_page = page->index + 1; - logical = (sector_t) page->index << - (PAGE_CACHE_SHIFT - inode->i_blkbits); - - if (!page_has_buffers(page)) { - mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE, - (1 << BH_Dirty) | (1 << BH_Uptodate)); - if (mpd->io_done) - return MPAGE_DA_EXTENT_TAIL; - } else { - /* - * Page with regular buffer heads, just add all dirty ones - */ - head = page_buffers(page); - bh = head; - do { - BUG_ON(buffer_locked(bh)); - /* - * We need to try to allocate - * unmapped blocks in the same page. - * Otherwise we won't make progress - * with the page in ext4_writepage - */ - if (ext4_bh_delay_or_unwritten(NULL, bh)) { - mpage_add_bh_to_extent(mpd, logical, - bh->b_size, - bh->b_state); - if (mpd->io_done) - return MPAGE_DA_EXTENT_TAIL; - } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { - /* - * mapped dirty buffer. We need to update - * the b_state because we look at - * b_state in mpage_da_map_blocks. We don't - * update b_size because if we find an - * unmapped buffer_head later we need to - * use the b_state flag of that buffer_head. - */ - if (mpd->b_size == 0) - mpd->b_state = bh->b_state & BH_FLAGS; - } - logical++; - } while ((bh = bh->b_this_page) != head); - } - - return 0; -} - -/* * This is a special get_blocks_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. @@ -2597,7 +2502,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * for partial write. */ set_buffer_new(bh); - set_buffer_mapped(bh); } return 0; } @@ -2811,27 +2715,27 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) /* * write_cache_pages_da - walk the list of dirty pages of the given - * address space and call the callback function (which usually writes - * the pages). - * - * This is a forked version of write_cache_pages(). Differences: - * Range cyclic is ignored. - * no_nrwrite_index_update is always presumed true + * address space and accumulate pages that need writing, and call + * mpage_da_map_and_submit to map a single contiguous memory region + * and then write them. */ static int write_cache_pages_da(struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd, pgoff_t *done_index) { - int ret = 0; - int done = 0; - struct pagevec pvec; - unsigned nr_pages; - pgoff_t index; - pgoff_t end; /* Inclusive */ - long nr_to_write = wbc->nr_to_write; - int tag; - + struct buffer_head *bh, *head; + struct inode *inode = mapping->host; + struct pagevec pvec; + unsigned int nr_pages; + sector_t logical; + pgoff_t index, end; + long nr_to_write = wbc->nr_to_write; + int i, tag, ret = 0; + + memset(mpd, 0, sizeof(struct mpage_da_data)); + mpd->wbc = wbc; + mpd->inode = inode; pagevec_init(&pvec, 0); index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; @@ -2842,13 +2746,11 @@ static int write_cache_pages_da(struct address_space *mapping, tag = PAGECACHE_TAG_DIRTY; *done_index = index; - while (!done && (index <= end)) { - int i; - + while (index <= end) { nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) - break; + return 0; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -2860,60 +2762,100 @@ static int write_cache_pages_da(struct address_space *mapping, * mapping. However, page->index will not change * because we have a reference on the page. */ - if (page->index > end) { - done = 1; - break; - } + if (page->index > end) + goto out; *done_index = page->index + 1; + /* + * If we can't merge this page, and we have + * accumulated an contiguous region, write it + */ + if ((mpd->next_page != page->index) && + (mpd->next_page != mpd->first_page)) { + mpage_da_map_and_submit(mpd); + goto ret_extent_tail; + } + lock_page(page); /* - * Page truncated or invalidated. We can freely skip it - * then, even for data integrity operations: the page - * has disappeared concurrently, so there could be no - * real expectation of this data interity operation - * even if there is now a new, dirty page at the same - * pagecache address. + * If the page is no longer dirty, or its + * mapping no longer corresponds to inode we + * are writing (which means it has been + * truncated or invalidated), or the page is + * already under writeback and we are not + * doing a data integrity writeback, skip the page */ - if (unlikely(page->mapping != mapping)) { -continue_unlock: + if (!PageDirty(page) || + (PageWriteback(page) && + (wbc->sync_mode == WB_SYNC_NONE)) || + unlikely(page->mapping != mapping)) { unlock_page(page); continue; } - if (!PageDirty(page)) { - /* someone wrote it for us */ - goto continue_unlock; - } - - if (PageWriteback(page)) { - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - else - goto continue_unlock; - } + if (PageWriteback(page)) + wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); - if (!clear_page_dirty_for_io(page)) - goto continue_unlock; - ret = __mpage_da_writepage(page, wbc, mpd); - if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - done = 1; - break; - } + if (mpd->next_page != page->index) + mpd->first_page = page->index; + mpd->next_page = page->index + 1; + logical = (sector_t) page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + + if (!page_has_buffers(page)) { + mpage_add_bh_to_extent(mpd, logical, + PAGE_CACHE_SIZE, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + if (mpd->io_done) + goto ret_extent_tail; + } else { + /* + * Page with regular buffer heads, + * just add all dirty ones + */ + head = page_buffers(page); + bh = head; + do { + BUG_ON(buffer_locked(bh)); + /* + * We need to try to allocate + * unmapped blocks in the same page. + * Otherwise we won't make progress + * with the page in ext4_writepage + */ + if (ext4_bh_delay_or_unwritten(NULL, bh)) { + mpage_add_bh_to_extent(mpd, logical, + bh->b_size, + bh->b_state); + if (mpd->io_done) + goto ret_extent_tail; + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { + /* + * mapped dirty buffer. We need + * to update the b_state + * because we look at b_state + * in mpage_da_map_blocks. We + * don't update b_size because + * if we find an unmapped + * buffer_head later we need to + * use the b_state flag of that + * buffer_head. + */ + if (mpd->b_size == 0) + mpd->b_state = bh->b_state & BH_FLAGS; + } + logical++; + } while ((bh = bh->b_this_page) != head); } if (nr_to_write > 0) { nr_to_write--; if (nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) { + wbc->sync_mode == WB_SYNC_NONE) /* * We stop writing back only if we are * not doing integrity sync. In case of @@ -2924,14 +2866,18 @@ continue_unlock: * pages, but have not synced all of the * old dirty pages. */ - done = 1; - break; - } + goto out; } } pagevec_release(&pvec); cond_resched(); } + return 0; +ret_extent_tail: + ret = MPAGE_DA_EXTENT_TAIL; +out: + pagevec_release(&pvec); + cond_resched(); return ret; } @@ -2945,7 +2891,6 @@ static int ext4_da_writepages(struct address_space *mapping, struct mpage_da_data mpd; struct inode *inode = mapping->host; int pages_written = 0; - long pages_skipped; unsigned int max_pages; int range_cyclic, cycled = 1, io_done = 0; int needed_blocks, ret = 0; @@ -3028,11 +2973,6 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->nr_to_write = desired_nr_to_write; } - mpd.wbc = wbc; - mpd.inode = mapping->host; - - pages_skipped = wbc->pages_skipped; - retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); @@ -3059,22 +2999,10 @@ retry: } /* - * Now call __mpage_da_writepage to find the next + * Now call write_cache_pages_da() to find the next * contiguous region of logical blocks that need - * blocks to be allocated by ext4. We don't actually - * submit the blocks for I/O here, even though - * write_cache_pages thinks it will, and will set the - * pages as clean for write before calling - * __mpage_da_writepage(). + * blocks to be allocated by ext4 and submit them. */ - mpd.b_size = 0; - mpd.b_state = 0; - mpd.b_blocknr = 0; - mpd.first_page = 0; - mpd.next_page = 0; - mpd.io_done = 0; - mpd.pages_written = 0; - mpd.retval = 0; ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); /* * If we have a contiguous extent of pages and we @@ -3096,7 +3024,6 @@ retry: * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); - wbc->pages_skipped = pages_skipped; ret = 0; } else if (ret == MPAGE_DA_EXTENT_TAIL) { /* @@ -3104,7 +3031,6 @@ retry: * rest of the pages */ pages_written += mpd.pages_written; - wbc->pages_skipped = pages_skipped; ret = 0; io_done = 1; } else if (wbc->nr_to_write) @@ -3122,11 +3048,6 @@ retry: wbc->range_end = mapping->writeback_index - 1; goto retry; } - if (pages_skipped != wbc->pages_skipped) - ext4_msg(inode->i_sb, KERN_CRIT, - "This should not happen leaving %s " - "with nr_to_write = %ld ret = %d", - __func__, wbc->nr_to_write, ret); /* Update index */ wbc->range_cyclic = range_cyclic; @@ -3460,6 +3381,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) static int ext4_readpage(struct file *file, struct page *page) { + trace_ext4_readpage(page); return mpage_readpage(page, ext4_get_block); } @@ -3494,6 +3416,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); + trace_ext4_invalidatepage(page, offset); + /* * free any io_end structure allocated for buffers to be discarded */ @@ -3515,6 +3439,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) { journal_t *journal = EXT4_JOURNAL(page->mapping->host); + trace_ext4_releasepage(page); + WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; @@ -3873,11 +3799,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + ssize_t ret; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); - - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + else + ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + trace_ext4_direct_IO_exit(inode, offset, + iov_length(iov, nr_segs), rw, ret); + return ret; } /* @@ -4173,6 +4104,9 @@ no_top: * * We release `count' blocks on disk, but (last - first) may be greater * than `count' because there can be holes in there. + * + * Return 0 on success, 1 on invalid block range + * and < 0 on fatal error. */ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, @@ -4199,33 +4133,32 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, if (bh) { BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); - if (unlikely(err)) { - ext4_std_error(inode->i_sb, err); - return 1; - } + if (unlikely(err)) + goto out_err; } err = ext4_mark_inode_dirty(handle, inode); - if (unlikely(err)) { - ext4_std_error(inode->i_sb, err); - return 1; - } + if (unlikely(err)) + goto out_err; err = ext4_truncate_restart_trans(handle, inode, blocks_for_truncate(inode)); - if (unlikely(err)) { - ext4_std_error(inode->i_sb, err); - return 1; - } + if (unlikely(err)) + goto out_err; if (bh) { BUFFER_TRACE(bh, "retaking write access"); - ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out_err; } } for (p = first; p < last; p++) *p = 0; - ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); + ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); return 0; +out_err: + ext4_std_error(inode->i_sb, err); + return err; } /** @@ -4259,7 +4192,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, ext4_fsblk_t nr; /* Current block # */ __le32 *p; /* Pointer into inode/ind for current block */ - int err; + int err = 0; if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); @@ -4281,9 +4214,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, } else if (nr == block_to_free + count) { count++; } else { - if (ext4_clear_blocks(handle, inode, this_bh, - block_to_free, count, - block_to_free_p, p)) + err = ext4_clear_blocks(handle, inode, this_bh, + block_to_free, count, + block_to_free_p, p); + if (err) break; block_to_free = nr; block_to_free_p = p; @@ -4292,9 +4226,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, } } - if (count > 0) - ext4_clear_blocks(handle, inode, this_bh, block_to_free, - count, block_to_free_p, p); + if (!err && count > 0) + err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + if (err < 0) + /* fatal error */ + return; if (this_bh) { BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); @@ -4412,7 +4349,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * transaction where the data blocks are * actually freed. */ - ext4_free_blocks(handle, inode, 0, nr, 1, + ext4_free_blocks(handle, inode, NULL, nr, 1, EXT4_FREE_BLOCKS_METADATA| EXT4_FREE_BLOCKS_FORGET); @@ -4496,6 +4433,8 @@ void ext4_truncate(struct inode *inode) ext4_lblk_t last_block; unsigned blocksize = inode->i_sb->s_blocksize; + trace_ext4_truncate_enter(inode); + if (!ext4_can_truncate(inode)) return; @@ -4506,6 +4445,7 @@ void ext4_truncate(struct inode *inode) if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ext4_ext_truncate(inode); + trace_ext4_truncate_exit(inode); return; } @@ -4635,6 +4575,7 @@ out_stop: ext4_orphan_del(handle, inode); ext4_journal_stop(handle); + trace_ext4_truncate_exit(inode); } /* @@ -4766,6 +4707,7 @@ make_io: * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ + trace_ext4_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ_META, bh); @@ -4871,7 +4813,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); - iloc.bh = 0; + iloc.bh = NULL; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a84faa110bc..808c554e773 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -334,16 +334,22 @@ mext_out: case FITRIM: { struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + if (copy_from_user(&range, (struct fstrim_range *)arg, sizeof(range))) return -EFAULT; + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; @@ -421,6 +427,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } case EXT4_IOC_MOVE_EXT: + case FITRIM: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d1fe09aea73..a5837a837a8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -432,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) } /* at order 0 we see each particular block */ - *max = 1 << (e4b->bd_blkbits + 3); - if (order == 0) + if (order == 0) { + *max = 1 << (e4b->bd_blkbits + 3); return EXT4_MB_BITMAP(e4b); + } bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; @@ -616,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); - buddy = mb_find_buddy(e4b, 0, &max); list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; @@ -635,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, #define mb_check_buddy(e4b) #endif -/* FIXME!! need more doc */ +/* + * Divide blocks started from @first with length @len into + * smaller chunks with power of 2 blocks. + * Clear the bits in bitmap which the blocks of the chunk(s) covered, + * then increase bb_counters[] for corresponded chunk size. + */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) @@ -2381,7 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb) /* An 8TB filesystem with 64-bit pointers requires a 4096 byte * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. * So a two level scheme suffices for now. */ - sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); + sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); if (sbi->s_group_info == NULL) { printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); return -ENOMEM; @@ -3208,7 +3213,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, cur_distance = abs(goal_block - cpa->pa_pstart); new_distance = abs(goal_block - pa->pa_pstart); - if (cur_distance < new_distance) + if (cur_distance <= new_distance) return cpa; /* drop the previous reference */ @@ -3907,7 +3912,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; - if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + if (!mb_enable_debug || + (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; printk(KERN_ERR "EXT4-fs: Can't allocate:" @@ -4753,7 +4759,8 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count, * bitmap. Then issue a TRIM command on this extent and free the extent in * the group buddy bitmap. This is done until whole group is scanned. */ -ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, +static ext4_grpblk_t +ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { void *bitmap; @@ -4863,10 +4870,15 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) break; } - if (len >= EXT4_BLOCKS_PER_GROUP(sb)) - len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); - else + /* + * For all the groups except the last one, last block will + * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to + * change it for the last group in which case start + + * len < EXT4_BLOCKS_PER_GROUP(sb). + */ + if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) last_block = first_block + len; + len -= last_block - first_block; if (e4b.bd_info->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, &e4b, first_block, diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b619322c76f..22bd4d7f289 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -169,7 +169,7 @@ struct ext4_allocation_context { /* original request */ struct ext4_free_extent ac_o_ex; - /* goal request (after normalization) */ + /* goal request (normalized ac_o_ex) */ struct ext4_free_extent ac_g_ex; /* the best found extent */ diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b0a126f23c2..d1bafa57f48 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -263,7 +263,7 @@ static int free_dind_blocks(handle_t *handle, for (i = 0; i < max_entries; i++) { if (tmp_idata[i]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, 0, + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(tmp_idata[i]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -271,7 +271,7 @@ static int free_dind_blocks(handle_t *handle, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; @@ -302,7 +302,7 @@ static int free_tind_blocks(handle_t *handle, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return 0; @@ -315,7 +315,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) /* ei->i_data[EXT4_IND_BLOCK] */ if (i_data[0]) { extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, 0, + ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data[0]), 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); @@ -428,7 +428,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, } put_bh(bh); extend_credit_for_blkdel(handle, inode); - ext4_free_blocks(handle, inode, 0, block, 1, + ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); return retval; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e781b7ea563..67fd0b02585 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -40,6 +40,7 @@ #include "xattr.h" #include "acl.h" +#include <trace/events/ext4.h> /* * define how far ahead to read directories while searching them. */ @@ -2183,6 +2184,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle; + trace_ext4_unlink_enter(dir, dentry); /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); @@ -2228,6 +2230,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) end_unlink: ext4_journal_stop(handle); brelse(bh); + trace_ext4_unlink_exit(dentry, retval); return retval; } @@ -2402,6 +2405,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_inode && new_dir != old_dir && EXT4_DIR_LINK_MAX(new_dir)) goto end_rename; + BUFFER_TRACE(dir_bh, "get_write_access"); + retval = ext4_journal_get_write_access(handle, dir_bh); + if (retval) + goto end_rename; } if (!new_bh) { retval = ext4_add_entry(handle, new_dentry, old_inode); @@ -2409,7 +2416,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } else { BUFFER_TRACE(new_bh, "get write access"); - ext4_journal_get_write_access(handle, new_bh); + retval = ext4_journal_get_write_access(handle, new_bh); + if (retval) + goto end_rename; new_de->inode = cpu_to_le32(old_inode->i_ino); if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) @@ -2470,8 +2479,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); ext4_update_dx_flag(old_dir); if (dir_bh) { - BUFFER_TRACE(dir_bh, "get_write_access"); - ext4_journal_get_write_access(handle, dir_bh); PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index e2cd90e4bb7..b6dbd056fcb 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -259,6 +259,11 @@ static void ext4_end_bio(struct bio *bio, int error) bi_sector >> (inode->i_blkbits - 9)); } + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + ext4_free_io_end(io_end); + return; + } + /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); @@ -279,9 +284,9 @@ void ext4_io_submit(struct ext4_io_submit *io) BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); bio_put(io->io_bio); } - io->io_bio = 0; + io->io_bio = NULL; io->io_op = 0; - io->io_end = 0; + io->io_end = NULL; } static int io_submit_init(struct ext4_io_submit *io, @@ -380,8 +385,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - set_page_writeback(page); - ClearPageError(page); io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); if (!io_page) { @@ -392,6 +395,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, io_page->p_page = page; atomic_set(&io_page->p_count, 1); get_page(page); + set_page_writeback(page); + ClearPageError(page); for (bh = head = page_buffers(page), block_start = 0; bh != head || !block_start; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3ecc6e45d2f..80bbc9c60c2 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -230,7 +230,7 @@ static int setup_new_group_blocks(struct super_block *sb, } /* Zero out all of the reserved backup group descriptor table blocks */ - ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", block, sbi->s_itb_per_group); err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, GFP_NOFS); @@ -248,7 +248,7 @@ static int setup_new_group_blocks(struct super_block *sb, /* Zero out all of the inode table blocks */ block = input->inode_table; - ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", + ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", block, sbi->s_itb_per_group); err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) @@ -499,12 +499,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return err; exit_inode: - /* ext4_journal_release_buffer(handle, iloc.bh); */ + /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: - /* ext4_journal_release_buffer(handle, dind); */ + /* ext4_handle_release_buffer(handle, dind); */ exit_sbh: - /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ + /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -586,7 +586,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, /* int j; for (j = 0; j < i; j++) - ext4_journal_release_buffer(handle, primary[j]); + ext4_handle_release_buffer(handle, primary[j]); */ goto exit_bh; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 203f9e4a70b..22546ad7f0a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -54,9 +54,9 @@ static struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; -struct ext4_lazy_init *ext4_li_info; -struct mutex ext4_li_mtx; -struct ext4_features *ext4_feat; +static struct ext4_lazy_init *ext4_li_info; +static struct mutex ext4_li_mtx; +static struct ext4_features *ext4_feat; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); @@ -75,6 +75,7 @@ static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); +static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); @@ -594,7 +595,7 @@ __acquires(bitlock) vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) printk(KERN_CONT "inode %lu: ", ino); @@ -997,13 +998,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, OLDALLOC)) seq_puts(seq, ",oldalloc"); #ifdef CONFIG_EXT4_FS_XATTR - if (test_opt(sb, XATTR_USER) && - !(def_mount_opts & EXT4_DEFM_XATTR_USER)) + if (test_opt(sb, XATTR_USER)) seq_puts(seq, ",user_xattr"); - if (!test_opt(sb, XATTR_USER) && - (def_mount_opts & EXT4_DEFM_XATTR_USER)) { + if (!test_opt(sb, XATTR_USER)) seq_puts(seq, ",nouser_xattr"); - } #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) @@ -1041,8 +1039,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) !(def_mount_opts & EXT4_DEFM_NODELALLOC)) seq_puts(seq, ",nodelalloc"); - if (test_opt(sb, MBLK_IO_SUBMIT)) - seq_puts(seq, ",mblk_io_submit"); + if (!test_opt(sb, MBLK_IO_SUBMIT)) + seq_puts(seq, ",nomblk_io_submit"); if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); /* @@ -1451,7 +1449,7 @@ static int parse_options(char *options, struct super_block *sb, * Initialize args struct so we know whether arg was * found; some options take optional arguments. */ - args[0].to = args[0].from = 0; + args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: @@ -1771,7 +1769,7 @@ set_qf_format: return 0; if (option < 0 || option > (1 << 30)) return 0; - if (!is_power_of_2(option)) { + if (option && !is_power_of_2(option)) { ext4_msg(sb, KERN_ERR, "EXT4-fs: inode_readahead_blks" " must be a power of 2"); @@ -2120,6 +2118,13 @@ static void ext4_orphan_cleanup(struct super_block *sb, return; } + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { if (es->s_last_orphan) jbd_debug(1, "Errors on filesystem, " @@ -2412,7 +2417,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, if (parse_strtoul(buf, 0x40000000, &t)) return -EINVAL; - if (!is_power_of_2(t)) + if (t && !is_power_of_2(t)) return -EINVAL; sbi->s_inode_readahead_blks = t; @@ -3095,14 +3100,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); + /* xattr user namespace & acls are now defaulted on */ #ifdef CONFIG_EXT4_FS_XATTR - if (def_mount_opts & EXT4_DEFM_XATTR_USER) - set_opt(sb, XATTR_USER); + set_opt(sb, XATTR_USER); #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL - if (def_mount_opts & EXT4_DEFM_ACL) - set_opt(sb, POSIX_ACL); + set_opt(sb, POSIX_ACL); #endif + set_opt(sb, MBLK_IO_SUBMIT); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) @@ -3516,7 +3521,7 @@ no_journal: * concurrency isn't really necessary. Limit it to 1. */ EXT4_SB(sb)->dio_unwritten_wq = - alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM, 1); + alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); goto failed_mount_wq; @@ -3531,17 +3536,16 @@ no_journal: if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); ret = PTR_ERR(root); + root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { - iput(root); ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); goto failed_mount4; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); - iput(root); ret = -ENOMEM; goto failed_mount4; } @@ -3657,6 +3661,8 @@ cantfind_ext4: goto failed_mount; failed_mount4: + iput(root); + sb->s_root = NULL; ext4_msg(sb, KERN_ERR, "mount failed"); destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); failed_mount_wq: diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index fc32176eee3..b545ca1c459 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -735,7 +735,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - jbd2_journal_release_buffer(handle, bs->bh); + ext4_handle_release_buffer(handle, bs->bh); if (ce) { mb_cache_entry_release(ce); ce = NULL; @@ -833,7 +833,7 @@ inserted: new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: - ext4_free_blocks(handle, inode, 0, block, 1, + ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); error = -EIO; goto cleanup; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 59c6e495678..b5ed541fb13 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -176,6 +176,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) } /* + * Remove the inode from the writeback list it is on. + */ +void inode_wb_list_del(struct inode *inode) +{ + spin_lock(&inode_wb_list_lock); + list_del_init(&inode->i_wb_list); + spin_unlock(&inode_wb_list_lock); +} + + +/* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. * @@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode) { struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + assert_spin_locked(&inode_wb_list_lock); list_move(&inode->i_wb_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { /* - * Prevent speculative execution through spin_unlock(&inode_lock); + * Prevent speculative execution through + * spin_unlock(&inode_wb_list_lock); */ + smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } @@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue, */ static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { + assert_spin_locked(&inode_wb_list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } @@ -306,25 +322,25 @@ static void inode_wait_for_writeback(struct inode *inode) wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); - while (inode->i_state & I_SYNC) { - spin_unlock(&inode_lock); + while (inode->i_state & I_SYNC) { + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); } } /* - * Write out an inode's dirty pages. Called under inode_lock. Either the - * caller has ref on the inode (either via __iget or via syscall against an fd) - * or the inode has I_WILL_FREE set (via generic_forget_inode) + * Write out an inode's dirty pages. Called under inode_wb_list_lock and + * inode->i_lock. Either the caller has an active reference on the inode or + * the inode has I_WILL_FREE set. * * If `wait' is set, wait on the writeout. * * The whole writeout design is quite complex and fragile. We want to avoid * starvation of particular inodes when others are being redirtied, prevent * livelocks, etc. - * - * Called under inode_lock. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -333,6 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) unsigned dirty; int ret; + assert_spin_locked(&inode_wb_list_lock); + assert_spin_locked(&inode->i_lock); + if (!atomic_read(&inode->i_count)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else @@ -363,7 +382,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* Set I_SYNC, reset I_DIRTY_PAGES */ inode->i_state |= I_SYNC; inode->i_state &= ~I_DIRTY_PAGES; - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); ret = do_writepages(mapping, wbc); @@ -383,10 +403,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * due to delalloc, clear dirty metadata flags right before * write_inode() */ - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { int err = write_inode(inode, wbc); @@ -394,7 +414,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ret = err; } - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { @@ -506,7 +527,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * kind does not need peridic writeout yet, and for the latter * kind writeout is handled by the freer. */ + spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); requeue_io(inode); continue; } @@ -515,10 +538,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. */ - if (inode_dirtied_after(inode, wbc->wb_start)) + if (inode_dirtied_after(inode, wbc->wb_start)) { + spin_unlock(&inode->i_lock); return 1; + } __iget(inode); + pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); if (wbc->pages_skipped != pages_skipped) { @@ -528,10 +554,11 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, */ redirty_tail(inode); } - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); iput(inode); cond_resched(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; return 1; @@ -550,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (!wbc->wb_start) wbc->wb_start = jiffies; /* livelock avoidance */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -568,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb, if (ret) break; } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); /* Leave any unwritten inodes on b_io */ } @@ -577,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); writeback_sb_inodes(sb, wb, wbc, true); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } /* @@ -720,13 +747,15 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); if (!list_empty(&wb->b_more_io)) { inode = wb_inode(wb->b_more_io.prev); trace_wbc_writeback_wait(&wbc, wb->bdi); + spin_lock(&inode->i_lock); inode_wait_for_writeback(inode); + spin_unlock(&inode->i_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } return wrote; @@ -992,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; - bool wakeup_bdi = false; /* * Don't do this for I_DIRTY_PAGES - that doesn't actually @@ -1016,7 +1044,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; @@ -1028,7 +1056,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) * superblock list, based upon its state. */ if (inode->i_state & I_SYNC) - goto out; + goto out_unlock_inode; /* * Only add valid (hashed) inodes to the superblock's @@ -1036,16 +1064,17 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) - goto out; + goto out_unlock_inode; } if (inode->i_state & I_FREEING) - goto out; + goto out_unlock_inode; /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { + bool wakeup_bdi = false; bdi = inode_to_bdi(inode); if (bdi_cap_writeback_dirty(bdi)) { @@ -1062,15 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags) wakeup_bdi = true; } + spin_unlock(&inode->i_lock); + spin_lock(&inode_wb_list_lock); inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + spin_unlock(&inode_wb_list_lock); + + if (wakeup_bdi) + bdi_wakeup_thread_delayed(bdi); + return; } } -out: - spin_unlock(&inode_lock); +out_unlock_inode: + spin_unlock(&inode->i_lock); - if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); } EXPORT_SYMBOL(__mark_inode_dirty); @@ -1101,7 +1135,7 @@ static void wait_sb_inodes(struct super_block *sb) */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); /* * Data integrity sync. Must wait for all pages under writeback, @@ -1111,22 +1145,25 @@ static void wait_sb_inodes(struct super_block *sb) * we still have to wait for that writeout. */ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; + struct address_space *mapping = inode->i_mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); continue; + } __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); + /* - * We hold a reference to 'inode' so it couldn't have - * been removed from s_inodes list while we dropped the - * inode_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it - * under inode_lock. So we keep the reference and iput - * it later. + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. */ iput(old_inode); old_inode = inode; @@ -1135,9 +1172,9 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(old_inode); } @@ -1271,9 +1308,11 @@ int write_inode_now(struct inode *inode, int sync) wbc.nr_to_write = 0; might_sleep(); - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); ret = writeback_single_inode(inode, &wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); if (sync) inode_sync_wait(inode); return ret; @@ -1295,9 +1334,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) { int ret; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); + spin_lock(&inode->i_lock); ret = writeback_single_inode(inode, wbc); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_wb_list_lock); return ret; } EXPORT_SYMBOL(sync_inode); diff --git a/fs/inode.c b/fs/inode.c index 0b3da4a7770..5f4e11aaeb5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -26,6 +26,38 @@ #include <linux/posix_acl.h> #include <linux/ima.h> #include <linux/cred.h> +#include "internal.h" + +/* + * inode locking rules. + * + * inode->i_lock protects: + * inode->i_state, inode->i_hash, __iget() + * inode_lru_lock protects: + * inode_lru, inode->i_lru + * inode_sb_list_lock protects: + * sb->s_inodes, inode->i_sb_list + * inode_wb_list_lock protects: + * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * inode_hash_lock protects: + * inode_hashtable, inode->i_hash + * + * Lock ordering: + * + * inode_sb_list_lock + * inode->i_lock + * inode_lru_lock + * + * inode_wb_list_lock + * inode->i_lock + * + * inode_hash_lock + * inode_sb_list_lock + * inode->i_lock + * + * iunique_lock + * inode_hash_lock + */ /* * This is needed for the following functions: @@ -60,6 +92,8 @@ static unsigned int i_hash_mask __read_mostly; static unsigned int i_hash_shift __read_mostly; +static struct hlist_head *inode_hashtable __read_mostly; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* * Each inode can be on two separate lists. One is @@ -74,15 +108,10 @@ static unsigned int i_hash_shift __read_mostly; */ static LIST_HEAD(inode_lru); -static struct hlist_head *inode_hashtable __read_mostly; +static DEFINE_SPINLOCK(inode_lru_lock); -/* - * A simple spinlock to protect the list manipulations. - * - * NOTE! You also have to own the lock if you change - * the i_state of an inode while it is in use.. - */ -DEFINE_SPINLOCK(inode_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock); /* * iprune_sem provides exclusion between the icache shrinking and the @@ -137,15 +166,6 @@ int proc_nr_inodes(ctl_table *table, int write, } #endif -static void wake_up_inode(struct inode *inode) -{ - /* - * Prevent speculative execution through spin_unlock(&inode_lock); - */ - smp_mb(); - wake_up_bit(&inode->i_state, __I_NEW); -} - /** * inode_init_always - perform inode structure intialisation * @sb: superblock inode belongs to @@ -336,7 +356,7 @@ static void init_once(void *foo) } /* - * inode_lock must be held + * inode->i_lock must be held */ void __iget(struct inode *inode) { @@ -354,23 +374,22 @@ EXPORT_SYMBOL(ihold); static void inode_lru_list_add(struct inode *inode) { + spin_lock(&inode_lru_lock); if (list_empty(&inode->i_lru)) { list_add(&inode->i_lru, &inode_lru); inodes_stat.nr_unused++; } + spin_unlock(&inode_lru_lock); } static void inode_lru_list_del(struct inode *inode) { + spin_lock(&inode_lru_lock); if (!list_empty(&inode->i_lru)) { list_del_init(&inode->i_lru); inodes_stat.nr_unused--; } -} - -static inline void __inode_sb_list_add(struct inode *inode) -{ - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); + spin_unlock(&inode_lru_lock); } /** @@ -379,15 +398,17 @@ static inline void __inode_sb_list_add(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode_lock); - __inode_sb_list_add(inode); - spin_unlock(&inode_lock); + spin_lock(&inode_sb_list_lock); + list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); + spin_unlock(&inode_sb_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); -static inline void __inode_sb_list_del(struct inode *inode) +static inline void inode_sb_list_del(struct inode *inode) { + spin_lock(&inode_sb_list_lock); list_del_init(&inode->i_sb_list); + spin_unlock(&inode_sb_list_lock); } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -412,24 +433,15 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); hlist_add_head(&inode->i_hash, b); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(__insert_inode_hash); /** - * __remove_inode_hash - remove an inode from the hash - * @inode: inode to unhash - * - * Remove an inode from the superblock. - */ -static void __remove_inode_hash(struct inode *inode) -{ - hlist_del_init(&inode->i_hash); -} - -/** * remove_inode_hash - remove an inode from the hash * @inode: inode to unhash * @@ -437,9 +449,11 @@ static void __remove_inode_hash(struct inode *inode) */ void remove_inode_hash(struct inode *inode) { - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); hlist_del_init(&inode->i_hash); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); } EXPORT_SYMBOL(remove_inode_hash); @@ -456,10 +470,29 @@ void end_writeback(struct inode *inode) } EXPORT_SYMBOL(end_writeback); +/* + * Free the inode passed in, removing it from the lists it is still connected + * to. We remove any pages still attached to the inode and wait for any IO that + * is still in progress before finally destroying the inode. + * + * An inode must already be marked I_FREEING so that we avoid the inode being + * moved back onto lists if we race with other code that manipulates the lists + * (e.g. writeback_single_inode). The caller is responsible for setting this. + * + * An inode must already be removed from the LRU list before being evicted from + * the cache. This should occur atomically with setting the I_FREEING state + * flag, so no inodes here should ever be on the LRU when being evicted. + */ static void evict(struct inode *inode) { const struct super_operations *op = inode->i_sb->s_op; + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(!list_empty(&inode->i_lru)); + + inode_wb_list_del(inode); + inode_sb_list_del(inode); + if (op->evict_inode) { op->evict_inode(inode); } else { @@ -471,6 +504,15 @@ static void evict(struct inode *inode) bd_forget(inode); if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); + + remove_inode_hash(inode); + + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); + BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + spin_unlock(&inode->i_lock); + + destroy_inode(inode); } /* @@ -489,14 +531,6 @@ static void dispose_list(struct list_head *head) list_del_init(&inode->i_lru); evict(inode); - - spin_lock(&inode_lock); - __remove_inode_hash(inode); - __inode_sb_list_del(inode); - spin_unlock(&inode_lock); - - wake_up_inode(inode); - destroy_inode(inode); } } @@ -514,25 +548,23 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); continue; + } inode->i_state |= I_FREEING; - - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ - list_move(&inode->i_lru, &dispose); - list_del_init(&inode->i_wb_list); - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - inodes_stat.nr_unused--; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); dispose_list(&dispose); @@ -561,31 +593,30 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); continue; + } if (inode->i_state & I_DIRTY && !kill_dirty) { + spin_unlock(&inode->i_lock); busy = 1; continue; } if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); busy = 1; continue; } inode->i_state |= I_FREEING; - - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ - list_move(&inode->i_lru, &dispose); - list_del_init(&inode->i_wb_list); - if (!(inode->i_state & (I_DIRTY | I_SYNC))) - inodes_stat.nr_unused--; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); dispose_list(&dispose); @@ -607,7 +638,7 @@ static int can_unuse(struct inode *inode) /* * Scan `goal' inodes on the unused list for freeable ones. They are moved to a - * temporary list and then are freed outside inode_lock by dispose_list(). + * temporary list and then are freed outside inode_lru_lock by dispose_list(). * * Any inodes which are pinned purely because of attached pagecache have their * pagecache removed. If the inode has metadata buffers attached to @@ -628,7 +659,7 @@ static void prune_icache(int nr_to_scan) unsigned long reap = 0; down_read(&iprune_sem); - spin_lock(&inode_lock); + spin_lock(&inode_lru_lock); for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; @@ -638,53 +669,67 @@ static void prune_icache(int nr_to_scan) inode = list_entry(inode_lru.prev, struct inode, i_lru); /* + * we are inverting the inode_lru_lock/inode->i_lock here, + * so use a trylock. If we fail to get the lock, just move the + * inode to the back of the list so we don't spin on it. + */ + if (!spin_trylock(&inode->i_lock)) { + list_move(&inode->i_lru, &inode_lru); + continue; + } + + /* * Referenced or dirty inodes are still in use. Give them * another pass through the LRU as we canot reclaim them now. */ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { list_del_init(&inode->i_lru); + spin_unlock(&inode->i_lock); inodes_stat.nr_unused--; continue; } /* recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { - list_move(&inode->i_lru, &inode_lru); inode->i_state &= ~I_REFERENCED; + list_move(&inode->i_lru, &inode_lru); + spin_unlock(&inode->i_lock); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_lru_lock); if (remove_inode_buffers(inode)) reap += invalidate_mapping_pages(&inode->i_data, 0, -1); iput(inode); - spin_lock(&inode_lock); + spin_lock(&inode_lru_lock); if (inode != list_entry(inode_lru.next, struct inode, i_lru)) continue; /* wrong inode or list_empty */ - if (!can_unuse(inode)) + /* avoid lock inversions with trylock */ + if (!spin_trylock(&inode->i_lock)) + continue; + if (!can_unuse(inode)) { + spin_unlock(&inode->i_lock); continue; + } } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + spin_unlock(&inode->i_lock); - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ list_move(&inode->i_lru, &freeable); - list_del_init(&inode->i_wb_list); inodes_stat.nr_unused--; } if (current_is_kswapd()) __count_vm_events(KSWAPD_INODESTEAL, reap); else __count_vm_events(PGINODESTEAL, reap); - spin_unlock(&inode_lock); + spin_unlock(&inode_lru_lock); dispose_list(&freeable); up_read(&iprune_sem); @@ -733,15 +778,21 @@ static struct inode *find_inode(struct super_block *sb, repeat: hlist_for_each_entry(inode, node, head, i_hash) { - if (inode->i_sb != sb) + spin_lock(&inode->i_lock); + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); continue; - if (!test(inode, data)) + } + if (!test(inode, data)) { + spin_unlock(&inode->i_lock); continue; + } if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } __iget(inode); + spin_unlock(&inode->i_lock); return inode; } return NULL; @@ -759,15 +810,21 @@ static struct inode *find_inode_fast(struct super_block *sb, repeat: hlist_for_each_entry(inode, node, head, i_hash) { - if (inode->i_ino != ino) + spin_lock(&inode->i_lock); + if (inode->i_ino != ino) { + spin_unlock(&inode->i_lock); continue; - if (inode->i_sb != sb) + } + if (inode->i_sb != sb) { + spin_unlock(&inode->i_lock); continue; + } if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } __iget(inode); + spin_unlock(&inode->i_lock); return inode; } return NULL; @@ -827,19 +884,26 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - spin_lock_prefetch(&inode_lock); + spin_lock_prefetch(&inode_sb_list_lock); inode = alloc_inode(sb); if (inode) { - spin_lock(&inode_lock); - __inode_sb_list_add(inode); + spin_lock(&inode->i_lock); inode->i_state = 0; - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); } return inode; } EXPORT_SYMBOL(new_inode); +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. + */ void unlock_new_inode(struct inode *inode) { #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -859,51 +923,67 @@ void unlock_new_inode(struct inode *inode) } } #endif - /* - * This is special! We do not need the spinlock when clearing I_NEW, - * because we're guaranteed that nobody else tries to do anything about - * the state of the inode when it is locked, as we just created it (so - * there can be no old holders that haven't tested I_NEW). - * However we must emit the memory barrier so that other CPUs reliably - * see the clearing of I_NEW after the other inode initialisation has - * completed. - */ - smp_mb(); + spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; - wake_up_inode(inode); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); -/* - * This is called without the inode lock held.. Be careful. +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if present it is return it with an increased reference count. This is + * a generalized version of iget_locked() for file systems where the inode + * number is not sufficient for unique identification of an inode. * - * We no longer cache the sb_flags in i_flags - see fs.h - * -- rmk@arm.uk.linux.org + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. */ -static struct inode *get_new_inode(struct super_block *sb, - struct hlist_head *head, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), - void *data) +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) { + struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); + + if (inode) { + wait_on_inode(inode); + return inode; + } + inode = alloc_inode(sb); if (inode) { struct inode *old; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode(sb, head, test, data); if (!old) { if (set(inode, data)) goto set_failed; - hlist_add_head(&inode->i_hash, head); - __inode_sb_list_add(inode); + spin_lock(&inode->i_lock); inode->i_state = I_NEW; - spin_unlock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -916,7 +996,7 @@ static struct inode *get_new_inode(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); inode = old; wait_on_inode(inode); @@ -924,33 +1004,53 @@ static struct inode *get_new_inode(struct super_block *sb, return inode; set_failed: - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); return NULL; } +EXPORT_SYMBOL(iget5_locked); -/* - * get_new_inode_fast is the fast path version of get_new_inode, see the - * comment at iget_locked for details. +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * Search for the inode specified by @ino in the inode cache and if present + * return it with an increased reference count. This is for file systems + * where the inode number is sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). */ -static struct inode *get_new_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) +struct inode *iget_locked(struct super_block *sb, unsigned long ino) { + struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + if (inode) { + wait_on_inode(inode); + return inode; + } + inode = alloc_inode(sb); if (inode) { struct inode *old; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); /* We released the lock, so.. */ old = find_inode_fast(sb, head, ino); if (!old) { inode->i_ino = ino; - hlist_add_head(&inode->i_hash, head); - __inode_sb_list_add(inode); + spin_lock(&inode->i_lock); inode->i_state = I_NEW; - spin_unlock(&inode_lock); + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -963,13 +1063,14 @@ static struct inode *get_new_inode_fast(struct super_block *sb, * us. Use the old inode instead of the one we just * allocated. */ - spin_unlock(&inode_lock); + spin_unlock(&inode_hash_lock); destroy_inode(inode); inode = old; wait_on_inode(inode); } return inode; } +EXPORT_SYMBOL(iget_locked); /* * search the inode cache for a matching inode number. @@ -984,10 +1085,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino) struct hlist_node *node; struct inode *inode; + spin_lock(&inode_hash_lock); hlist_for_each_entry(inode, node, b, i_hash) { - if (inode->i_ino == ino && inode->i_sb == sb) + if (inode->i_ino == ino && inode->i_sb == sb) { + spin_unlock(&inode_hash_lock); return 0; + } } + spin_unlock(&inode_hash_lock); return 1; } @@ -1017,7 +1122,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) static unsigned int counter; ino_t res; - spin_lock(&inode_lock); spin_lock(&iunique_lock); do { if (counter <= max_reserved) @@ -1025,7 +1129,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved) res = counter++; } while (!test_inode_iunique(sb, res)); spin_unlock(&iunique_lock); - spin_unlock(&inode_lock); return res; } @@ -1033,116 +1136,50 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { - spin_lock(&inode_lock); - if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) + spin_lock(&inode->i_lock); + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { __iget(inode); - else + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); /* * Handle the case where s_op->clear_inode is not been * called yet, and somebody is calling igrab * while the inode is getting freed. */ inode = NULL; - spin_unlock(&inode_lock); + } return inode; } EXPORT_SYMBOL(igrab); /** - * ifind - internal function, you want ilookup5() or iget5(). - * @sb: super block of file system to search - * @head: the head of the list to search - * @test: callback used for comparisons between inodes - * @data: opaque data pointer to pass to @test - * @wait: if true wait for the inode to be unlocked, if false do not - * - * ifind() searches for the inode specified by @data in the inode - * cache. This is a generalized version of ifind_fast() for file systems where - * the inode number is not sufficient for unique identification of an inode. - * - * If the inode is in the cache, the inode is returned with an incremented - * reference count. - * - * Otherwise NULL is returned. - * - * Note, @test is called with the inode_lock held, so can't sleep. - */ -static struct inode *ifind(struct super_block *sb, - struct hlist_head *head, int (*test)(struct inode *, void *), - void *data, const int wait) -{ - struct inode *inode; - - spin_lock(&inode_lock); - inode = find_inode(sb, head, test, data); - if (inode) { - spin_unlock(&inode_lock); - if (likely(wait)) - wait_on_inode(inode); - return inode; - } - spin_unlock(&inode_lock); - return NULL; -} - -/** - * ifind_fast - internal function, you want ilookup() or iget(). - * @sb: super block of file system to search - * @head: head of the list to search - * @ino: inode number to search for - * - * ifind_fast() searches for the inode @ino in the inode cache. This is for - * file systems where the inode number is sufficient for unique identification - * of an inode. - * - * If the inode is in the cache, the inode is returned with an incremented - * reference count. - * - * Otherwise NULL is returned. - */ -static struct inode *ifind_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) -{ - struct inode *inode; - - spin_lock(&inode_lock); - inode = find_inode_fast(sb, head, ino); - if (inode) { - spin_unlock(&inode_lock); - wait_on_inode(inode); - return inode; - } - spin_unlock(&inode_lock); - return NULL; -} - -/** * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * - * ilookup5() uses ifind() to search for the inode specified by @hashval and - * @data in the inode cache. This is a generalized version of ilookup() for - * file systems where the inode number is not sufficient for unique - * identification of an inode. - * + * Search for the inode specified by @hashval and @data in the inode cache. * If the inode is in the cache, the inode is returned with an incremented - * reference count. Note, the inode lock is not waited upon so you have to be - * very careful what you do with the returned inode. You probably should be - * using ilookup5() instead. + * reference count. * - * Otherwise NULL is returned. + * Note: I_NEW is not waited upon so you have to be very careful what you do + * with the returned inode. You probably should be using ilookup5() instead. * - * Note, @test is called with the inode_lock held, so can't sleep. + * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); - return ifind(sb, head, test, data, 0); + return inode; } EXPORT_SYMBOL(ilookup5_nowait); @@ -1153,24 +1190,24 @@ EXPORT_SYMBOL(ilookup5_nowait); * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test * - * ilookup5() uses ifind() to search for the inode specified by @hashval and - * @data in the inode cache. This is a generalized version of ilookup() for - * file systems where the inode number is not sufficient for unique - * identification of an inode. - * - * If the inode is in the cache, the inode lock is waited upon and the inode is + * Search for the inode specified by @hashval and @data in the inode cache, + * and if the inode is in the cache, return the inode with an incremented + * reference count. Waits on I_NEW before returning the inode. * returned with an incremented reference count. * - * Otherwise NULL is returned. + * This is a generalized version of ilookup() for file systems where the + * inode number is not sufficient for unique identification of an inode. * - * Note, @test is called with the inode_lock held, so can't sleep. + * Note: @test is called with the inode_hash_lock held, so can't sleep. */ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode = ilookup5_nowait(sb, hashval, test, data); - return ifind(sb, head, test, data, 1); + if (inode) + wait_on_inode(inode); + return inode; } EXPORT_SYMBOL(ilookup5); @@ -1179,91 +1216,23 @@ EXPORT_SYMBOL(ilookup5); * @sb: super block of file system to search * @ino: inode number to search for * - * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. - * This is for file systems where the inode number is sufficient for unique - * identification of an inode. - * - * If the inode is in the cache, the inode is returned with an incremented - * reference count. - * - * Otherwise NULL is returned. + * Search for the inode @ino in the inode cache, and if the inode is in the + * cache, the inode is returned with an incremented reference count. */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); - - return ifind_fast(sb, head, ino); -} -EXPORT_SYMBOL(ilookup); - -/** - * iget5_locked - obtain an inode from a mounted file system - * @sb: super block of file system - * @hashval: hash value (usually inode number) to get - * @test: callback used for comparisons between inodes - * @set: callback used to initialize a new struct inode - * @data: opaque data pointer to pass to @test and @set - * - * iget5_locked() uses ifind() to search for the inode specified by @hashval - * and @data in the inode cache and if present it is returned with an increased - * reference count. This is a generalized version of iget_locked() for file - * systems where the inode number is not sufficient for unique identification - * of an inode. - * - * If the inode is not in cache, get_new_inode() is called to allocate a new - * inode and this is returned locked, hashed, and with the I_NEW flag set. The - * file system gets to fill it in before unlocking it via unlock_new_inode(). - * - * Note both @test and @set are called with the inode_lock held, so can't sleep. - */ -struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *data) -{ - struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; - inode = ifind(sb, head, test, data, 1); - if (inode) - return inode; - /* - * get_new_inode() will do the right thing, re-trying the search - * in case it had to block at any point. - */ - return get_new_inode(sb, head, test, set, data); -} -EXPORT_SYMBOL(iget5_locked); - -/** - * iget_locked - obtain an inode from a mounted file system - * @sb: super block of file system - * @ino: inode number to get - * - * iget_locked() uses ifind_fast() to search for the inode specified by @ino in - * the inode cache and if present it is returned with an increased reference - * count. This is for file systems where the inode number is sufficient for - * unique identification of an inode. - * - * If the inode is not in cache, get_new_inode_fast() is called to allocate a - * new inode and this is returned locked, hashed, and with the I_NEW flag set. - * The file system gets to fill it in before unlocking it via - * unlock_new_inode(). - */ -struct inode *iget_locked(struct super_block *sb, unsigned long ino) -{ - struct hlist_head *head = inode_hashtable + hash(sb, ino); - struct inode *inode; + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); - inode = ifind_fast(sb, head, ino); if (inode) - return inode; - /* - * get_new_inode_fast() will do the right thing, re-trying the search - * in case it had to block at any point. - */ - return get_new_inode_fast(sb, head, ino); + wait_on_inode(inode); + return inode; } -EXPORT_SYMBOL(iget_locked); +EXPORT_SYMBOL(ilookup); int insert_inode_locked(struct inode *inode) { @@ -1271,27 +1240,33 @@ int insert_inode_locked(struct inode *inode) ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - inode->i_state |= I_NEW; while (1) { struct hlist_node *node; struct inode *old = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); hlist_for_each_entry(old, node, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) continue; - if (old->i_state & (I_FREEING|I_WILL_FREE)) + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); continue; + } break; } if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); return 0; } __iget(old); - spin_unlock(&inode_lock); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); @@ -1308,29 +1283,34 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - inode->i_state |= I_NEW; - while (1) { struct hlist_node *node; struct inode *old = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); hlist_for_each_entry(old, node, head, i_hash) { if (old->i_sb != sb) continue; if (!test(old, data)) continue; - if (old->i_state & (I_FREEING|I_WILL_FREE)) + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); continue; + } break; } if (likely(!node)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); return 0; } __iget(old); - spin_unlock(&inode_lock); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); @@ -1375,47 +1355,35 @@ static void iput_final(struct inode *inode) const struct super_operations *op = inode->i_sb->s_op; int drop; + WARN_ON(inode->i_state & I_NEW); + if (op && op->drop_inode) drop = op->drop_inode(inode); else drop = generic_drop_inode(inode); + if (!drop && (sb->s_flags & MS_ACTIVE)) { + inode->i_state |= I_REFERENCED; + if (!(inode->i_state & (I_DIRTY|I_SYNC))) + inode_lru_list_add(inode); + spin_unlock(&inode->i_lock); + return; + } + if (!drop) { - if (sb->s_flags & MS_ACTIVE) { - inode->i_state |= I_REFERENCED; - if (!(inode->i_state & (I_DIRTY|I_SYNC))) { - inode_lru_list_add(inode); - } - spin_unlock(&inode_lock); - return; - } - WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); write_inode_now(inode, 1); - spin_lock(&inode_lock); + spin_lock(&inode->i_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; - __remove_inode_hash(inode); } - WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - - /* - * Move the inode off the IO lists and LRU once I_FREEING is - * set so that it won't get moved back on there if it is dirty. - */ inode_lru_list_del(inode); - list_del_init(&inode->i_wb_list); + spin_unlock(&inode->i_lock); - __inode_sb_list_del(inode); - spin_unlock(&inode_lock); evict(inode); - remove_inode_hash(inode); - wake_up_inode(inode); - BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); - destroy_inode(inode); } /** @@ -1432,7 +1400,7 @@ void iput(struct inode *inode) if (inode) { BUG_ON(inode->i_state & I_CLEAR); - if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) iput_final(inode); } } @@ -1611,9 +1579,8 @@ EXPORT_SYMBOL(inode_wait); * to recheck inode state. * * It doesn't matter if I_NEW is not set initially, a call to - * wake_up_inode() after removing from the hash list will DTRT. - * - * This is called with inode_lock held. + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. */ static void __wait_on_freeing_inode(struct inode *inode) { @@ -1621,10 +1588,11 @@ static void __wait_on_freeing_inode(struct inode *inode) DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); schedule(); finish_wait(wq, &wait.wait); - spin_lock(&inode_lock); + spin_lock(&inode_hash_lock); } static __initdata unsigned long ihash_entries; diff --git a/fs/internal.h b/fs/internal.h index 8318059b42c..b29c46e4e32 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -125,6 +125,13 @@ extern long do_handle_open(int mountdirfd, /* * inode.c */ +extern spinlock_t inode_sb_list_lock; + +/* + * fs-writeback.c + */ +extern void inode_wb_list_del(struct inode *inode); + extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 4f9cc048294..3e93cdd1900 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -31,7 +31,7 @@ * is used to release xattr name/value pair and detach from c->xattrindex. * reclaim_xattr_datum(c) * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when - * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold + * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold * is hard coded as 32KiB. * do_verify_xattr_datum(c, xd) * is used to load the xdatum informations without name/value pair from the medium. diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index 03b8c240aed..edfea7a3a74 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) return ret; } -/* called with inode_lock held */ +/* called with inode->i_lock held */ static int logfs_drop_inode(struct inode *inode) { struct logfs_super *super = logfs_super(inode->i_sb); diff --git a/fs/namei.c b/fs/namei.c index d0066e17d45..3cb616d38d9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -992,6 +992,12 @@ int follow_down_one(struct path *path) return 0; } +static inline bool managed_dentry_might_block(struct dentry *dentry) +{ + return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && + dentry->d_op->d_manage(dentry, true) < 0); +} + /* * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we * meet a managed dentry and we're not walking to "..". True is returned to @@ -1000,19 +1006,26 @@ int follow_down_one(struct path *path) static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode, bool reverse_transit) { - while (d_mountpoint(path->dentry)) { + for (;;) { struct vfsmount *mounted; - if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && - !reverse_transit && - path->dentry->d_op->d_manage(path->dentry, true) < 0) + /* + * Don't forget we might have a non-mountpoint managed dentry + * that wants to block transit. + */ + *inode = path->dentry->d_inode; + if (!reverse_transit && + unlikely(managed_dentry_might_block(path->dentry))) return false; + + if (!d_mountpoint(path->dentry)) + break; + mounted = __lookup_mnt(path->mnt, path->dentry, 1); if (!mounted) break; path->mnt = mounted; path->dentry = mounted->mnt_root; nd->seq = read_seqcount_begin(&path->dentry->d_seq); - *inode = path->dentry->d_inode; } if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index abdf38d5971..7237672216c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -44,6 +44,7 @@ /* #define NFS_DEBUG_VERBOSE 1 */ static int nfs_opendir(struct inode *, struct file *); +static int nfs_closedir(struct inode *, struct file *); static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); @@ -64,7 +65,7 @@ const struct file_operations nfs_dir_operations = { .read = generic_read_dir, .readdir = nfs_readdir, .open = nfs_opendir, - .release = nfs_release, + .release = nfs_closedir, .fsync = nfs_fsync_dir, }; @@ -133,13 +134,35 @@ const struct inode_operations nfs4_dir_inode_operations = { #endif /* CONFIG_NFS_V4 */ +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred) +{ + struct nfs_open_dir_context *ctx; + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + ctx->duped = 0; + ctx->dir_cookie = 0; + ctx->dup_cookie = 0; + ctx->cred = get_rpccred(cred); + } else + ctx = ERR_PTR(-ENOMEM); + return ctx; +} + +static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx) +{ + put_rpccred(ctx->cred); + kfree(ctx); +} + /* * Open file */ static int nfs_opendir(struct inode *inode, struct file *filp) { - int res; + int res = 0; + struct nfs_open_dir_context *ctx; + struct rpc_cred *cred; dfprintk(FILE, "NFS: open dir(%s/%s)\n", filp->f_path.dentry->d_parent->d_name.name, @@ -147,8 +170,15 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); - /* Call generic open code in order to cache credentials */ - res = nfs_open(inode, filp); + cred = rpc_lookup_cred(); + if (IS_ERR(cred)) + return PTR_ERR(cred); + ctx = alloc_nfs_open_dir_context(cred); + if (IS_ERR(ctx)) { + res = PTR_ERR(ctx); + goto out; + } + filp->private_data = ctx; if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { /* This is a mountpoint, so d_revalidate will never * have been called, so we need to refresh the @@ -156,9 +186,18 @@ nfs_opendir(struct inode *inode, struct file *filp) */ __nfs_revalidate_inode(NFS_SERVER(inode), inode); } +out: + put_rpccred(cred); return res; } +static int +nfs_closedir(struct inode *inode, struct file *filp) +{ + put_nfs_open_dir_context(filp->private_data); + return 0; +} + struct nfs_cache_array_entry { u64 cookie; u64 ino; @@ -284,19 +323,20 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri { loff_t diff = desc->file->f_pos - desc->current_index; unsigned int index; + struct nfs_open_dir_context *ctx = desc->file->private_data; if (diff < 0) goto out_eof; if (diff >= array->size) { if (array->eof_index >= 0) goto out_eof; - desc->current_index += array->size; return -EAGAIN; } index = (unsigned int)diff; *desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; + ctx->duped = 0; return 0; out_eof: desc->eof = 1; @@ -307,10 +347,18 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) { int i; + loff_t new_pos; int status = -EAGAIN; + struct nfs_open_dir_context *ctx = desc->file->private_data; for (i = 0; i < array->size; i++) { if (array->array[i].cookie == *desc->dir_cookie) { + new_pos = desc->current_index + i; + if (new_pos < desc->file->f_pos) { + ctx->dup_cookie = *desc->dir_cookie; + ctx->duped = 1; + } + desc->file->f_pos = new_pos; desc->cache_entry_index = i; return 0; } @@ -342,6 +390,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) if (status == -EAGAIN) { desc->last_cookie = array->last_cookie; + desc->current_index += array->size; desc->page_index++; } nfs_readdir_release_array(desc->page); @@ -354,7 +403,8 @@ static int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct file *file, struct inode *inode) { - struct rpc_cred *cred = nfs_file_cred(file); + struct nfs_open_dir_context *ctx = file->private_data; + struct rpc_cred *cred = ctx->cred; unsigned long timestamp, gencount; int error; @@ -693,6 +743,20 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, int i = 0; int res = 0; struct nfs_cache_array *array = NULL; + struct nfs_open_dir_context *ctx = file->private_data; + + if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) { + if (printk_ratelimit()) { + pr_notice("NFS: directory %s/%s contains a readdir loop. " + "Please contact your server vendor. " + "Offending cookie: %llu\n", + file->f_dentry->d_parent->d_name.name, + file->f_dentry->d_name.name, + *desc->dir_cookie); + } + res = -ELOOP; + goto out; + } array = nfs_readdir_get_array(desc->page); if (IS_ERR(array)) { @@ -785,6 +849,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; + struct nfs_open_dir_context *dir_ctx = filp->private_data; int res; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", @@ -801,7 +866,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) memset(desc, 0, sizeof(*desc)); desc->file = filp; - desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie; + desc->dir_cookie = &dir_ctx->dir_cookie; desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = NFS_USE_READDIRPLUS(inode); @@ -853,6 +918,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) { struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", dentry->d_parent->d_name.name, @@ -872,7 +938,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) } if (offset != filp->f_pos) { filp->f_pos = offset; - nfs_file_open_context(filp)->dir_cookie = 0; + dir_ctx->dir_cookie = 0; + dir_ctx->duped = 0; } out: mutex_unlock(&inode->i_mutex); @@ -1068,7 +1135,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd) if (fhandle == NULL || fattr == NULL) goto out_error; - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); if (error) goto out_bad; if (nfs_compare_fh(NFS_FH(inode), fhandle)) @@ -1224,7 +1291,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); if (error == -ENOENT) goto no_entry; if (error < 0) { @@ -1562,7 +1629,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, if (dentry->d_inode) goto out; if (fhandle->size == 0) { - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); + error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr); if (error) goto out_error; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d85a534b15c..3ac5bd695e5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -326,6 +326,9 @@ nfs_file_fsync(struct file *file, int datasync) ret = xchg(&ctx->error, 0); if (!ret && status < 0) ret = status; + if (!ret && !datasync) + /* application has asked for meta-data sync */ + ret = pnfs_layoutcommit_inode(inode, true); return ret; } diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 1084792bc0f..dcb61548887 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -222,6 +222,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh, goto out; } + if (fattr->valid & NFS_ATTR_FATTR_FSID && + !nfs_fsid_equal(&server->fsid, &fattr->fsid)) + memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); + inode = nfs_fhget(sb, mntfh, fattr); if (IS_ERR(inode)) { dprintk("nfs_get_root: get root inode failed\n"); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 01768e5e2c9..57bb31ad7a5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -254,7 +254,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) + nfs_attr_check_mountpoint(sb, fattr); + + if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0 && (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) goto out_no_inode; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; @@ -298,8 +300,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ - if ((fattr->valid & NFS_ATTR_FATTR_FSID) - && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { + if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || + fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else @@ -639,7 +641,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cr ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; - ctx->dir_cookie = 0; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); @@ -1471,6 +1472,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; + atomic_set(&nfsi->commits_outstanding, 0); #endif } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 72e0bddf7a2..ce118ce885d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -39,6 +39,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) return 0; } +static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) +{ + if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) + fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; +} + struct nfs_clone_mount { const struct super_block *sb; const struct dentry *dentry; @@ -214,6 +220,7 @@ extern const u32 nfs41_maxwrite_overhead; /* nfs4proc.c */ #ifdef CONFIG_NFS_V4 extern struct rpc_procinfo nfs4_procedures[]; +void nfs_fixup_secinfo_attributes(struct nfs_fattr *, struct nfs_fh *); #endif extern int nfs4_init_ds_session(struct nfs_client *clp); @@ -276,11 +283,25 @@ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern void nfs_commit_free(struct nfs_write_data *p); extern int nfs_initiate_write(struct nfs_write_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops, int how); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +extern int nfs_initiate_commit(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg); +void nfs_commit_clear_lock(struct nfs_inode *nfsi); +void nfs_commitdata_release(void *data); +void nfs_commit_release_pages(struct nfs_write_data *data); + #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *); @@ -296,12 +317,14 @@ extern int nfs4_init_client(struct nfs_client *clp, rpc_authflavor_t authflavour, int noresvport); extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); -extern int _nfs4_call_sync(struct nfs_server *server, +extern int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply); -extern int _nfs4_call_sync_session(struct nfs_server *server, +extern int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index bf1c68009ff..ad92bf731ff 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -15,6 +15,7 @@ #include <linux/string.h> #include <linux/sunrpc/clnt.h> #include <linux/vfs.h> +#include <linux/sunrpc/gss_api.h> #include "internal.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -27,7 +28,8 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ; static struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, - struct nfs_fattr *fattr); + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor); /* * nfs_path - reconstruct the path given an arbitrary dentry @@ -116,6 +118,100 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } +#ifdef CONFIG_NFS_V4 +static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors, struct inode *inode) +{ + struct gss_api_mech *mech; + struct xdr_netobj oid; + int i; + rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; + + for (i = 0; i < flavors->num_flavors; i++) { + struct nfs4_secinfo_flavor *flavor; + flavor = &flavors->flavors[i]; + + if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { + pseudoflavor = flavor->flavor; + break; + } else if (flavor->flavor == RPC_AUTH_GSS) { + oid.len = flavor->gss.sec_oid4.len; + oid.data = flavor->gss.sec_oid4.data; + mech = gss_mech_get_by_OID(&oid); + if (!mech) + continue; + pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); + gss_mech_put(mech); + break; + } + } + + return pseudoflavor; +} + +static rpc_authflavor_t nfs_negotiate_security(const struct dentry *parent, const struct dentry *dentry) +{ + int status = 0; + struct page *page; + struct nfs4_secinfo_flavors *flavors; + int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); + rpc_authflavor_t flavor = RPC_AUTH_UNIX; + + secinfo = NFS_PROTO(parent->d_inode)->secinfo; + if (secinfo != NULL) { + page = alloc_page(GFP_KERNEL); + if (!page) { + status = -ENOMEM; + goto out; + } + flavors = page_address(page); + status = secinfo(parent->d_inode, &dentry->d_name, flavors); + flavor = nfs_find_best_sec(flavors, dentry->d_inode); + put_page(page); + } + + return flavor; + +out: + status = -ENOMEM; + return status; +} + +static rpc_authflavor_t nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent, + struct dentry *dentry, struct path *path, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + rpc_authflavor_t flavor; + struct rpc_clnt *clone; + struct rpc_auth *auth; + int err; + + flavor = nfs_negotiate_security(parent, path->dentry); + if (flavor < 0) + goto out; + clone = rpc_clone_client(server->client); + auth = rpcauth_create(flavor, clone); + if (!auth) { + flavor = -EIO; + goto out; + } + err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode, + &path->dentry->d_name, + fh, fattr); + if (err < 0) + flavor = err; +out: + return flavor; +} +#else /* CONFIG_NFS_V4 */ +static inline rpc_authflavor_t nfs_lookup_with_sec(struct nfs_server *server, + struct dentry *parent, struct dentry *dentry, + struct path *path, struct nfs_fh *fh, + struct nfs_fattr *fattr) +{ + return -EPERM; +} +#endif /* CONFIG_NFS_V4 */ + /* * nfs_d_automount - Handle crossing a mountpoint on the server * @path - The mountpoint @@ -136,6 +232,7 @@ struct vfsmount *nfs_d_automount(struct path *path) struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; int err; + rpc_authflavor_t flavor = 1; dprintk("--> nfs_d_automount()\n"); @@ -153,9 +250,16 @@ struct vfsmount *nfs_d_automount(struct path *path) /* Look it up again to get its attributes */ parent = dget_parent(path->dentry); - err = server->nfs_client->rpc_ops->lookup(parent->d_inode, + err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode, &path->dentry->d_name, fh, fattr); + if (err == -EPERM) { + flavor = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr); + if (flavor < 0) + err = flavor; + else + err = 0; + } dput(parent); if (err != 0) { mnt = ERR_PTR(err); @@ -165,7 +269,7 @@ struct vfsmount *nfs_d_automount(struct path *path) if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) mnt = nfs_do_refmount(path->dentry); else - mnt = nfs_do_submount(path->dentry, fh, fattr); + mnt = nfs_do_submount(path->dentry, fh, fattr, flavor); if (IS_ERR(mnt)) goto out; @@ -232,17 +336,20 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, * @dentry - parent directory * @fh - filehandle for new root dentry * @fattr - attributes for new root inode + * @authflavor - security flavor to use when performing the mount * */ static struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, - struct nfs_fattr *fattr) + struct nfs_fattr *fattr, + rpc_authflavor_t authflavor) { struct nfs_clone_mount mountdata = { .sb = dentry->d_sb, .dentry = dentry, .fh = fh, .fattr = fattr, + .authflavor = authflavor, }; struct vfsmount *mnt = ERR_PTR(-ENOMEM); char *page = (char *) __get_free_page(GFP_USER); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d0c80d8b3f9..38053d823eb 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -141,7 +141,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs3_proc_lookup(struct inode *dir, struct qstr *name, +nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs3_diropargs arg = { diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c64be1cff08..e1c261ddd65 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -57,7 +57,8 @@ enum nfs4_session_state { struct nfs4_minor_version_ops { u32 minor_version; - int (*call_sync)(struct nfs_server *server, + int (*call_sync)(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -262,6 +263,8 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); +extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, + bool sync); static inline bool is_ds_only_client(struct nfs_client *clp) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 42855846481..6f8192f4cfc 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -154,6 +154,23 @@ static int filelayout_read_done_cb(struct rpc_task *task, } /* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. + */ +static void +filelayout_set_layoutcommit(struct nfs_write_data *wdata) +{ + if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds || + wdata->res.verf->committed == NFS_FILE_SYNC) + return; + + pnfs_set_layoutcommit(wdata); + dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino, + (unsigned long) wdata->lseg->pls_end_pos); +} + +/* * Call ops for the async read/write cases * In the case of dense layouts, the offset needs to be reset to its * original value. @@ -210,6 +227,38 @@ static int filelayout_write_done_cb(struct rpc_task *task, return -EAGAIN; } + filelayout_set_layoutcommit(data); + return 0; +} + +/* Fake up some data that will cause nfs_commit_release to retry the writes. */ +static void prepare_to_resend_writes(struct nfs_write_data *data) +{ + struct nfs_page *first = nfs_list_entry(data->pages.next); + + data->task.tk_status = 0; + memcpy(data->verf.verifier, first->wb_verf.verifier, + sizeof(first->wb_verf.verifier)); + data->verf.verifier[0]++; /* ensure verifier mismatch */ +} + +static int filelayout_commit_done_cb(struct rpc_task *task, + struct nfs_write_data *data) +{ + int reset = 0; + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + prepare_to_resend_writes(data); + filelayout_set_lo_fail(data->lseg); + } else + nfs_restart_rpc(task, data->ds_clp); + return -EAGAIN; + } + return 0; } @@ -240,6 +289,16 @@ static void filelayout_write_release(void *data) wdata->mds_ops->rpc_release(data); } +static void filelayout_commit_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + nfs_commit_release_pages(wdata); + if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding)) + nfs_commit_clear_lock(NFS_I(wdata->inode)); + nfs_commitdata_release(wdata); +} + struct rpc_call_ops filelayout_read_call_ops = { .rpc_call_prepare = filelayout_read_prepare, .rpc_call_done = filelayout_read_call_done, @@ -252,6 +311,12 @@ struct rpc_call_ops filelayout_write_call_ops = { .rpc_release = filelayout_write_release, }; +struct rpc_call_ops filelayout_commit_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_commit_release, +}; + static enum pnfs_try_status filelayout_read_pagelist(struct nfs_read_data *data) { @@ -320,10 +385,6 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync) data->inode->i_ino, sync, (size_t) data->args.count, offset, ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); - /* We can't handle commit to ds yet */ - if (!FILELAYOUT_LSEG(lseg)->commit_through_mds) - data->args.stable = NFS_FILE_SYNC; - data->write_done_cb = filelayout_write_done_cb; data->ds_clp = ds->ds_clp; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -441,12 +502,33 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, struct nfs4_layoutget_res *lgr, struct nfs4_deviceid *id) { - uint32_t *p = (uint32_t *)lgr->layout.buf; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = lgr->layoutp->pages, + .page_len = lgr->layoutp->len, + .buflen = lgr->layoutp->len, + .len = lgr->layoutp->len, + }; + struct page *scratch; + __be32 *p; uint32_t nfl_util; int i; dprintk("%s: set_layout_map Begin\n", __func__); + scratch = alloc_page(GFP_KERNEL); + if (!scratch) + return -ENOMEM; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), + * num_fh (4) */ + p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20); + if (unlikely(!p)) + goto out_err; + memcpy(id, p, sizeof(*id)); p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); print_deviceid(id); @@ -468,32 +550,57 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, __func__, nfl_util, fl->num_fh, fl->first_stripe_index, fl->pattern_offset); + if (!fl->num_fh) + goto out_err; + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), GFP_KERNEL); if (!fl->fh_array) - return -ENOMEM; + goto out_err; for (i = 0; i < fl->num_fh; i++) { /* Do we want to use a mempool here? */ fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); - if (!fl->fh_array[i]) { - filelayout_free_fh_array(fl); - return -ENOMEM; - } + if (!fl->fh_array[i]) + goto out_err_free; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free; fl->fh_array[i]->size = be32_to_cpup(p++); if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { printk(KERN_ERR "Too big fh %d received %d\n", i, fl->fh_array[i]->size); - filelayout_free_fh_array(fl); - return -EIO; + goto out_err_free; } + + p = xdr_inline_decode(&stream, fl->fh_array[i]->size); + if (unlikely(!p)) + goto out_err_free; memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); - p += XDR_QUADLEN(fl->fh_array[i]->size); dprintk("DEBUG: %s: fh len %d\n", __func__, fl->fh_array[i]->size); } + __free_page(scratch); return 0; + +out_err_free: + filelayout_free_fh_array(fl); +out_err: + __free_page(scratch); + return -EIO; +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + + dprintk("--> %s\n", __func__); + nfs4_fl_put_deviceid(fl->dsaddr); + kfree(fl->commit_buckets); + _filelayout_free_lseg(fl); } static struct pnfs_layout_segment * @@ -514,17 +621,28 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, _filelayout_free_lseg(fl); return NULL; } - return &fl->generic_hdr; -} -static void -filelayout_free_lseg(struct pnfs_layout_segment *lseg) -{ - struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); - - dprintk("--> %s\n", __func__); - nfs4_fl_put_deviceid(fl->dsaddr); - _filelayout_free_lseg(fl); + /* This assumes there is only one IOMODE_RW lseg. What + * we really want to do is have a layout_hdr level + * dictionary of <multipath_list4, fh> keys, each + * associated with a struct list_head, populated by calls + * to filelayout_write_pagelist(). + * */ + if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) { + int i; + int size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + fl->commit_buckets = kcalloc(size, sizeof(struct list_head), GFP_KERNEL); + if (!fl->commit_buckets) { + filelayout_free_lseg(&fl->generic_hdr); + return NULL; + } + fl->number_of_buckets = size; + for (i = 0; i < size; i++) + INIT_LIST_HEAD(&fl->commit_buckets[i]); + } + return &fl->generic_hdr; } /* @@ -552,6 +670,191 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, return (p_stripe == r_stripe); } +static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg) +{ + return !FILELAYOUT_LSEG(lseg)->commit_through_mds; +} + +static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) +{ + if (fl->stripe_type == STRIPE_SPARSE) + return nfs4_fl_calc_ds_index(&fl->generic_hdr, j); + else + return j; +} + +struct list_head *filelayout_choose_commit_list(struct nfs_page *req) +{ + struct pnfs_layout_segment *lseg = req->wb_commit_lseg; + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + u32 i, j; + struct list_head *list; + + /* Note that we are calling nfs4_fl_calc_j_index on each page + * that ends up being committed to a data server. An attractive + * alternative is to add a field to nfs_write_data and nfs_page + * to store the value calculated in filelayout_write_pagelist + * and just use that here. + */ + j = nfs4_fl_calc_j_index(lseg, + (loff_t)req->wb_index << PAGE_CACHE_SHIFT); + i = select_bucket_index(fl, j); + list = &fl->commit_buckets[i]; + if (list_empty(list)) { + /* Non-empty buckets hold a reference on the lseg */ + get_lseg(lseg); + } + return list; +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) + return i; + else + return nfs4_fl_calc_ds_index(lseg, i); +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + } + return flseg->fh_array[i]; +} + +static int filelayout_initiate_commit(struct nfs_write_data *data, int how) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + u32 idx; + struct nfs_fh *fh; + + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + prepare_to_resend_writes(data); + data->mds_ops->rpc_release(data); + return -EAGAIN; + } + dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how); + data->write_done_cb = filelayout_commit_done_cb; + data->ds_clp = ds->ds_clp; + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + if (fh) + data->args.fh = fh; + return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient, + &filelayout_commit_call_ops, how); +} + +/* + * This is only useful while we are using whole file layouts. + */ +static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode) +{ + struct pnfs_layout_segment *lseg, *rv = NULL; + + spin_lock(&inode->i_lock); + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + rv = get_lseg(lseg); + spin_unlock(&inode->i_lock); + return rv; +} + +static int alloc_ds_commits(struct inode *inode, struct list_head *list) +{ + struct pnfs_layout_segment *lseg; + struct nfs4_filelayout_segment *fl; + struct nfs_write_data *data; + int i, j; + + /* Won't need this when non-whole file layout segments are supported + * instead we will use a pnfs_layout_hdr structure */ + lseg = find_only_write_lseg(inode); + if (!lseg) + return 0; + fl = FILELAYOUT_LSEG(lseg); + for (i = 0; i < fl->number_of_buckets; i++) { + if (list_empty(&fl->commit_buckets[i])) + continue; + data = nfs_commitdata_alloc(); + if (!data) + goto out_bad; + data->ds_commit_index = i; + data->lseg = lseg; + list_add(&data->pages, list); + } + put_lseg(lseg); + return 0; + +out_bad: + for (j = i; j < fl->number_of_buckets; j++) { + if (list_empty(&fl->commit_buckets[i])) + continue; + nfs_retry_commit(&fl->commit_buckets[i], lseg); + put_lseg(lseg); /* associated with emptying bucket */ + } + put_lseg(lseg); + /* Caller will clean up entries put on list */ + return -ENOMEM; +} + +/* This follows nfs_commit_list pretty closely */ +static int +filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how) +{ + struct nfs_write_data *data, *tmp; + LIST_HEAD(list); + + if (!list_empty(mds_pages)) { + data = nfs_commitdata_alloc(); + if (!data) + goto out_bad; + data->lseg = NULL; + list_add(&data->pages, &list); + } + + if (alloc_ds_commits(inode, &list)) + goto out_bad; + + list_for_each_entry_safe(data, tmp, &list, pages) { + list_del_init(&data->pages); + atomic_inc(&NFS_I(inode)->commits_outstanding); + if (!data->lseg) { + nfs_init_commit(data, mds_pages, NULL); + nfs_initiate_commit(data, NFS_CLIENT(inode), + data->mds_ops, how); + } else { + nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg); + filelayout_initiate_commit(data, how); + } + } + return 0; + out_bad: + list_for_each_entry_safe(data, tmp, &list, pages) { + nfs_retry_commit(&data->pages, data->lseg); + list_del_init(&data->pages); + nfs_commit_free(data); + } + nfs_retry_commit(mds_pages, NULL); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; +} + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", @@ -559,6 +862,9 @@ static struct pnfs_layoutdriver_type filelayout_type = { .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, .pg_test = filelayout_pg_test, + .mark_pnfs_commit = filelayout_mark_pnfs_commit, + .choose_commit_list = filelayout_choose_commit_list, + .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, }; diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index ee0c907742b..085a354e0f0 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -79,6 +79,8 @@ struct nfs4_filelayout_segment { struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ unsigned int num_fh; struct nfs_fh **fh_array; + struct list_head *commit_buckets; /* Sort commits to ds */ + int number_of_buckets; }; static inline struct nfs4_filelayout_segment * diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 68143c162e3..de5350f2b24 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -261,7 +261,7 @@ out: * Currently only support ipv4, and one multi-path address. */ static struct nfs4_pnfs_ds * -decode_and_add_ds(__be32 **pp, struct inode *inode) +decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode) { struct nfs4_pnfs_ds *ds = NULL; char *buf; @@ -269,25 +269,34 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) u32 ip_addr, port; int nlen, rlen, i; int tmp[2]; - __be32 *r_netid, *r_addr, *p = *pp; + __be32 *p; /* r_netid */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; nlen = be32_to_cpup(p++); - r_netid = p; - p += XDR_QUADLEN(nlen); - /* r_addr */ - rlen = be32_to_cpup(p++); - r_addr = p; - p += XDR_QUADLEN(rlen); - *pp = p; + p = xdr_inline_decode(streamp, nlen); + if (unlikely(!p)) + goto out_err; /* Check that netid is "tcp" */ - if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) { + if (nlen != 3 || memcmp((char *)p, "tcp", 3)) { dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); goto out_err; } + /* r_addr */ + p = xdr_inline_decode(streamp, 4); + if (unlikely(!p)) + goto out_err; + rlen = be32_to_cpup(p); + + p = xdr_inline_decode(streamp, rlen); + if (unlikely(!p)) + goto out_err; + /* ipv6 length plus port is legal */ if (rlen > INET6_ADDRSTRLEN + 8) { dprintk("%s: Invalid address, length %d\n", __func__, @@ -300,7 +309,7 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) goto out_err; } buf[rlen] = '\0'; - memcpy(buf, r_addr, rlen); + memcpy(buf, p, rlen); /* replace the port dots with dashes for the in4_pton() delimiter*/ for (i = 0; i < 2; i++) { @@ -336,90 +345,154 @@ out_err: static struct nfs4_file_layout_dsaddr* decode_device(struct inode *ino, struct pnfs_device *pdev) { - int i, dummy; + int i; u32 cnt, num; u8 *indexp; - __be32 *p = (__be32 *)pdev->area, *indicesp; - struct nfs4_file_layout_dsaddr *dsaddr; + __be32 *p; + u8 *stripe_indices; + u8 max_stripe_index; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + struct xdr_stream stream; + struct xdr_buf buf = { + .pages = pdev->pages, + .page_len = pdev->pglen, + .buflen = pdev->pglen, + .len = pdev->pglen, + }; + struct page *scratch; + + /* set up xdr stream */ + scratch = alloc_page(GFP_KERNEL); + if (!scratch) + goto out_err; + + xdr_init_decode(&stream, &buf, NULL); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); /* Get the stripe count (number of stripe index) */ - cnt = be32_to_cpup(p++); + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_scratch; + + cnt = be32_to_cpup(p); dprintk("%s stripe count %d\n", __func__, cnt); if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { printk(KERN_WARNING "%s: stripe count %d greater than " "supported maximum %d\n", __func__, cnt, NFS4_PNFS_MAX_STRIPE_CNT); - goto out_err; + goto out_err_free_scratch; + } + + /* read stripe indices */ + stripe_indices = kcalloc(cnt, sizeof(u8), GFP_KERNEL); + if (!stripe_indices) + goto out_err_free_scratch; + + p = xdr_inline_decode(&stream, cnt << 2); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + indexp = &stripe_indices[0]; + max_stripe_index = 0; + for (i = 0; i < cnt; i++) { + *indexp = be32_to_cpup(p++); + max_stripe_index = max(max_stripe_index, *indexp); + indexp++; } /* Check the multipath list count */ - indicesp = p; - p += XDR_QUADLEN(cnt << 2); - num = be32_to_cpup(p++); + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_stripe_indices; + + num = be32_to_cpup(p); dprintk("%s ds_num %u\n", __func__, num); if (num > NFS4_PNFS_MAX_MULTI_CNT) { printk(KERN_WARNING "%s: multipath count %d greater than " "supported maximum %d\n", __func__, num, NFS4_PNFS_MAX_MULTI_CNT); - goto out_err; + goto out_err_free_stripe_indices; } + + /* validate stripe indices are all < num */ + if (max_stripe_index >= num) { + printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n", + __func__, max_stripe_index, num); + goto out_err_free_stripe_indices; + } + dsaddr = kzalloc(sizeof(*dsaddr) + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), GFP_KERNEL); if (!dsaddr) - goto out_err; - - dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); - if (!dsaddr->stripe_indices) - goto out_err_free; + goto out_err_free_stripe_indices; dsaddr->stripe_count = cnt; + dsaddr->stripe_indices = stripe_indices; + stripe_indices = NULL; dsaddr->ds_num = num; memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); - /* Go back an read stripe indices */ - p = indicesp; - indexp = &dsaddr->stripe_indices[0]; - for (i = 0; i < dsaddr->stripe_count; i++) { - *indexp = be32_to_cpup(p++); - if (*indexp >= num) - goto out_err_free; - indexp++; - } - /* Skip already read multipath list count */ - p++; - for (i = 0; i < dsaddr->ds_num; i++) { int j; + u32 mp_count; + + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; - dummy = be32_to_cpup(p++); /* multipath count */ - if (dummy > 1) { + mp_count = be32_to_cpup(p); /* multipath count */ + if (mp_count > 1) { printk(KERN_WARNING "%s: Multipath count %d not supported, " "skipping all greater than 1\n", __func__, - dummy); + mp_count); } - for (j = 0; j < dummy; j++) { + for (j = 0; j < mp_count; j++) { if (j == 0) { - dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); + dsaddr->ds_list[i] = decode_and_add_ds(&stream, + ino); if (dsaddr->ds_list[i] == NULL) - goto out_err_free; + goto out_err_free_deviceid; } else { u32 len; /* skip extra multipath */ - len = be32_to_cpup(p++); - p += XDR_QUADLEN(len); - len = be32_to_cpup(p++); - p += XDR_QUADLEN(len); - continue; + + /* read len, skip */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + len = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, len); + if (unlikely(!p)) + goto out_err_free_deviceid; + + /* read len, skip */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_free_deviceid; + len = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, len); + if (unlikely(!p)) + goto out_err_free_deviceid; } } } + + __free_page(scratch); return dsaddr; -out_err_free: +out_err_free_deviceid: nfs4_fl_free_deviceid(dsaddr); + /* stripe_indicies was part of dsaddr */ + goto out_err_free_scratch; +out_err_free_stripe_indices: + kfree(stripe_indices); +out_err_free_scratch: + __free_page(scratch); out_err: dprintk("%s ERROR: returning NULL\n", __func__); return NULL; @@ -498,11 +571,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) goto out_free; } - /* set pdev->area */ - pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); - if (!pdev->area) - goto out_free; - memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); pdev->layout_type = LAYOUT_NFSV4_1_FILES; pdev->pages = pages; @@ -521,8 +589,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id) */ dsaddr = decode_and_add_device(inode, pdev); out_free: - if (pdev->area != NULL) - vunmap(pdev->area); for (i = 0; i < max_pages; i++) __free_page(pages[i]); kfree(pages); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1d84e7088af..dfd1e6d7e6c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -41,6 +41,7 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/gss_api.h> #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> @@ -71,7 +72,9 @@ static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); -static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); +static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, @@ -85,6 +88,8 @@ static int nfs4_map_errors(int err) switch (err) { case -NFS4ERR_RESOURCE: return -EREMOTEIO; + case -NFS4ERR_WRONGSEC: + return -EPERM; case -NFS4ERR_BADOWNER: case -NFS4ERR_BADNAME: return -EINVAL; @@ -657,7 +662,8 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static int nfs4_call_sync_sequence(struct nfs_server *server, +static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -673,7 +679,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server, .cache_reply = cache_reply, }; struct rpc_task_setup task_setup = { - .rpc_client = server->client, + .rpc_client = clnt, .rpc_message = msg, .callback_ops = &nfs41_call_sync_ops, .callback_data = &data @@ -692,13 +698,14 @@ static int nfs4_call_sync_sequence(struct nfs_server *server, return ret; } -int _nfs4_call_sync_session(struct nfs_server *server, +int _nfs4_call_sync_session(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { - return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); + return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0); } #else @@ -709,19 +716,28 @@ static int nfs4_sequence_done(struct rpc_task *task, } #endif /* CONFIG_NFS_V4_1 */ -int _nfs4_call_sync(struct nfs_server *server, +int _nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply) { args->sa_session = res->sr_session = NULL; - return rpc_call_sync(server->client, msg, 0); + return rpc_call_sync(clnt, msg, 0); } -#define nfs4_call_sync(server, msg, args, res, cache_reply) \ - (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ - &(res)->seq_res, (cache_reply)) +static inline +int nfs4_call_sync(struct rpc_clnt *clnt, + struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply) +{ + return server->nfs_client->cl_mvops->call_sync(clnt, server, msg, + args, res, cache_reply); +} static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) { @@ -1831,7 +1847,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); - status = nfs4_call_sync(server, &msg, &arg, &res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status == 0 && state != NULL) renew_lease(server, timestamp); return status; @@ -2090,7 +2106,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f }; int status; - status = nfs4_call_sync(server, &msg, &args, &res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| @@ -2160,7 +2176,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(info->fattr); - return nfs4_call_sync(server, &msg, &args, &res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, @@ -2176,15 +2192,43 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, return err; } +static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info, rpc_authflavor_t flavor) +{ + struct rpc_auth *auth; + int ret; + + auth = rpcauth_create(flavor, server->client); + if (!auth) { + ret = -EIO; + goto out; + } + ret = nfs4_lookup_root(server, fhandle, info); + if (ret < 0) + ret = -EAGAIN; +out: + return ret; +} + /* * get the file handle for the "/" directory on the server */ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { - int status; + int i, len, status = 0; + rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS + 2]; + + flav_array[0] = RPC_AUTH_UNIX; + len = gss_mech_list_pseudoflavors(&flav_array[1]); + flav_array[1+len] = RPC_AUTH_NULL; + len += 2; - status = nfs4_lookup_root(server, fhandle, info); + for (i = 0; i < len; i++) { + status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]); + if (status == 0) + break; + } if (status == 0) status = nfs4_server_capabilities(server, fhandle); if (status == 0) @@ -2249,7 +2293,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(fattr); - return nfs4_call_sync(server, &msg, &args, &res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) @@ -2309,9 +2353,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, return status; } -static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *dirfh, - const struct qstr *name, struct nfs_fh *fhandle, - struct nfs_fattr *fattr) +static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server, + const struct nfs_fh *dirfh, const struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { int status; struct nfs4_lookup_arg args = { @@ -2333,7 +2377,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d nfs_fattr_init(fattr); dprintk("NFS call lookupfh %s\n", name->name); - status = nfs4_call_sync(server, &msg, &args, &res, 0); + status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("NFS reply lookupfh: %d\n", status); return status; } @@ -2345,7 +2389,7 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, struct nfs4_exception exception = { }; int err; do { - err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); + err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr); /* FIXME: !!!! */ if (err == -NFS4ERR_MOVED) { err = -EREMOTE; @@ -2356,27 +2400,41 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh, return err; } -static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, - struct nfs_fh *fhandle, struct nfs_fattr *fattr) +static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, + const struct qstr *name, struct nfs_fh *fhandle, + struct nfs_fattr *fattr) { int status; dprintk("NFS call lookup %s\n", name->name); - status = _nfs4_proc_lookupfh(NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); + status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); if (status == -NFS4ERR_MOVED) status = nfs4_get_referral(dir, name, fattr, fhandle); dprintk("NFS reply lookup: %d\n", status); return status; } -static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) +void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr, struct nfs_fh *fh) +{ + memset(fh, 0, sizeof(struct nfs_fh)); + fattr->fsid.major = 1; + fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | + NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_FSID | NFS_ATTR_FATTR_MOUNTPOINT; + fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO; + fattr->nlink = 2; +} + +static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, + struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs4_exception exception = { }; int err; do { err = nfs4_handle_exception(NFS_SERVER(dir), - _nfs4_proc_lookup(dir, name, fhandle, fattr), + _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr), &exception); + if (err == -EPERM) + nfs_fixup_secinfo_attributes(fattr, fhandle); } while (exception.retry); return err; } @@ -2421,7 +2479,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry if (res.fattr == NULL) return -ENOMEM; - status = nfs4_call_sync(server, &msg, &args, &res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (!status) { entry->mask = 0; if (res.access & NFS4_ACCESS_READ) @@ -2488,7 +2546,7 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page, .rpc_resp = &res, }; - return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); + return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_readlink(struct inode *inode, struct page *page, @@ -2577,7 +2635,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name) if (res.dir_attr == NULL) goto out; - status = nfs4_call_sync(server, &msg, &args, &res, 1); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); if (status == 0) { update_changeattr(dir, &res.cinfo); nfs_post_op_update_inode(dir, res.dir_attr); @@ -2678,7 +2736,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name, if (res.old_fattr == NULL || res.new_fattr == NULL) goto out; - status = nfs4_call_sync(server, &msg, &arg, &res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(old_dir, &res.old_cinfo); nfs_post_op_update_inode(old_dir, res.old_fattr); @@ -2729,7 +2787,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr * if (res.fattr == NULL || res.dir_attr == NULL) goto out; - status = nfs4_call_sync(server, &msg, &arg, &res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { update_changeattr(dir, &res.cinfo); nfs_post_op_update_inode(dir, res.dir_attr); @@ -2792,8 +2850,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) { - int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg, - &data->arg, &data->res, 1); + int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { update_changeattr(dir, &data->res.dir_cinfo); nfs_post_op_update_inode(dir, data->res.dir_fattr); @@ -2905,7 +2963,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, (unsigned long long)cookie); nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); res.pgbase = args.pgbase; - status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); if (status >= 0) { memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; @@ -2997,7 +3055,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, }; nfs_fattr_init(fsstat->fattr); - return nfs4_call_sync(server, &msg, &args, &res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) @@ -3028,7 +3086,7 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; - return nfs4_call_sync(server, &msg, &args, &res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) @@ -3073,7 +3131,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle } nfs_fattr_init(pathconf->fattr); - return nfs4_call_sync(server, &msg, &args, &res, 0); + return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, @@ -3195,12 +3253,9 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; } -static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); @@ -3210,11 +3265,24 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } +static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->write_done_cb(task, data); +} + static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->inode); - - data->args.bitmask = server->cache_consistency_bitmask; + + if (data->lseg) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } else + data->args.bitmask = server->cache_consistency_bitmask; + if (!data->write_done_cb) + data->write_done_cb = nfs4_commit_done_cb; data->res.server = server; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; } @@ -3452,7 +3520,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu resp_buf = buf; buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); } - ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); + ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); if (ret) goto out_free; if (res.acl_len > args.acl_len) @@ -3527,7 +3595,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl if (i < 0) return i; nfs_inode_return_delegation(inode); - ret = nfs4_call_sync(server, &msg, &arg, &res, 1); + ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); /* * Free each page after tx, so the only ref left is @@ -3890,7 +3958,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock lsp = request->fl_u.nfs4_fl.owner; arg.lock_owner.id = lsp->ls_id.id; arg.lock_owner.s_dev = server->s_dev; - status = nfs4_call_sync(server, &msg, &arg, &res, 1); + status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); switch (status) { case 0: request->fl_type = F_UNLCK; @@ -4618,12 +4686,46 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; - status = nfs4_call_sync(server, &msg, &args, &res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; } +static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +{ + int status; + struct nfs4_secinfo_arg args = { + .dir_fh = NFS_FH(dir), + .name = name, + }; + struct nfs4_secinfo_res res = { + .flavors = flavors, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO], + .rpc_argp = &args, + .rpc_resp = &res, + }; + + dprintk("NFS call secinfo %s\n", name->name); + status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + dprintk("NFS reply secinfo: %d\n", status); + return status; +} + +int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(dir), + _nfs4_proc_secinfo(dir, name, flavors), + &exception); + } while (exception.retry); + return err; +} + #ifdef CONFIG_NFS_V4_1 /* * Check the exchange flags returned by the server for invalid flags, having @@ -5516,8 +5618,6 @@ static void nfs4_layoutget_release(void *calldata) struct nfs4_layoutget *lgp = calldata; dprintk("--> %s\n", __func__); - if (lgp->res.layout.buf != NULL) - free_page((unsigned long) lgp->res.layout.buf); put_nfs_open_context(lgp->args.ctx); kfree(calldata); dprintk("<-- %s\n", __func__); @@ -5549,12 +5649,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) dprintk("--> %s\n", __func__); - lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); - if (lgp->res.layout.buf == NULL) { - nfs4_layoutget_release(lgp); - return -ENOMEM; - } - + lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) @@ -5586,7 +5681,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) int status; dprintk("--> %s\n", __func__); - status = nfs4_call_sync(server, &msg, &args, &res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -5606,6 +5701,100 @@ int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) } EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); +static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (nfs4_setup_sequence(server, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} + +static void +nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { /* Just ignore these failures */ + case NFS4ERR_DELEG_REVOKED: /* layout was recalled */ + case NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */ + case NFS4ERR_BADLAYOUT: /* no layout */ + case NFS4ERR_GRACE: /* loca_recalim always false */ + task->tk_status = 0; + } + + if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + nfs_restart_rpc(task, server->nfs_client); + return; + } + + if (task->tk_status == 0) + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); +} + +static void nfs4_layoutcommit_release(void *calldata) +{ + struct nfs4_layoutcommit_data *data = calldata; + + /* Matched by references in pnfs_set_layoutcommit */ + put_lseg(data->lseg); + put_rpccred(data->cred); + kfree(data); +} + +static const struct rpc_call_ops nfs4_layoutcommit_ops = { + .rpc_call_prepare = nfs4_layoutcommit_prepare, + .rpc_call_done = nfs4_layoutcommit_done, + .rpc_release = nfs4_layoutcommit_release, +}; + +int +nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(data->args.inode), + .rpc_message = &msg, + .callback_ops = &nfs4_layoutcommit_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + int status = 0; + + dprintk("NFS: %4d initiating layoutcommit call. sync %d " + "lbw: %llu inode %lu\n", + data->task.tk_pid, sync, + data->args.lastbytewritten, + data->args.inode->i_ino); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (sync == false) + goto out; + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = task->tk_status; +out: + dprintk("%s: status %d\n", __func__, status); + rpc_put_task(task); + return status; +} #endif /* CONFIG_NFS_V4_1 */ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { @@ -5741,6 +5930,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, .init_client = nfs4_init_client, + .secinfo = nfs4_proc_secinfo, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0cf560f7788..dddfb5795d7 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -46,6 +46,7 @@ #include <linux/kdev_t.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/msg_prot.h> +#include <linux/sunrpc/gss_api.h> #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> @@ -112,7 +113,7 @@ static int nfs4_stat_to_errno(int); #define encode_restorefh_maxsz (op_encode_hdr_maxsz) #define decode_restorefh_maxsz (op_decode_hdr_maxsz) #define encode_fsinfo_maxsz (encode_getattr_maxsz) -#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) #define encode_setclientid_maxsz \ @@ -253,6 +254,8 @@ static int nfs4_stat_to_errno(int); (encode_getattr_maxsz) #define decode_fs_locations_maxsz \ (0) +#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) +#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 4 + (NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN))) #if defined(CONFIG_NFS_V4_1) #define NFS4_MAX_MACHINE_NAME_LEN (64) @@ -324,6 +327,18 @@ static int nfs4_stat_to_errno(int); #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ decode_stateid_maxsz + \ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) +#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + 1 /* reclaim */ + \ + encode_stateid_maxsz + \ + 1 /* new offset (true) */ + \ + 2 /* last byte written */ + \ + 1 /* nt_timechanged (false) */ + \ + 1 /* layoutupdate4 layout type */ + \ + 1 /* NULL filelayout layoutupdate4 payload */) +#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) + #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -676,6 +691,14 @@ static int nfs4_stat_to_errno(int); decode_putfh_maxsz + \ decode_lookup_maxsz + \ decode_fs_locations_maxsz) +#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_secinfo_maxsz) +#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_secinfo_maxsz) #if defined(CONFIG_NFS_V4_1) #define NFS4_enc_exchange_id_sz \ (compound_encode_hdr_maxsz + \ @@ -727,6 +750,17 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_layoutget_maxsz) +#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_putfh_maxsz + \ + encode_layoutcommit_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutcommit_maxsz + \ + decode_getattr_maxsz) + const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -1620,6 +1654,18 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state hdr->replen += decode_delegreturn_maxsz; } +static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) +{ + int len = name->len; + __be32 *p; + + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_SECINFO); + xdr_encode_opaque(p, name->name, len); + hdr->nops++; + hdr->replen += decode_secinfo_maxsz; +} + #if defined(CONFIG_NFS_V4_1) /* NFSv4.1 operations */ static void encode_exchange_id(struct xdr_stream *xdr, @@ -1816,6 +1862,34 @@ encode_layoutget(struct xdr_stream *xdr, hdr->nops++; hdr->replen += decode_layoutget_maxsz; } + +static int +encode_layoutcommit(struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, + NFS_SERVER(args->inode)->pnfs_curr_ld->id); + + p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); + /* Only whole file layouts */ + p = xdr_encode_hyper(p, 0); /* offset */ + p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */ + *p++ = cpu_to_be32(0); /* reclaim */ + p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ + *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ + *p++ = cpu_to_be32(0); /* no file layout payload */ + + hdr->nops++; + hdr->replen += decode_layoutcommit_maxsz; + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2294,7 +2368,8 @@ static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); encode_commit(xdr, args, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2465,6 +2540,24 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode SECINFO request + */ +static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_secinfo_arg *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->dir_fh, &hdr); + encode_secinfo(xdr, args->name, &hdr); + encode_nops(&hdr); +} + #if defined(CONFIG_NFS_V4_1) /* * EXCHANGE_ID request @@ -2604,8 +2697,32 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, NFS_FH(args->inode), &hdr); encode_layoutget(xdr, args, &hdr); + + xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, + args->layout.pages, 0, args->layout.pglen); + encode_nops(&hdr); } + +/* + * Encode LAYOUTCOMMIT request + */ +static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + encode_layoutcommit(xdr, args, &hdr); + encode_getfattr(xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -2925,6 +3042,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) if (unlikely(!p)) goto out_overflow; bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; + return -be32_to_cpup(p); } return 0; out_overflow: @@ -3912,6 +4030,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, fattr->valid |= status; status = decode_attr_error(xdr, bitmap); + if (status == -NFS4ERR_WRONGSEC) { + nfs_fixup_secinfo_attributes(fattr, fh); + status = 0; + } if (status < 0) goto xdr_error; @@ -4680,6 +4802,73 @@ static int decode_delegreturn(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_DELEGRETURN); } +static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + flavor->gss.sec_oid4.len = be32_to_cpup(p); + if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN) + goto out_err; + + p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len); + if (unlikely(!p)) + goto out_overflow; + memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len); + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + flavor->gss.qop4 = be32_to_cpup(p++); + flavor->gss.service = be32_to_cpup(p); + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +out_err: + return -EINVAL; +} + +static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) +{ + struct nfs4_secinfo_flavor *sec_flavor; + int status; + __be32 *p; + int i; + + status = decode_op_hdr(xdr, OP_SECINFO); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->flavors->num_flavors = be32_to_cpup(p); + + for (i = 0; i < res->flavors->num_flavors; i++) { + sec_flavor = &res->flavors->flavors[i]; + if ((char *)&sec_flavor[1] - (char *)res > PAGE_SIZE) + break; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sec_flavor->flavor = be32_to_cpup(p); + + if (sec_flavor->flavor == RPC_AUTH_GSS) { + if (decode_secinfo_gss(xdr, sec_flavor)) + break; + } + } + + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + #if defined(CONFIG_NFS_V4_1) static int decode_exchange_id(struct xdr_stream *xdr, struct nfs41_exchange_id_res *res) @@ -4950,6 +5139,9 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, __be32 *p; int status; u32 layout_count; + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + struct kvec *iov = rcvbuf->head; + u32 hdrlen, recvd; status = decode_op_hdr(xdr, OP_LAYOUTGET); if (status) @@ -4966,17 +5158,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, return -EINVAL; } - p = xdr_inline_decode(xdr, 24); + p = xdr_inline_decode(xdr, 28); if (unlikely(!p)) goto out_overflow; p = xdr_decode_hyper(p, &res->range.offset); p = xdr_decode_hyper(p, &res->range.length); res->range.iomode = be32_to_cpup(p++); res->type = be32_to_cpup(p++); - - status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p); - if (unlikely(status)) - return status; + res->layoutp->len = be32_to_cpup(p); dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", __func__, @@ -4984,12 +5173,18 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, (unsigned long)res->range.length, res->range.iomode, res->type, - res->layout.len); + res->layoutp->len); - /* nfs4_proc_layoutget allocated a single page */ - if (res->layout.len > PAGE_SIZE) - return -ENOMEM; - memcpy(res->layout.buf, p, res->layout.len); + hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (res->layoutp->len > recvd) { + dprintk("NFS: server cheating in layoutget reply: " + "layout len %u > recvd %u\n", + res->layoutp->len, recvd); + return -EINVAL; + } + + xdr_read_pages(xdr, res->layoutp->len); if (layout_count > 1) { /* We only handle a length one array at the moment. Any @@ -5006,6 +5201,35 @@ out_overflow: print_overflow_msg(__func__, xdr); return -EIO; } + +static int decode_layoutcommit(struct xdr_stream *xdr, + struct rpc_rqst *req, + struct nfs4_layoutcommit_res *res) +{ + __be32 *p; + __u32 sizechanged; + int status; + + status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + sizechanged = be32_to_cpup(p); + + if (sizechanged) { + /* throw away new size */ + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -5723,8 +5947,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_commit(xdr, res); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5919,6 +6144,32 @@ out: return status; } +/* + * Decode SECINFO response + */ +static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_secinfo_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_secinfo(xdr, res); + if (status) + goto out; +out: + return status; +} + #if defined(CONFIG_NFS_V4_1) /* * Decode EXCHANGE_ID response @@ -6066,6 +6317,34 @@ static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode LAYOUTCOMMIT response + */ +static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs4_layoutcommit_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_layoutcommit(xdr, rqstp, res); + if (status) + goto out; + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ /** @@ -6180,10 +6459,6 @@ static struct { { NFS4ERR_SYMLINK, -ELOOP }, { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, { NFS4ERR_DEADLOCK, -EDEADLK }, - { NFS4ERR_WRONGSEC, -EPERM }, /* FIXME: this needs - * to be handled by a - * middle-layer. - */ { -1, -EIO } }; @@ -6258,6 +6533,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SETACL, enc_setacl, dec_setacl), PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + PROC(SECINFO, enc_secinfo, dec_secinfo), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), PROC(CREATE_SESSION, enc_create_session, dec_create_session), @@ -6267,6 +6543,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), PROC(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 23e79441066..87a593c2b05 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -223,6 +223,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_count = 0; desc->pg_bsize = bsize; desc->pg_base = 0; + desc->pg_moreio = 0; desc->pg_inode = inode; desc->pg_doio = doio; desc->pg_ioflags = io_flags; @@ -335,9 +336,11 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { while (!nfs_pageio_do_add_request(desc, req)) { + desc->pg_moreio = 1; nfs_pageio_doio(desc); if (desc->pg_error < 0) return 0; + desc->pg_moreio = 0; } return 1; } @@ -395,6 +398,7 @@ int nfs_scan_list(struct nfs_inode *nfsi, pgoff_t idx_end; int found, i; int res; + struct list_head *list; res = 0; if (npages == 0) @@ -415,10 +419,10 @@ int nfs_scan_list(struct nfs_inode *nfsi, idx_start = req->wb_index + 1; if (nfs_set_page_tag_locked(req)) { kref_get(&req->wb_kref); - nfs_list_remove_request(req); radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, tag); - nfs_list_add_request(req, dst); + list = pnfs_choose_commit_list(req, dst); + nfs_list_add_request(req, list); res++; if (res == INT_MAX) goto out; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index f38813a0a29..d9ab97269ce 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -259,6 +259,7 @@ put_lseg(struct pnfs_layout_segment *lseg) pnfs_free_lseg_list(&free_me); } } +EXPORT_SYMBOL_GPL(put_lseg); static bool should_free_lseg(u32 lseg_iomode, u32 recall_iomode) @@ -471,6 +472,9 @@ send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; struct pnfs_layout_segment *lseg = NULL; + struct page **pages = NULL; + int i; + u32 max_resp_sz, max_pages; dprintk("--> %s\n", __func__); @@ -478,6 +482,21 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); if (lgp == NULL) return NULL; + + /* allocate pages for xdr post processing */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_err_free; + + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_err_free; + } + lgp->args.minlength = NFS4_MAX_UINT64; lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.range.iomode = iomode; @@ -486,6 +505,8 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); + lgp->args.layout.pages = pages; + lgp->args.layout.pglen = max_pages * PAGE_SIZE; lgp->lsegpp = &lseg; /* Synchronously retrieve layout information from server and @@ -496,7 +517,26 @@ send_layoutget(struct pnfs_layout_hdr *lo, /* remember that LAYOUTGET failed and suspend trying */ set_bit(lo_fail_bit(iomode), &lo->plh_flags); } + + /* free xdr pages */ + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); + return lseg; + +out_err_free: + /* free any allocated xdr pages, lgp as it's not used */ + if (pages) { + for (i = 0; i < max_pages; i++) { + if (!pages[i]) + break; + __free_page(pages[i]); + } + kfree(pages); + } + kfree(lgp); + return NULL; } bool pnfs_roc(struct inode *ino) @@ -945,3 +985,105 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); return trypnfs; } + +/* + * Currently there is only one (whole file) write lseg. + */ +static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode) +{ + struct pnfs_layout_segment *lseg, *rv = NULL; + + list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + rv = lseg; + return rv; +} + +void +pnfs_set_layoutcommit(struct nfs_write_data *wdata) +{ + struct nfs_inode *nfsi = NFS_I(wdata->inode); + loff_t end_pos = wdata->args.offset + wdata->res.count; + + spin_lock(&nfsi->vfs_inode.i_lock); + if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + /* references matched in nfs4_layoutcommit_release */ + get_lseg(wdata->lseg); + wdata->lseg->pls_lc_cred = + get_rpccred(wdata->args.context->state->owner->so_cred); + mark_inode_dirty_sync(wdata->inode); + dprintk("%s: Set layoutcommit for inode %lu ", + __func__, wdata->inode->i_ino); + } + if (end_pos > wdata->lseg->pls_end_pos) + wdata->lseg->pls_end_pos = end_pos; + spin_unlock(&nfsi->vfs_inode.i_lock); +} +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); + +/* + * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and + * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough + * data to disk to allow the server to recover the data if it crashes. + * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag + * is off, and a COMMIT is sent to a data server, or + * if WRITEs to a data server return NFS_DATA_SYNC. + */ +int +pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + struct nfs4_layoutcommit_data *data; + struct nfs_inode *nfsi = NFS_I(inode); + struct pnfs_layout_segment *lseg; + struct rpc_cred *cred; + loff_t end_pos; + int status = 0; + + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + + if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + return 0; + + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) { + mark_inode_dirty_sync(inode); + status = -ENOMEM; + goto out; + } + + spin_lock(&inode->i_lock); + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + spin_unlock(&inode->i_lock); + kfree(data); + goto out; + } + /* + * Currently only one (whole file) write lseg which is referenced + * in pnfs_set_layoutcommit and will be found. + */ + lseg = pnfs_list_write_lseg(inode); + + end_pos = lseg->pls_end_pos; + cred = lseg->pls_lc_cred; + lseg->pls_end_pos = 0; + lseg->pls_lc_cred = NULL; + + memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data, + sizeof(nfsi->layout->plh_stateid.data)); + spin_unlock(&inode->i_lock); + + data->args.inode = inode; + data->lseg = lseg; + data->cred = cred; + nfs_fattr_init(&data->fattr); + data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; + data->res.fattr = &data->fattr; + data->args.lastbytewritten = end_pos - 1; + data->res.server = NFS_SERVER(inode); + + status = nfs4_proc_layoutcommit(data, sync); +out: + dprintk("<-- %s status %d\n", __func__, status); + return status; +} diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 6380b9405bc..bc4827202e7 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -43,6 +43,8 @@ struct pnfs_layout_segment { atomic_t pls_refcount; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; + struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */ + loff_t pls_end_pos; /* LAYOUTCOMMIT write end */ }; enum pnfs_try_status { @@ -74,6 +76,13 @@ struct pnfs_layoutdriver_type { /* test for nfs page cache coalescing */ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + /* Returns true if layoutdriver wants to divert this request to + * driver's commit routine. + */ + bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg); + struct list_head * (*choose_commit_list) (struct nfs_page *req); + int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how); + /* * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS @@ -100,7 +109,6 @@ struct pnfs_device { unsigned int layout_type; unsigned int mincount; struct page **pages; - void *area; unsigned int pgbase; unsigned int pglen; }; @@ -145,7 +153,8 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier); - +void pnfs_set_layoutcommit(struct nfs_write_data *wdata); +int pnfs_layoutcommit_inode(struct inode *inode, bool sync); static inline int lo_fail_bit(u32 iomode) { @@ -169,6 +178,51 @@ static inline int pnfs_enabled_sb(struct nfs_server *nfss) return nfss->pnfs_curr_ld != NULL; } +static inline void +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ + if (lseg) { + struct pnfs_layoutdriver_type *ld; + + ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld; + if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) { + set_bit(PG_PNFS_COMMIT, &req->wb_flags); + req->wb_commit_lseg = get_lseg(lseg); + } + } +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +{ + if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags)) + return PNFS_NOT_ATTEMPTED; + return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how); +} + +static inline struct list_head * +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +{ + struct list_head *rv; + + if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) { + struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode; + + set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags); + rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req); + /* matched by ref taken when PG_PNFS_COMMIT is set */ + put_lseg(req->wb_commit_lseg); + } else + rv = mds; + return rv; +} + +static inline void pnfs_clear_request_commit(struct nfs_page *req) +{ + if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) + put_lseg(req->wb_commit_lseg); +} + #else /* CONFIG_NFS_V4_1 */ static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) @@ -252,6 +306,31 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino) pgio->pg_test = NULL; } +static inline void +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) +{ +} + +static inline int +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline struct list_head * +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) +{ + return mds; +} + +static inline void pnfs_clear_request_commit(struct nfs_page *req) +{ +} + +static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b8ec170f2a0..ac40b8535d7 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -177,7 +177,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs_proc_lookup(struct inode *dir, struct qstr *name, +nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { struct nfs_diropargs arg = { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 47a3ad63e0d..85d75254328 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -59,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void) } return p; } +EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); void nfs_commit_free(struct nfs_write_data *p) { @@ -66,6 +67,7 @@ void nfs_commit_free(struct nfs_write_data *p) kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } +EXPORT_SYMBOL_GPL(nfs_commit_free); struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { @@ -179,8 +181,8 @@ static int wb_priority(struct writeback_control *wbc) if (wbc->for_reclaim) return FLUSH_HIGHPRI | FLUSH_STABLE; if (wbc->for_kupdate || wbc->for_background) - return FLUSH_LOWPRI; - return 0; + return FLUSH_LOWPRI | FLUSH_COND_STABLE; + return FLUSH_COND_STABLE; } /* @@ -441,7 +443,7 @@ nfs_mark_request_dirty(struct nfs_page *req) * Add a request to the inode's commit list. */ static void -nfs_mark_request_commit(struct nfs_page *req) +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { struct inode *inode = req->wb_context->path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); @@ -453,6 +455,7 @@ nfs_mark_request_commit(struct nfs_page *req) NFS_PAGE_TAG_COMMIT); nfsi->ncommit++; spin_unlock(&inode->i_lock); + pnfs_mark_request_commit(req, lseg); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); @@ -474,14 +477,18 @@ nfs_clear_request_commit(struct nfs_page *req) static inline int nfs_write_need_commit(struct nfs_write_data *data) { - return data->verf.committed != NFS_FILE_SYNC; + if (data->verf.committed == NFS_DATA_SYNC) + return data->lseg == NULL; + else + return data->verf.committed != NFS_FILE_SYNC; } static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) { if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { - nfs_mark_request_commit(req); + nfs_mark_request_commit(req, data->lseg); return 1; } if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { @@ -492,7 +499,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req) } #else static inline void -nfs_mark_request_commit(struct nfs_page *req) +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg) { } @@ -509,7 +516,8 @@ int nfs_write_need_commit(struct nfs_write_data *data) } static inline -int nfs_reschedule_unstable_write(struct nfs_page *req) +int nfs_reschedule_unstable_write(struct nfs_page *req, + struct nfs_write_data *data) { return 0; } @@ -612,9 +620,11 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, } if (nfs_clear_request_commit(req) && - radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, - req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) { NFS_I(inode)->ncommit--; + pnfs_clear_request_commit(req); + } /* Okay, the request matches. Update the region */ if (offset < req->wb_offset) { @@ -762,11 +772,12 @@ int nfs_updatepage(struct file *file, struct page *page, return status; } -static void nfs_writepage_release(struct nfs_page *req) +static void nfs_writepage_release(struct nfs_page *req, + struct nfs_write_data *data) { struct page *page = req->wb_page; - if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data)) nfs_inode_remove_request(req); nfs_clear_page_tag_locked(req); nfs_end_page_writeback(page); @@ -863,7 +874,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.context = get_nfs_open_context(req->wb_context); data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; - if (how & FLUSH_STABLE) { + if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { data->args.stable = NFS_DATA_SYNC; if (!nfs_need_commit(NFS_I(inode))) data->args.stable = NFS_FILE_SYNC; @@ -912,6 +923,12 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) nfs_list_remove_request(req); + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit || + desc->pg_count > wsize)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + + nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -1002,6 +1019,10 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) if ((!lseg) && list_is_singular(&data->pages)) lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); + if ((desc->pg_ioflags & FLUSH_COND_STABLE) && + (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) + desc->pg_ioflags &= ~FLUSH_COND_STABLE; + /* Set up the argument struct */ ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); out: @@ -1074,7 +1095,7 @@ static void nfs_writeback_release_partial(void *calldata) out: if (atomic_dec_and_test(&req->wb_complete)) - nfs_writepage_release(req); + nfs_writepage_release(req, data); nfs_writedata_release(calldata); } @@ -1141,7 +1162,7 @@ static void nfs_writeback_release_full(void *calldata) if (nfs_write_need_commit(data)) { memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - nfs_mark_request_commit(req); + nfs_mark_request_commit(req, data->lseg); dprintk(" marked for commit\n"); goto next; } @@ -1251,57 +1272,82 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) { + int ret; + if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) return 1; - if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags, - NFS_INO_COMMIT, nfs_wait_bit_killable, - TASK_KILLABLE)) - return 1; - return 0; + if (!may_wait) + return 0; + ret = out_of_line_wait_on_bit_lock(&nfsi->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + return (ret < 0) ? ret : 1; } -static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +void nfs_commit_clear_lock(struct nfs_inode *nfsi) { clear_bit(NFS_INO_COMMIT, &nfsi->flags); smp_mb__after_clear_bit(); wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); } +EXPORT_SYMBOL_GPL(nfs_commit_clear_lock); - -static void nfs_commitdata_release(void *data) +void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; + put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); nfs_commit_free(wdata); } +EXPORT_SYMBOL_GPL(nfs_commitdata_release); -/* - * Set up the argument/result storage required for the RPC call. - */ -static int nfs_commit_rpcsetup(struct list_head *head, - struct nfs_write_data *data, - int how) +int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) { - struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = first->wb_context->path.dentry->d_inode; - int priority = flush_task_priority(how); struct rpc_task *task; + int priority = flush_task_priority(how); struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = first->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { .task = &data->task, - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .rpc_message = &msg, - .callback_ops = &nfs_commit_ops, + .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, .priority = priority, }; + /* Set up the initial task struct. */ + NFS_PROTO(data->inode)->commit_setup(data, &msg); + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs_initiate_commit); + +/* + * Set up the argument/result storage required for the RPC call. + */ +void nfs_init_commit(struct nfs_write_data *data, + struct list_head *head, + struct pnfs_layout_segment *lseg) +{ + struct nfs_page *first = nfs_list_entry(head->next); + struct inode *inode = first->wb_context->path.dentry->d_inode; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -1309,7 +1355,9 @@ static int nfs_commit_rpcsetup(struct list_head *head, list_splice_init(head, &data->pages); data->inode = inode; - data->cred = msg.rpc_cred; + data->cred = first->wb_context->cred; + data->lseg = lseg; /* reference transferred */ + data->mds_ops = &nfs_commit_ops; data->args.fh = NFS_FH(data->inode); /* Note: we always request a commit of the entire inode */ @@ -1320,20 +1368,25 @@ static int nfs_commit_rpcsetup(struct list_head *head, data->res.fattr = &data->fattr; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); +} +EXPORT_SYMBOL_GPL(nfs_init_commit); - /* Set up the initial task struct. */ - NFS_PROTO(inode)->commit_setup(data, &msg); - - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +void nfs_retry_commit(struct list_head *page_list, + struct pnfs_layout_segment *lseg) +{ + struct nfs_page *req; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - if (how & FLUSH_SYNC) - rpc_wait_for_completion_task(task); - rpc_put_task(task); - return 0; + while (!list_empty(page_list)) { + req = nfs_list_entry(page_list->next); + nfs_list_remove_request(req); + nfs_mark_request_commit(req, lseg); + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); + nfs_clear_page_tag_locked(req); + } } +EXPORT_SYMBOL_GPL(nfs_retry_commit); /* * Commit dirty pages @@ -1342,7 +1395,6 @@ static int nfs_commit_list(struct inode *inode, struct list_head *head, int how) { struct nfs_write_data *data; - struct nfs_page *req; data = nfs_commitdata_alloc(); @@ -1350,17 +1402,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) goto out_bad; /* Set up the argument struct */ - return nfs_commit_rpcsetup(head, data, how); + nfs_init_commit(data, head, NULL); + return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how); out_bad: - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_mark_request_commit(req); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); - nfs_clear_page_tag_locked(req); - } + nfs_retry_commit(head, NULL); nfs_commit_clear_lock(NFS_I(inode)); return -ENOMEM; } @@ -1380,10 +1425,9 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) return; } -static void nfs_commit_release(void *calldata) +void nfs_commit_release_pages(struct nfs_write_data *data) { - struct nfs_write_data *data = calldata; - struct nfs_page *req; + struct nfs_page *req; int status = data->task.tk_status; while (!list_empty(&data->pages)) { @@ -1417,6 +1461,14 @@ static void nfs_commit_release(void *calldata) next: nfs_clear_page_tag_locked(req); } +} +EXPORT_SYMBOL_GPL(nfs_commit_release_pages); + +static void nfs_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + + nfs_commit_release_pages(data); nfs_commit_clear_lock(NFS_I(data->inode)); nfs_commitdata_release(calldata); } @@ -1433,23 +1485,30 @@ int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); int may_wait = how & FLUSH_SYNC; - int res = 0; + int res; - if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + res = nfs_commit_set_lock(NFS_I(inode), may_wait); + if (res <= 0) goto out_mark_dirty; spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&inode->i_lock); if (res) { - int error = nfs_commit_list(inode, &head, how); + int error; + + error = pnfs_commit_list(inode, &head, how); + if (error == PNFS_NOT_ATTEMPTED) + error = nfs_commit_list(inode, &head, how); if (error < 0) return error; - if (may_wait) - wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - else + if (!may_wait) goto out_mark_dirty; + error = wait_on_bit(&NFS_I(inode)->flags, + NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + if (error < 0) + return error; } else nfs_commit_clear_lock(NFS_I(inode)); return res; @@ -1503,7 +1562,22 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { - return nfs_commit_unstable_pages(inode, wbc); + int ret; + + ret = nfs_commit_unstable_pages(inode, wbc); + if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { + int status; + bool sync = true; + + if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking || + wbc->for_background) + sync = false; + + status = pnfs_layoutcommit_inode(inode, sync); + if (status < 0) + return status; + } + return ret; } /* diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index 84c27d69d42..ec0f277be7f 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -117,7 +117,6 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, * invoked in contexts where a memory allocation failure is * fatal. Fortunately this fake ACL is small enough to * construct on the stack. */ - memset(acl2, 0, sizeof(acl2)); posix_acl_init(acl2, 4); /* Insert entries in canonical order: other orders seem diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 4c29fcf557d..07ea8d3e6ea 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -22,13 +22,14 @@ #include <linux/module.h> #include <linux/mutex.h> #include <linux/spinlock.h> -#include <linux/writeback.h> /* for inode_lock */ #include <asm/atomic.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" +#include "../internal.h" + /* * Recalculate the mask of events relevant to a given inode locked. */ @@ -237,15 +238,14 @@ out: * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @list: list of inodes being unmounted (sb->s_inodes) * - * Called with inode_lock held, protecting the unmounting super block's list - * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. - * We temporarily drop inode_lock, however, and CAN block. + * Called during unmount with no locks held, so needs to be safe against + * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. */ void fsnotify_unmount_inodes(struct list_head *list) { struct inode *inode, *next_i, *need_iput = NULL; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry_safe(inode, next_i, list, i_sb_list) { struct inode *need_iput_tmp; @@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list) * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); continue; + } /* * If i_count is zero, the inode cannot have any watches and @@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list) * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. */ - if (!atomic_read(&inode->i_count)) + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); continue; + } need_iput_tmp = need_iput; need_iput = NULL; @@ -274,22 +279,25 @@ void fsnotify_unmount_inodes(struct list_head *list) __iget(inode); else need_iput_tmp = NULL; + spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. */ if ((&next_i->i_sb_list != list) && - atomic_read(&next_i->i_count) && - !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { - __iget(next_i); - need_iput = next_i; + atomic_read(&next_i->i_count)) { + spin_lock(&next_i->i_lock); + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + spin_unlock(&next_i->i_lock); } /* - * We can safely drop inode_lock here because we hold + * We can safely drop inode_sb_list_lock here because we hold * references on both inode and next_i. Also no new inodes - * will be added since the umount has begun. Finally, - * iprune_mutex keeps shrink_icache_memory() away. + * will be added since the umount has begun. */ - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); if (need_iput_tmp) iput(need_iput_tmp); @@ -301,7 +309,7 @@ void fsnotify_unmount_inodes(struct list_head *list) iput(inode); - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 325185e514b..50c00856f73 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -91,7 +91,6 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/srcu.h> -#include <linux/writeback.h> /* for inode_lock */ #include <asm/atomic.h> diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 85eebff6d0d..e86577d6c5c 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -23,7 +23,6 @@ #include <linux/mount.h> #include <linux/mutex.h> #include <linux/spinlock.h> -#include <linux/writeback.h> /* for inode_lock */ #include <asm/atomic.h> diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index a627ed82c0a..0b56c6b7ec0 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -54,7 +54,7 @@ * * Return 1 if the attributes match and 0 if not. * - * NOTE: This function runs with the inode_lock spin lock held so it is not + * NOTE: This function runs with the inode->i_lock spin lock held so it is not * allowed to sleep. */ int ntfs_test_inode(struct inode *vi, ntfs_attr *na) @@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na) * * Return 0 on success and -errno on error. * - * NOTE: This function runs with the inode_lock spin lock held so it is not + * NOTE: This function runs with the inode->i_lock spin lock held so it is not * allowed to sleep. (Hence the GFP_ATOMIC allocation.) */ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7c708a418ac..2e7addfd980 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -182,7 +182,8 @@ static void m_stop(struct seq_file *m, void *v) struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - vma_stop(priv, vma); + if (!IS_ERR(vma)) + vma_stop(priv, vma); if (priv->task) put_task_struct(priv->task); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a2a622e079f..fcc8ae75d87 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -76,7 +76,7 @@ #include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/quotaops.h> -#include <linux/writeback.h> /* for inode_lock, oddly enough.. */ +#include "../internal.h" /* ugh */ #include <asm/uaccess.h> @@ -900,33 +900,38 @@ static void add_dquot_ref(struct super_block *sb, int type) int reserved = 0; #endif - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + !atomic_read(&inode->i_writecount) || + !dqinit_needed(inode, type)) { + spin_unlock(&inode->i_lock); continue; + } #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif - if (!atomic_read(&inode->i_writecount)) - continue; - if (!dqinit_needed(inode, type)) - continue; - __iget(inode); - spin_unlock(&inode_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_sb_list_lock); iput(old_inode); __dquot_initialize(inode, type); - /* We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the inode_lock. - * We cannot iput the inode now as we can be holding the last - * reference and we cannot iput it under inode_lock. So we - * keep the reference and iput it later. */ + + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * inode_sb_list_lock We cannot iput the inode now as we can be + * holding the last reference and we cannot iput it under + * inode_sb_list_lock. So we keep the reference and iput it + * later. + */ old_inode = inode; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); iput(old_inode); #ifdef CONFIG_QUOTA_DEBUG @@ -1007,7 +1012,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, struct inode *inode; int reserved = 0; - spin_lock(&inode_lock); + spin_lock(&inode_sb_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already @@ -1021,7 +1026,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, remove_inode_dquot_ref(inode, type, tofree_head); } } - spin_unlock(&inode_lock); + spin_unlock(&inode_sb_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" |