diff options
Diffstat (limited to 'fs/inode.c')
| -rw-r--r-- | fs/inode.c | 596 |
1 files changed, 384 insertions, 212 deletions
diff --git a/fs/inode.c b/fs/inode.c index fb10d86ffad..6eecb7ff0b9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2,31 +2,22 @@ * (C) 1997 Linus Torvalds * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation) */ +#include <linux/export.h> #include <linux/fs.h> #include <linux/mm.h> -#include <linux/dcache.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/writeback.h> -#include <linux/module.h> #include <linux/backing-dev.h> -#include <linux/wait.h> -#include <linux/rwsem.h> #include <linux/hash.h> #include <linux/swap.h> #include <linux/security.h> -#include <linux/pagemap.h> #include <linux/cdev.h> #include <linux/bootmem.h> #include <linux/fsnotify.h> #include <linux/mount.h> -#include <linux/async.h> #include <linux/posix_acl.h> #include <linux/prefetch.h> -#include <linux/ima.h> -#include <linux/cred.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> +#include <linux/list_lru.h> #include "internal.h" /* @@ -34,7 +25,7 @@ * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget() - * inode->i_sb->s_inode_lru_lock protects: + * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list @@ -47,7 +38,7 @@ * * inode_sb_list_lock * inode->i_lock - * inode->i_sb->s_inode_lru_lock + * Inode LRU list locks * * bdi->wb.list_lock * inode->i_lock @@ -80,33 +71,33 @@ EXPORT_SYMBOL(empty_aops); */ struct inodes_stat_t inodes_stat; -static DEFINE_PER_CPU(unsigned int, nr_inodes); -static DEFINE_PER_CPU(unsigned int, nr_unused); +static DEFINE_PER_CPU(unsigned long, nr_inodes); +static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __read_mostly; -static int get_nr_inodes(void) +static long get_nr_inodes(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } -static inline int get_nr_inodes_unused(void) +static inline long get_nr_inodes_unused(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } -int get_nr_dirty_inodes(void) +long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ - int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } @@ -114,12 +105,12 @@ int get_nr_dirty_inodes(void) * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL -int proc_nr_inodes(ctl_table *table, int write, +int proc_nr_inodes(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif @@ -145,8 +136,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fop = &empty_fops; inode->__i_nlink = 1; inode->i_opflags = 0; - inode->i_uid = 0; - inode->i_gid = 0; + i_uid_write(inode, 0); + i_gid_write(inode, 0); atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_blocks = 0; @@ -175,7 +166,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); - mapping->assoc_mapping = NULL; + mapping->private_data = NULL; mapping->backing_dev_info = &default_backing_dev_info; mapping->writeback_index = 0; @@ -192,7 +183,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) } inode->i_private = NULL; inode->i_mapping = mapping; - INIT_LIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ + INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; #endif @@ -343,8 +334,10 @@ EXPORT_SYMBOL(set_nlink); */ void inc_nlink(struct inode *inode) { - if (WARN_ON(inode->i_nlink == 0)) + if (unlikely(inode->i_nlink == 0)) { + WARN_ON(!(inode->i_state & I_LINKABLE)); atomic_long_dec(&inode->i_sb->s_remove_count); + } inode->__i_nlink++; } @@ -358,7 +351,7 @@ void address_space_init_once(struct address_space *mapping) mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + mapping->i_mmap = RB_ROOT; INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); @@ -409,24 +402,28 @@ EXPORT_SYMBOL(ihold); static void inode_lru_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_lru_lock); - if (list_empty(&inode->i_lru)) { - list_add(&inode->i_lru, &inode->i_sb->s_inode_lru); - inode->i_sb->s_nr_inodes_unused++; + if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); - } - spin_unlock(&inode->i_sb->s_inode_lru_lock); } +/* + * Add inode to LRU if needed (inode is unused and clean). + * + * Needs inode->i_lock held. + */ +void inode_add_lru(struct inode *inode) +{ + if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && + !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) + inode_lru_list_add(inode); +} + + static void inode_lru_list_del(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_lru_lock); - if (!list_empty(&inode->i_lru)) { - list_del_init(&inode->i_lru); - inode->i_sb->s_nr_inodes_unused--; + + if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); - } - spin_unlock(&inode->i_sb->s_inode_lru_lock); } /** @@ -496,7 +493,7 @@ void __remove_inode_hash(struct inode *inode) } EXPORT_SYMBOL(__remove_inode_hash); -void end_writeback(struct inode *inode) +void clear_inode(struct inode *inode) { might_sleep(); /* @@ -506,15 +503,15 @@ void end_writeback(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); + BUG_ON(inode->i_data.nrshadows); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); - inode_sync_wait(inode); /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } -EXPORT_SYMBOL(end_writeback); +EXPORT_SYMBOL(clear_inode); /* * Free the inode passed in, removing it from the lists it is still connected @@ -541,12 +538,19 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); + /* + * Wait for flusher thread to be done with the inode so that filesystem + * does not start destroying it while writeback is still running. Since + * the inode has I_FREEING set, flusher thread won't start new work on + * the inode. We just have to wait for running writeback to finish. + */ + inode_wait_for_writeback(inode); + if (op->evict_inode) { op->evict_inode(inode); } else { - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); - end_writeback(inode); + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); } if (S_ISBLK(inode->i_mode) && inode->i_bdev) bd_forget(inode); @@ -663,24 +667,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) return busy; } -static int can_unuse(struct inode *inode) -{ - if (inode->i_state & ~I_REFERENCED) - return 0; - if (inode_has_buffers(inode)) - return 0; - if (atomic_read(&inode->i_count)) - return 0; - if (inode->i_data.nrpages) - return 0; - return 1; -} - /* - * Walk the superblock inode LRU for freeable inodes and attempt to free them. - * This is called from the superblock shrinker function with a number of inodes - * to trim from the LRU. Inodes to be freed are moved to a temporary list and - * then are freed outside inode_lock by dispose_list(). + * Isolate the inode from the LRU in preparation for freeing it. * * Any inodes which are pinned purely because of attached pagecache have their * pagecache removed. If the inode has metadata buffers attached to @@ -694,89 +682,82 @@ static int can_unuse(struct inode *inode) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ -void prune_icache_sb(struct super_block *sb, int nr_to_scan) +static enum lru_status +inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) { - LIST_HEAD(freeable); - int nr_scanned; - unsigned long reap = 0; + struct list_head *freeable = arg; + struct inode *inode = container_of(item, struct inode, i_lru); - spin_lock(&sb->s_inode_lru_lock); - for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { - struct inode *inode; + /* + * we are inverting the lru lock/inode->i_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; - if (list_empty(&sb->s_inode_lru)) - break; + /* + * Referenced or dirty inodes are still in use. Give them another pass + * through the LRU as we canot reclaim them now. + */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_lru); + spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; + } - inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + spin_unlock(&inode->i_lock); + return LRU_ROTATE; + } - /* - * we are inverting the sb->s_inode_lru_lock/inode->i_lock here, - * so use a trylock. If we fail to get the lock, just move the - * inode to the back of the list so we don't spin on it. - */ - if (!spin_trylock(&inode->i_lock)) { - list_move_tail(&inode->i_lru, &sb->s_inode_lru); - continue; + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lru_lock); + if (remove_inode_buffers(inode)) { + unsigned long reap; + reap = invalidate_mapping_pages(&inode->i_data, 0, -1); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_INODESTEAL, reap); + else + __count_vm_events(PGINODESTEAL, reap); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; } + iput(inode); + spin_lock(lru_lock); + return LRU_RETRY; + } - /* - * Referenced or dirty inodes are still in use. Give them - * another pass through the LRU as we canot reclaim them now. - */ - if (atomic_read(&inode->i_count) || - (inode->i_state & ~I_REFERENCED)) { - list_del_init(&inode->i_lru); - spin_unlock(&inode->i_lock); - sb->s_nr_inodes_unused--; - this_cpu_dec(nr_unused); - continue; - } + WARN_ON(inode->i_state & I_NEW); + inode->i_state |= I_FREEING; + list_move(&inode->i_lru, freeable); + spin_unlock(&inode->i_lock); - /* recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { - inode->i_state &= ~I_REFERENCED; - list_move(&inode->i_lru, &sb->s_inode_lru); - spin_unlock(&inode->i_lock); - continue; - } - if (inode_has_buffers(inode) || inode->i_data.nrpages) { - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_lru_lock); - if (remove_inode_buffers(inode)) - reap += invalidate_mapping_pages(&inode->i_data, - 0, -1); - iput(inode); - spin_lock(&sb->s_inode_lru_lock); - - if (inode != list_entry(sb->s_inode_lru.next, - struct inode, i_lru)) - continue; /* wrong inode or list_empty */ - /* avoid lock inversions with trylock */ - if (!spin_trylock(&inode->i_lock)) - continue; - if (!can_unuse(inode)) { - spin_unlock(&inode->i_lock); - continue; - } - } - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; - spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; +} - list_move(&inode->i_lru, &freeable); - sb->s_nr_inodes_unused--; - this_cpu_dec(nr_unused); - } - if (current_is_kswapd()) - __count_vm_events(KSWAPD_INODESTEAL, reap); - else - __count_vm_events(PGINODESTEAL, reap); - spin_unlock(&sb->s_inode_lru_lock); - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += reap; +/* + * Walk the superblock inode LRU for freeable inodes and attempt to free them. + * This is called from the superblock shrinker function with a number of inodes + * to trim from the LRU. Inodes to be freed are moved to a temporary list and + * then are freed outside inode_lock by dispose_list(). + */ +long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid) +{ + LIST_HEAD(freeable); + long freed; + freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, + &freeable, &nr_to_scan); dispose_list(&freeable); + return freed; } static void __wait_on_freeing_inode(struct inode *inode); @@ -788,20 +769,15 @@ static struct inode *find_inode(struct super_block *sb, int (*test)(struct inode *, void *), void *data) { - struct hlist_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, node, head, i_hash) { - spin_lock(&inode->i_lock); - if (inode->i_sb != sb) { - spin_unlock(&inode->i_lock); + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_sb != sb) continue; - } - if (!test(inode, data)) { - spin_unlock(&inode->i_lock); + if (!test(inode, data)) continue; - } + spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; @@ -820,20 +796,15 @@ repeat: static struct inode *find_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) { - struct hlist_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, node, head, i_hash) { - spin_lock(&inode->i_lock); - if (inode->i_ino != ino) { - spin_unlock(&inode->i_lock); + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_ino != ino) continue; - } - if (inode->i_sb != sb) { - spin_unlock(&inode->i_lock); + if (inode->i_sb != sb) continue; - } + spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; @@ -938,8 +909,7 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode) struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ - if (!lockdep_match_class(&inode->i_mutex, - &type->i_mutex_key)) { + if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) { /* * ensure nobody is actually holding i_mutex */ @@ -966,12 +936,48 @@ void unlock_new_inode(struct inode *inode) spin_lock(&inode->i_lock); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW; + smp_mb(); wake_up_bit(&inode->i_state, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(unlock_new_inode); /** + * lock_two_nondirectories - take two i_mutexes on non-directory objects + * + * Lock any non-NULL argument that is not a directory. + * Zero, one or two objects may be locked by this function. + * + * @inode1: first inode to lock + * @inode2: second inode to lock + */ +void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) +{ + if (inode1 > inode2) + swap(inode1, inode2); + + if (inode1 && !S_ISDIR(inode1->i_mode)) + mutex_lock(&inode1->i_mutex); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); +} +EXPORT_SYMBOL(lock_two_nondirectories); + +/** + * unlock_two_nondirectories - release locks from lock_two_nondirectories() + * @inode1: first inode to unlock + * @inode2: second inode to unlock + */ +void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) +{ + if (inode1 && !S_ISDIR(inode1->i_mode)) + mutex_unlock(&inode1->i_mutex); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + mutex_unlock(&inode2->i_mutex); +} +EXPORT_SYMBOL(unlock_two_nondirectories); + +/** * iget5_locked - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get @@ -1122,11 +1128,10 @@ EXPORT_SYMBOL(iget_locked); static int test_inode_iunique(struct super_block *sb, unsigned long ino) { struct hlist_head *b = inode_hashtable + hash(sb, ino); - struct hlist_node *node; struct inode *inode; spin_lock(&inode_hash_lock); - hlist_for_each_entry(inode, node, b, i_hash) { + hlist_for_each_entry(inode, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb) { spin_unlock(&inode_hash_lock); return 0; @@ -1281,10 +1286,9 @@ int insert_inode_locked(struct inode *inode) struct hlist_head *head = inode_hashtable + hash(sb, ino); while (1) { - struct hlist_node *node; struct inode *old = NULL; spin_lock(&inode_hash_lock); - hlist_for_each_entry(old, node, head, i_hash) { + hlist_for_each_entry(old, head, i_hash) { if (old->i_ino != ino) continue; if (old->i_sb != sb) @@ -1296,7 +1300,7 @@ int insert_inode_locked(struct inode *inode) } break; } - if (likely(!node)) { + if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); @@ -1324,11 +1328,10 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(sb, hashval); while (1) { - struct hlist_node *node; struct inode *old = NULL; spin_lock(&inode_hash_lock); - hlist_for_each_entry(old, node, head, i_hash) { + hlist_for_each_entry(old, head, i_hash) { if (old->i_sb != sb) continue; if (!test(old, data)) @@ -1340,7 +1343,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, } break; } - if (likely(!node)) { + if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW; hlist_add_head(&inode->i_hash, head); @@ -1369,17 +1372,6 @@ int generic_delete_inode(struct inode *inode) EXPORT_SYMBOL(generic_delete_inode); /* - * Normal UNIX filesystem behaviour: delete the - * inode when the usage count drops to zero, and - * i_nlink is zero. - */ -int generic_drop_inode(struct inode *inode) -{ - return !inode->i_nlink || inode_unhashed(inode); -} -EXPORT_SYMBOL_GPL(generic_drop_inode); - -/* * Called when we're dropping the last reference * to an inode. * @@ -1404,8 +1396,7 @@ static void iput_final(struct inode *inode) if (!drop && (sb->s_flags & MS_ACTIVE)) { inode->i_state |= I_REFERENCED; - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - inode_lru_list_add(inode); + inode_add_lru(inode); spin_unlock(&inode->i_lock); return; } @@ -1501,18 +1492,39 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } +/* + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +static int update_time(struct inode *inode, struct timespec *time, int flags) +{ + if (inode->i_op->update_time) + return inode->i_op->update_time(inode, time, flags); + + if (flags & S_ATIME) + inode->i_atime = *time; + if (flags & S_VERSION) + inode_inc_iversion(inode); + if (flags & S_CTIME) + inode->i_ctime = *time; + if (flags & S_MTIME) + inode->i_mtime = *time; + mark_inode_dirty_sync(inode); + return 0; +} + /** * touch_atime - update the access time - * @mnt: mount the inode is accessed on - * @dentry: dentry accessed + * @path: the &struct path to update * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -void touch_atime(struct vfsmount *mnt, struct dentry *dentry) +void touch_atime(const struct path *path) { - struct inode *inode = dentry->d_inode; + struct vfsmount *mnt = path->mnt; + struct inode *inode = path->dentry->d_inode; struct timespec now; if (inode->i_flags & S_NOATIME) @@ -1535,15 +1547,96 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry) if (timespec_equal(&inode->i_atime, &now)) return; - if (mnt_want_write(mnt)) + if (!sb_start_write_trylock(inode->i_sb)) return; - inode->i_atime = now; - mark_inode_dirty_sync(inode); - mnt_drop_write(mnt); + if (__mnt_want_write(mnt)) + goto skip_update; + /* + * File systems can error out when updating inodes if they need to + * allocate new space to modify an inode (such is the case for + * Btrfs), but since we touch atime while walking down the path we + * really don't care if we failed to update the atime of the file, + * so just ignore the return value. + * We may also fail on filesystems that have the ability to make parts + * of the fs read only, e.g. subvolumes in Btrfs. + */ + update_time(inode, &now, S_ATIME); + __mnt_drop_write(mnt); +skip_update: + sb_end_write(inode->i_sb); } EXPORT_SYMBOL(touch_atime); +/* + * The logic we want is + * + * if suid or (sgid and xgrp) + * remove privs + */ +int should_remove_suid(struct dentry *dentry) +{ + umode_t mode = dentry->d_inode->i_mode; + int kill = 0; + + /* suid always must be killed */ + if (unlikely(mode & S_ISUID)) + kill = ATTR_KILL_SUID; + + /* + * sgid without any exec bits is just a mandatory locking mark; leave + * it alone. If some exec bits are set, it's a real sgid; kill it. + */ + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + kill |= ATTR_KILL_SGID; + + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) + return kill; + + return 0; +} +EXPORT_SYMBOL(should_remove_suid); + +static int __remove_suid(struct dentry *dentry, int kill) +{ + struct iattr newattrs; + + newattrs.ia_valid = ATTR_FORCE | kill; + /* + * Note we call this on write, so notify_change will not + * encounter any conflicting delegations: + */ + return notify_change(dentry, &newattrs, NULL); +} + +int file_remove_suid(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + int killsuid; + int killpriv; + int error = 0; + + /* Fast path for nothing security related */ + if (IS_NOSEC(inode)) + return 0; + + killsuid = should_remove_suid(dentry); + killpriv = security_inode_need_killpriv(dentry); + + if (killpriv < 0) + return killpriv; + if (killpriv) + error = security_inode_killpriv(dentry); + if (!error && killsuid) + error = __remove_suid(dentry, killsuid); + if (!error && (inode->i_sb->s_flags & MS_NOSEC)) + inode->i_flags |= S_NOSEC; + + return error; +} +EXPORT_SYMBOL(file_remove_suid); + /** * file_update_time - update mtime and ctime time * @file: file accessed @@ -1553,18 +1646,20 @@ EXPORT_SYMBOL(touch_atime); * usage in the file write path of filesystems, and filesystems may * choose to explicitly ignore update via this function with the * S_NOCMTIME inode flag, e.g. for network filesystem where these - * timestamps are handled by the server. + * timestamps are handled by the server. This can return an error for + * file systems who need to allocate space in order to update an inode. */ -void file_update_time(struct file *file) +int file_update_time(struct file *file) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); struct timespec now; - enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0; + int sync_it = 0; + int ret; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) - return; + return 0; now = current_fs_time(inode->i_sb); if (!timespec_equal(&inode->i_mtime, &now)) @@ -1577,21 +1672,16 @@ void file_update_time(struct file *file) sync_it |= S_VERSION; if (!sync_it) - return; + return 0; /* Finally allowed to write? Takes lock. */ - if (mnt_want_write_file(file)) - return; + if (__mnt_want_write_file(file)) + return 0; - /* Only change inode inside the lock region */ - if (sync_it & S_VERSION) - inode_inc_iversion(inode); - if (sync_it & S_CTIME) - inode->i_ctime = now; - if (sync_it & S_MTIME) - inode->i_mtime = now; - mark_inode_dirty_sync(inode); - mnt_drop_write_file(file); + ret = update_time(inode, &now, sync_it); + __mnt_drop_write_file(file); + + return ret; } EXPORT_SYMBOL(file_update_time); @@ -1651,7 +1741,7 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -1667,15 +1757,16 @@ void __init inode_init_early(void) HASH_EARLY, &i_hash_shift, &i_hash_mask, + 0, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - int loop; + unsigned int loop; /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", @@ -1697,9 +1788,10 @@ void __init inode_init(void) 0, &i_hash_shift, &i_hash_mask, + 0, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } @@ -1713,7 +1805,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; } else if (S_ISFIFO(mode)) - inode->i_fop = &def_fifo_fops; + inode->i_fop = &pipefifo_fops; else if (S_ISSOCK(mode)) inode->i_fop = &bad_sock_fops; else @@ -1747,17 +1839,97 @@ EXPORT_SYMBOL(inode_init_owner); * inode_owner_or_capable - check current task permissions to inode * @inode: inode being checked * - * Return true if current either has CAP_FOWNER to the inode, or - * owns the file. + * Return true if current either has CAP_FOWNER in a namespace with the + * inode owner uid mapped, or owns the file. */ bool inode_owner_or_capable(const struct inode *inode) { - struct user_namespace *ns = inode_userns(inode); + struct user_namespace *ns; - if (current_user_ns() == ns && current_fsuid() == inode->i_uid) + if (uid_eq(current_fsuid(), inode->i_uid)) return true; - if (ns_capable(ns, CAP_FOWNER)) + + ns = current_user_ns(); + if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid)) return true; return false; } EXPORT_SYMBOL(inode_owner_or_capable); + +/* + * Direct i/o helper functions + */ +static void __inode_dio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); + + do { + prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&inode->i_dio_count)) + schedule(); + } while (atomic_read(&inode->i_dio_count)); + finish_wait(wq, &q.wait); +} + +/** + * inode_dio_wait - wait for outstanding DIO requests to finish + * @inode: inode to wait for + * + * Waits for all pending direct I/O requests to finish so that we can + * proceed with a truncate or equivalent operation. + * + * Must be called under a lock that serializes taking new references + * to i_dio_count, usually by inode->i_mutex. + */ +void inode_dio_wait(struct inode *inode) +{ + if (atomic_read(&inode->i_dio_count)) + __inode_dio_wait(inode); +} +EXPORT_SYMBOL(inode_dio_wait); + +/* + * inode_dio_done - signal finish of a direct I/O requests + * @inode: inode the direct I/O happens on + * + * This is called once we've finished processing a direct I/O request, + * and is used to wake up callers waiting for direct I/O to be quiesced. + */ +void inode_dio_done(struct inode *inode) +{ + if (atomic_dec_and_test(&inode->i_dio_count)) + wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); +} +EXPORT_SYMBOL(inode_dio_done); + +/* + * inode_set_flags - atomically set some inode flags + * + * Note: the caller should be holding i_mutex, or else be sure that + * they have exclusive access to the inode structure (i.e., while the + * inode is being instantiated). The reason for the cmpxchg() loop + * --- which wouldn't be necessary if all code paths which modify + * i_flags actually followed this rule, is that there is at least one + * code path which doesn't today --- for example, + * __generic_file_aio_write() calls file_remove_suid() without holding + * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * + * In the long run, i_mutex is overkill, and we should probably look + * at using the i_lock spinlock to protect i_flags, and then make sure + * it is so documented in include/linux/fs.h and that all code follows + * the locking convention!! + */ +void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask) +{ + unsigned int old_flags, new_flags; + + WARN_ON_ONCE(flags & ~mask); + do { + old_flags = ACCESS_ONCE(inode->i_flags); + new_flags = (old_flags & ~mask) | flags; + } while (unlikely(cmpxchg(&inode->i_flags, old_flags, + new_flags) != old_flags)); +} +EXPORT_SYMBOL(inode_set_flags); |
