diff options
Diffstat (limited to 'fs/inode.c')
| -rw-r--r-- | fs/inode.c | 303 |
1 files changed, 169 insertions, 134 deletions
diff --git a/fs/inode.c b/fs/inode.c index d6dfb09c828..6eecb7ff0b9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -17,6 +17,7 @@ #include <linux/prefetch.h> #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> +#include <linux/list_lru.h> #include "internal.h" /* @@ -24,7 +25,7 @@ * * inode->i_lock protects: * inode->i_state, inode->i_hash, __iget() - * inode->i_sb->s_inode_lru_lock protects: + * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list @@ -37,7 +38,7 @@ * * inode_sb_list_lock * inode->i_lock - * inode->i_sb->s_inode_lru_lock + * Inode LRU list locks * * bdi->wb.list_lock * inode->i_lock @@ -70,33 +71,33 @@ EXPORT_SYMBOL(empty_aops); */ struct inodes_stat_t inodes_stat; -static DEFINE_PER_CPU(unsigned int, nr_inodes); -static DEFINE_PER_CPU(unsigned int, nr_unused); +static DEFINE_PER_CPU(unsigned long, nr_inodes); +static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __read_mostly; -static int get_nr_inodes(void) +static long get_nr_inodes(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } -static inline int get_nr_inodes_unused(void) +static inline long get_nr_inodes_unused(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } -int get_nr_dirty_inodes(void) +long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ - int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } @@ -104,12 +105,12 @@ int get_nr_dirty_inodes(void) * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL -int proc_nr_inodes(ctl_table *table, int write, +int proc_nr_inodes(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif @@ -401,13 +402,8 @@ EXPORT_SYMBOL(ihold); static void inode_lru_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_lru_lock); - if (list_empty(&inode->i_lru)) { - list_add(&inode->i_lru, &inode->i_sb->s_inode_lru); - inode->i_sb->s_nr_inodes_unused++; + if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); - } - spin_unlock(&inode->i_sb->s_inode_lru_lock); } /* @@ -425,13 +421,9 @@ void inode_add_lru(struct inode *inode) static void inode_lru_list_del(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_lru_lock); - if (!list_empty(&inode->i_lru)) { - list_del_init(&inode->i_lru); - inode->i_sb->s_nr_inodes_unused--; + + if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); - } - spin_unlock(&inode->i_sb->s_inode_lru_lock); } /** @@ -511,6 +503,7 @@ void clear_inode(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); + BUG_ON(inode->i_data.nrshadows); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); @@ -556,8 +549,7 @@ static void evict(struct inode *inode) if (op->evict_inode) { op->evict_inode(inode); } else { - if (inode->i_data.nrpages) - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } if (S_ISBLK(inode->i_mode) && inode->i_bdev) @@ -675,24 +667,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) return busy; } -static int can_unuse(struct inode *inode) -{ - if (inode->i_state & ~I_REFERENCED) - return 0; - if (inode_has_buffers(inode)) - return 0; - if (atomic_read(&inode->i_count)) - return 0; - if (inode->i_data.nrpages) - return 0; - return 1; -} - /* - * Walk the superblock inode LRU for freeable inodes and attempt to free them. - * This is called from the superblock shrinker function with a number of inodes - * to trim from the LRU. Inodes to be freed are moved to a temporary list and - * then are freed outside inode_lock by dispose_list(). + * Isolate the inode from the LRU in preparation for freeing it. * * Any inodes which are pinned purely because of attached pagecache have their * pagecache removed. If the inode has metadata buffers attached to @@ -706,89 +682,82 @@ static int can_unuse(struct inode *inode) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ -void prune_icache_sb(struct super_block *sb, int nr_to_scan) +static enum lru_status +inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) { - LIST_HEAD(freeable); - int nr_scanned; - unsigned long reap = 0; + struct list_head *freeable = arg; + struct inode *inode = container_of(item, struct inode, i_lru); - spin_lock(&sb->s_inode_lru_lock); - for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) { - struct inode *inode; + /* + * we are inverting the lru lock/inode->i_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; - if (list_empty(&sb->s_inode_lru)) - break; + /* + * Referenced or dirty inodes are still in use. Give them another pass + * through the LRU as we canot reclaim them now. + */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_lru); + spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; + } - inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru); + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + spin_unlock(&inode->i_lock); + return LRU_ROTATE; + } - /* - * we are inverting the sb->s_inode_lru_lock/inode->i_lock here, - * so use a trylock. If we fail to get the lock, just move the - * inode to the back of the list so we don't spin on it. - */ - if (!spin_trylock(&inode->i_lock)) { - list_move(&inode->i_lru, &sb->s_inode_lru); - continue; + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lru_lock); + if (remove_inode_buffers(inode)) { + unsigned long reap; + reap = invalidate_mapping_pages(&inode->i_data, 0, -1); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_INODESTEAL, reap); + else + __count_vm_events(PGINODESTEAL, reap); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; } + iput(inode); + spin_lock(lru_lock); + return LRU_RETRY; + } - /* - * Referenced or dirty inodes are still in use. Give them - * another pass through the LRU as we canot reclaim them now. - */ - if (atomic_read(&inode->i_count) || - (inode->i_state & ~I_REFERENCED)) { - list_del_init(&inode->i_lru); - spin_unlock(&inode->i_lock); - sb->s_nr_inodes_unused--; - this_cpu_dec(nr_unused); - continue; - } + WARN_ON(inode->i_state & I_NEW); + inode->i_state |= I_FREEING; + list_move(&inode->i_lru, freeable); + spin_unlock(&inode->i_lock); - /* recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { - inode->i_state &= ~I_REFERENCED; - list_move(&inode->i_lru, &sb->s_inode_lru); - spin_unlock(&inode->i_lock); - continue; - } - if (inode_has_buffers(inode) || inode->i_data.nrpages) { - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_lru_lock); - if (remove_inode_buffers(inode)) - reap += invalidate_mapping_pages(&inode->i_data, - 0, -1); - iput(inode); - spin_lock(&sb->s_inode_lru_lock); - - if (inode != list_entry(sb->s_inode_lru.next, - struct inode, i_lru)) - continue; /* wrong inode or list_empty */ - /* avoid lock inversions with trylock */ - if (!spin_trylock(&inode->i_lock)) - continue; - if (!can_unuse(inode)) { - spin_unlock(&inode->i_lock); - continue; - } - } - WARN_ON(inode->i_state & I_NEW); - inode->i_state |= I_FREEING; - spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; +} - list_move(&inode->i_lru, &freeable); - sb->s_nr_inodes_unused--; - this_cpu_dec(nr_unused); - } - if (current_is_kswapd()) - __count_vm_events(KSWAPD_INODESTEAL, reap); - else - __count_vm_events(PGINODESTEAL, reap); - spin_unlock(&sb->s_inode_lru_lock); - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += reap; +/* + * Walk the superblock inode LRU for freeable inodes and attempt to free them. + * This is called from the superblock shrinker function with a number of inodes + * to trim from the LRU. Inodes to be freed are moved to a temporary list and + * then are freed outside inode_lock by dispose_list(). + */ +long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, + int nid) +{ + LIST_HEAD(freeable); + long freed; + freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, + &freeable, &nr_to_scan); dispose_list(&freeable); + return freed; } static void __wait_on_freeing_inode(struct inode *inode); @@ -804,15 +773,11 @@ static struct inode *find_inode(struct super_block *sb, repeat: hlist_for_each_entry(inode, head, i_hash) { - spin_lock(&inode->i_lock); - if (inode->i_sb != sb) { - spin_unlock(&inode->i_lock); + if (inode->i_sb != sb) continue; - } - if (!test(inode, data)) { - spin_unlock(&inode->i_lock); + if (!test(inode, data)) continue; - } + spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; @@ -835,15 +800,11 @@ static struct inode *find_inode_fast(struct super_block *sb, repeat: hlist_for_each_entry(inode, head, i_hash) { - spin_lock(&inode->i_lock); - if (inode->i_ino != ino) { - spin_unlock(&inode->i_lock); + if (inode->i_ino != ino) continue; - } - if (inode->i_sb != sb) { - spin_unlock(&inode->i_lock); + if (inode->i_sb != sb) continue; - } + spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; @@ -982,6 +943,41 @@ void unlock_new_inode(struct inode *inode) EXPORT_SYMBOL(unlock_new_inode); /** + * lock_two_nondirectories - take two i_mutexes on non-directory objects + * + * Lock any non-NULL argument that is not a directory. + * Zero, one or two objects may be locked by this function. + * + * @inode1: first inode to lock + * @inode2: second inode to lock + */ +void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) +{ + if (inode1 > inode2) + swap(inode1, inode2); + + if (inode1 && !S_ISDIR(inode1->i_mode)) + mutex_lock(&inode1->i_mutex); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); +} +EXPORT_SYMBOL(lock_two_nondirectories); + +/** + * unlock_two_nondirectories - release locks from lock_two_nondirectories() + * @inode1: first inode to unlock + * @inode2: second inode to unlock + */ +void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) +{ + if (inode1 && !S_ISDIR(inode1->i_mode)) + mutex_unlock(&inode1->i_mutex); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + mutex_unlock(&inode2->i_mutex); +} +EXPORT_SYMBOL(unlock_two_nondirectories); + +/** * iget5_locked - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get @@ -1525,7 +1521,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -void touch_atime(struct path *path) +void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = path->dentry->d_inode; @@ -1606,7 +1602,11 @@ static int __remove_suid(struct dentry *dentry, int kill) struct iattr newattrs; newattrs.ia_valid = ATTR_FORCE | kill; - return notify_change(dentry, &newattrs); + /* + * Note we call this on write, so notify_change will not + * encounter any conflicting delegations: + */ + return notify_change(dentry, &newattrs, NULL); } int file_remove_suid(struct file *file) @@ -1839,14 +1839,18 @@ EXPORT_SYMBOL(inode_init_owner); * inode_owner_or_capable - check current task permissions to inode * @inode: inode being checked * - * Return true if current either has CAP_FOWNER to the inode, or - * owns the file. + * Return true if current either has CAP_FOWNER in a namespace with the + * inode owner uid mapped, or owns the file. */ bool inode_owner_or_capable(const struct inode *inode) { + struct user_namespace *ns; + if (uid_eq(current_fsuid(), inode->i_uid)) return true; - if (inode_capable(inode, CAP_FOWNER)) + + ns = current_user_ns(); + if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid)) return true; return false; } @@ -1898,3 +1902,34 @@ void inode_dio_done(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } EXPORT_SYMBOL(inode_dio_done); + +/* + * inode_set_flags - atomically set some inode flags + * + * Note: the caller should be holding i_mutex, or else be sure that + * they have exclusive access to the inode structure (i.e., while the + * inode is being instantiated). The reason for the cmpxchg() loop + * --- which wouldn't be necessary if all code paths which modify + * i_flags actually followed this rule, is that there is at least one + * code path which doesn't today --- for example, + * __generic_file_aio_write() calls file_remove_suid() without holding + * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * + * In the long run, i_mutex is overkill, and we should probably look + * at using the i_lock spinlock to protect i_flags, and then make sure + * it is so documented in include/linux/fs.h and that all code follows + * the locking convention!! + */ +void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask) +{ + unsigned int old_flags, new_flags; + + WARN_ON_ONCE(flags & ~mask); + do { + old_flags = ACCESS_ONCE(inode->i_flags); + new_flags = (old_flags & ~mask) | flags; + } while (unlikely(cmpxchg(&inode->i_flags, old_flags, + new_flags) != old_flags)); +} +EXPORT_SYMBOL(inode_set_flags); |
