diff options
Diffstat (limited to 'fs/file_table.c')
| -rw-r--r-- | fs/file_table.c | 375 | 
1 files changed, 103 insertions, 272 deletions
diff --git a/fs/file_table.c b/fs/file_table.c index c3dee381f1b..385bfd31512 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -23,9 +23,11 @@  #include <linux/lglock.h>  #include <linux/percpu_counter.h>  #include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/task_work.h>  #include <linux/ima.h> -#include <asm/atomic.h> +#include <linux/atomic.h>  #include "internal.h" @@ -34,15 +36,12 @@ struct files_stat_struct files_stat = {  	.max_files = NR_FILE  }; -DECLARE_LGLOCK(files_lglock); -DEFINE_LGLOCK(files_lglock); -  /* SLAB cache for file structures */  static struct kmem_cache *filp_cachep __read_mostly;  static struct percpu_counter nr_files __cacheline_aligned_in_smp; -static inline void file_free_rcu(struct rcu_head *head) +static void file_free_rcu(struct rcu_head *head)  {  	struct file *f = container_of(head, struct file, f_u.fu_rcuhead); @@ -53,7 +52,6 @@ static inline void file_free_rcu(struct rcu_head *head)  static inline void file_free(struct file *f)  {  	percpu_counter_dec(&nr_files); -	file_check_state(f);  	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);  } @@ -78,14 +76,14 @@ EXPORT_SYMBOL_GPL(get_max_files);   * Handle nr_files sysctl   */  #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(ctl_table *table, int write, +int proc_nr_files(struct ctl_table *table, int write,                       void __user *buffer, size_t *lenp, loff_t *ppos)  {  	files_stat.nr_files = get_nr_files();  	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);  }  #else -int proc_nr_files(ctl_table *table, int write, +int proc_nr_files(struct ctl_table *table, int write,                       void __user *buffer, size_t *lenp, loff_t *ppos)  {  	return -ENOSYS; @@ -93,8 +91,8 @@ int proc_nr_files(ctl_table *table, int write,  #endif  /* Find an unused file structure and return a pointer to it. - * Returns NULL, if there are no more free file structures or - * we run out of memory. + * Returns an error pointer if some error happend e.g. we over file + * structures limit, run out of memory or operation is not permitted.   *   * Be very careful using this.  You are responsible for   * getting write access to any mount that you might assign @@ -106,7 +104,8 @@ struct file *get_empty_filp(void)  {  	const struct cred *cred = current_cred();  	static long old_max; -	struct file * f; +	struct file *f; +	int error;  	/*  	 * Privileged users can go above max_files @@ -121,18 +120,21 @@ struct file *get_empty_filp(void)  	}  	f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); -	if (f == NULL) -		goto fail; +	if (unlikely(!f)) +		return ERR_PTR(-ENOMEM);  	percpu_counter_inc(&nr_files); -	if (security_file_alloc(f)) -		goto fail_sec; +	f->f_cred = get_cred(cred); +	error = security_file_alloc(f); +	if (unlikely(error)) { +		file_free(f); +		return ERR_PTR(error); +	} -	INIT_LIST_HEAD(&f->f_u.fu_list);  	atomic_long_set(&f->f_count, 1);  	rwlock_init(&f->f_owner.lock); -	f->f_cred = get_cred(cred);  	spin_lock_init(&f->f_lock); +	mutex_init(&f->f_pos_lock);  	eventpoll_init_file(f);  	/* f->f_version: 0 */  	return f; @@ -143,12 +145,7 @@ over:  		pr_info("VFS: file-max limit %lu reached\n", get_max_files());  		old_max = get_nr_files();  	} -	goto fail; - -fail_sec: -	file_free(f); -fail: -	return NULL; +	return ERR_PTR(-ENFILE);  }  /** @@ -172,61 +169,33 @@ struct file *alloc_file(struct path *path, fmode_t mode,  	struct file *file;  	file = get_empty_filp(); -	if (!file) -		return NULL; +	if (IS_ERR(file)) +		return file;  	file->f_path = *path; +	file->f_inode = path->dentry->d_inode;  	file->f_mapping = path->dentry->d_inode->i_mapping; +	if ((mode & FMODE_READ) && +	     likely(fop->read || fop->aio_read || fop->read_iter)) +		mode |= FMODE_CAN_READ; +	if ((mode & FMODE_WRITE) && +	     likely(fop->write || fop->aio_write || fop->write_iter)) +		mode |= FMODE_CAN_WRITE;  	file->f_mode = mode;  	file->f_op = fop; - -	/* -	 * These mounts don't really matter in practice -	 * for r/o bind mounts.  They aren't userspace- -	 * visible.  We do this for consistency, and so -	 * that we can do debugging checks at __fput() -	 */ -	if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { -		file_take_write(file); -		WARN_ON(mnt_clone_write(path->mnt)); -	} -	ima_counts_get(file); +	if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) +		i_readcount_inc(path->dentry->d_inode);  	return file;  }  EXPORT_SYMBOL(alloc_file); -/** - * drop_file_write_access - give up ability to write to a file - * @file: the file to which we will stop writing - * - * This is a central place which will give up the ability - * to write to @file, along with access to write through - * its vfsmount. - */ -void drop_file_write_access(struct file *file) -{ -	struct vfsmount *mnt = file->f_path.mnt; -	struct dentry *dentry = file->f_path.dentry; -	struct inode *inode = dentry->d_inode; - -	put_write_access(inode); - -	if (special_file(inode->i_mode)) -		return; -	if (file_check_writeable(file) != 0) -		return; -	mnt_drop_write(mnt); -	file_release_write(file); -} -EXPORT_SYMBOL_GPL(drop_file_write_access); -  /* the real guts of fput() - releasing the last reference to file   */  static void __fput(struct file *file)  {  	struct dentry *dentry = file->f_path.dentry;  	struct vfsmount *mnt = file->f_path.mnt; -	struct inode *inode = dentry->d_inode; +	struct inode *inode = file->f_inode;  	might_sleep(); @@ -236,252 +205,116 @@ static void __fput(struct file *file)  	 * in the file cleanup chain.  	 */  	eventpoll_release(file); -	locks_remove_flock(file); +	locks_remove_file(file);  	if (unlikely(file->f_flags & FASYNC)) { -		if (file->f_op && file->f_op->fasync) +		if (file->f_op->fasync)  			file->f_op->fasync(-1, file, 0);  	} -	if (file->f_op && file->f_op->release) +	ima_file_free(file); +	if (file->f_op->release)  		file->f_op->release(inode, file);  	security_file_free(file); -	ima_file_free(file); -	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) +	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && +		     !(file->f_mode & FMODE_PATH))) {  		cdev_put(inode->i_cdev); +	}  	fops_put(file->f_op);  	put_pid(file->f_owner.pid); -	file_sb_list_del(file); -	if (file->f_mode & FMODE_WRITE) -		drop_file_write_access(file); +	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) +		i_readcount_dec(inode); +	if (file->f_mode & FMODE_WRITER) { +		put_write_access(inode); +		__mnt_drop_write(mnt); +	}  	file->f_path.dentry = NULL;  	file->f_path.mnt = NULL; +	file->f_inode = NULL;  	file_free(file);  	dput(dentry);  	mntput(mnt);  } -void fput(struct file *file) +static LLIST_HEAD(delayed_fput_list); +static void delayed_fput(struct work_struct *unused)  { -	if (atomic_long_dec_and_test(&file->f_count)) -		__fput(file); -} - -EXPORT_SYMBOL(fput); +	struct llist_node *node = llist_del_all(&delayed_fput_list); +	struct llist_node *next; -struct file *fget(unsigned int fd) -{ -	struct file *file; -	struct files_struct *files = current->files; - -	rcu_read_lock(); -	file = fcheck_files(files, fd); -	if (file) { -		if (!atomic_long_inc_not_zero(&file->f_count)) { -			/* File object ref couldn't be taken */ -			rcu_read_unlock(); -			return NULL; -		} +	for (; node; node = next) { +		next = llist_next(node); +		__fput(llist_entry(node, struct file, f_u.fu_llist));  	} -	rcu_read_unlock(); - -	return file;  } -EXPORT_SYMBOL(fget); - -/* - * Lightweight file lookup - no refcnt increment if fd table isn't shared. - * - * You can use this instead of fget if you satisfy all of the following - * conditions: - * 1) You must call fput_light before exiting the syscall and returning control - *    to userspace (i.e. you cannot remember the returned struct file * after - *    returning to userspace). - * 2) You must not call filp_close on the returned struct file * in between - *    calls to fget_light and fput_light. - * 3) You must not clone the current task in between the calls to fget_light - *    and fput_light. - * - * The fput_needed flag returned by fget_light should be passed to the - * corresponding fput_light. - */ -struct file *fget_light(unsigned int fd, int *fput_needed) +static void ____fput(struct callback_head *work)  { -	struct file *file; -	struct files_struct *files = current->files; - -	*fput_needed = 0; -	if (likely((atomic_read(&files->count) == 1))) { -		file = fcheck_files(files, fd); -	} else { -		rcu_read_lock(); -		file = fcheck_files(files, fd); -		if (file) { -			if (atomic_long_inc_not_zero(&file->f_count)) -				*fput_needed = 1; -			else -				/* Didn't get the reference, someone's freed */ -				file = NULL; -		} -		rcu_read_unlock(); -	} - -	return file; +	__fput(container_of(work, struct file, f_u.fu_rcuhead));  } -void put_filp(struct file *file) -{ -	if (atomic_long_dec_and_test(&file->f_count)) { -		security_file_free(file); -		file_sb_list_del(file); -		file_free(file); -	} -} - -static inline int file_list_cpu(struct file *file) +/* + * If kernel thread really needs to have the final fput() it has done + * to complete, call this.  The only user right now is the boot - we + * *do* need to make sure our writes to binaries on initramfs has + * not left us with opened struct file waiting for __fput() - execve() + * won't work without that.  Please, don't add more callers without + * very good reasons; in particular, never call that with locks + * held and never call that from a thread that might need to do + * some work on any kind of umount. + */ +void flush_delayed_fput(void)  { -#ifdef CONFIG_SMP -	return file->f_sb_list_cpu; -#else -	return smp_processor_id(); -#endif +	delayed_fput(NULL);  } -/* helper for file_sb_list_add to reduce ifdefs */ -static inline void __file_sb_list_add(struct file *file, struct super_block *sb) -{ -	struct list_head *list; -#ifdef CONFIG_SMP -	int cpu; -	cpu = smp_processor_id(); -	file->f_sb_list_cpu = cpu; -	list = per_cpu_ptr(sb->s_files, cpu); -#else -	list = &sb->s_files; -#endif -	list_add(&file->f_u.fu_list, list); -} +static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); -/** - * file_sb_list_add - add a file to the sb's file list - * @file: file to add - * @sb: sb to add it to - * - * Use this function to associate a file with the superblock of the inode it - * refers to. - */ -void file_sb_list_add(struct file *file, struct super_block *sb) +void fput(struct file *file)  { -	lg_local_lock(files_lglock); -	__file_sb_list_add(file, sb); -	lg_local_unlock(files_lglock); -} +	if (atomic_long_dec_and_test(&file->f_count)) { +		struct task_struct *task = current; + +		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { +			init_task_work(&file->f_u.fu_rcuhead, ____fput); +			if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) +				return; +			/* +			 * After this task has run exit_task_work(), +			 * task_work_add() will fail.  Fall through to delayed +			 * fput to avoid leaking *file. +			 */ +		} -/** - * file_sb_list_del - remove a file from the sb's file list - * @file: file to remove - * @sb: sb to remove it from - * - * Use this function to remove a file from its superblock. - */ -void file_sb_list_del(struct file *file) -{ -	if (!list_empty(&file->f_u.fu_list)) { -		lg_local_lock_cpu(files_lglock, file_list_cpu(file)); -		list_del_init(&file->f_u.fu_list); -		lg_local_unlock_cpu(files_lglock, file_list_cpu(file)); +		if (llist_add(&file->f_u.fu_llist, &delayed_fput_list)) +			schedule_delayed_work(&delayed_fput_work, 1);  	}  } -#ifdef CONFIG_SMP -  /* - * These macros iterate all files on all CPUs for a given superblock. - * files_lglock must be held globally. + * synchronous analog of fput(); for kernel threads that might be needed + * in some umount() (and thus can't use flush_delayed_fput() without + * risking deadlocks), need to wait for completion of __fput() and know + * for this specific struct file it won't involve anything that would + * need them.  Use only if you really need it - at the very least, + * don't blindly convert fput() by kernel thread to that.   */ -#define do_file_list_for_each_entry(__sb, __file)		\ -{								\ -	int i;							\ -	for_each_possible_cpu(i) {				\ -		struct list_head *list;				\ -		list = per_cpu_ptr((__sb)->s_files, i);		\ -		list_for_each_entry((__file), list, f_u.fu_list) - -#define while_file_list_for_each_entry				\ -	}							\ -} - -#else - -#define do_file_list_for_each_entry(__sb, __file)		\ -{								\ -	struct list_head *list;					\ -	list = &(sb)->s_files;					\ -	list_for_each_entry((__file), list, f_u.fu_list) - -#define while_file_list_for_each_entry				\ -} - -#endif - -int fs_may_remount_ro(struct super_block *sb) +void __fput_sync(struct file *file)  { -	struct file *file; -	/* Check that no files are currently opened for writing. */ -	lg_global_lock(files_lglock); -	do_file_list_for_each_entry(sb, file) { -		struct inode *inode = file->f_path.dentry->d_inode; - -		/* File with pending delete? */ -		if (inode->i_nlink == 0) -			goto too_bad; - -		/* Writeable file? */ -		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) -			goto too_bad; -	} while_file_list_for_each_entry; -	lg_global_unlock(files_lglock); -	return 1; /* Tis' cool bro. */ -too_bad: -	lg_global_unlock(files_lglock); -	return 0; +	if (atomic_long_dec_and_test(&file->f_count)) { +		struct task_struct *task = current; +		BUG_ON(!(task->flags & PF_KTHREAD)); +		__fput(file); +	}  } -/** - *	mark_files_ro - mark all files read-only - *	@sb: superblock in question - * - *	All files are marked read-only.  We don't care about pending - *	delete files so this should be used in 'force' mode only. - */ -void mark_files_ro(struct super_block *sb) -{ -	struct file *f; +EXPORT_SYMBOL(fput); -retry: -	lg_global_lock(files_lglock); -	do_file_list_for_each_entry(sb, f) { -		struct vfsmount *mnt; -		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) -		       continue; -		if (!file_count(f)) -			continue; -		if (!(f->f_mode & FMODE_WRITE)) -			continue; -		spin_lock(&f->f_lock); -		f->f_mode &= ~FMODE_WRITE; -		spin_unlock(&f->f_lock); -		if (file_check_writeable(f) != 0) -			continue; -		file_release_write(f); -		mnt = mntget(f->f_path.mnt); -		/* This can sleep, so we can't hold the spinlock. */ -		lg_global_unlock(files_lglock); -		mnt_drop_write(mnt); -		mntput(mnt); -		goto retry; -	} while_file_list_for_each_entry; -	lg_global_unlock(files_lglock); +void put_filp(struct file *file) +{ +	if (atomic_long_dec_and_test(&file->f_count)) { +		security_file_free(file); +		file_free(file); +	}  }  void __init files_init(unsigned long mempages) @@ -498,7 +331,5 @@ void __init files_init(unsigned long mempages)  	n = (mempages * (PAGE_SIZE / 1024)) / 10;  	files_stat.max_files = max_t(unsigned long, n, NR_FILE); -	files_defer_init(); -	lg_lock_init(files_lglock);  	percpu_counter_init(&nr_files, 0);  }   | 
