Diffstat (limited to 'fs/namespace.c')

-rw-r--r--  fs/namespace.c | 2347
1 file changed, 1483 insertions(+), 864 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c index 8a415c9c5e5..182bc41cd88 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -9,47 +9,60 @@   */  #include <linux/syscalls.h> -#include <linux/slab.h> -#include <linux/sched.h> -#include <linux/spinlock.h> -#include <linux/percpu.h> -#include <linux/smp_lock.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/acct.h> +#include <linux/export.h>  #include <linux/capability.h> -#include <linux/cpumask.h> -#include <linux/module.h> -#include <linux/sysfs.h> -#include <linux/seq_file.h>  #include <linux/mnt_namespace.h> +#include <linux/user_namespace.h>  #include <linux/namei.h> -#include <linux/nsproxy.h>  #include <linux/security.h> -#include <linux/mount.h> -#include <linux/ramfs.h> -#include <linux/log2.h>  #include <linux/idr.h> -#include <linux/fs_struct.h> -#include <linux/fsnotify.h> -#include <asm/uaccess.h> -#include <asm/unistd.h> +#include <linux/acct.h>		/* acct_auto_close_mnt */ +#include <linux/init.h>		/* init_rootfs */ +#include <linux/fs_struct.h>	/* get_fs_root et.al. */ +#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */ +#include <linux/uaccess.h> +#include <linux/proc_ns.h> +#include <linux/magic.h> +#include <linux/bootmem.h>  #include "pnode.h"  #include "internal.h" -#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) -#define HASH_SIZE (1UL << HASH_SHIFT) +static unsigned int m_hash_mask __read_mostly; +static unsigned int m_hash_shift __read_mostly; +static unsigned int mp_hash_mask __read_mostly; +static unsigned int mp_hash_shift __read_mostly; -static int event; +static __initdata unsigned long mhash_entries; +static int __init set_mhash_entries(char *str) +{ +	if (!str) +		return 0; +	mhash_entries = simple_strtoul(str, &str, 0); +	return 1; +} +__setup("mhash_entries=", set_mhash_entries); + +static __initdata unsigned long mphash_entries; +static int __init set_mphash_entries(char *str) +{ +	if (!str) +		return 0; +	mphash_entries = simple_strtoul(str, &str, 0); +	return 1; +} +__setup("mphash_entries=", set_mphash_entries); + +static u64 event;  static DEFINE_IDA(mnt_id_ida);  static DEFINE_IDA(mnt_group_ida);  static DEFINE_SPINLOCK(mnt_id_lock);  static int mnt_id_start = 0;  static int mnt_group_start = 1; -static struct list_head *mount_hashtable __read_mostly; +static struct hlist_head *mount_hashtable __read_mostly; +static struct hlist_head *mountpoint_hashtable __read_mostly;  static struct kmem_cache *mnt_cache __read_mostly; -static struct rw_semaphore namespace_sem; +static DECLARE_RWSEM(namespace_sem);  /* /sys/fs */  struct kobject *fs_kobj; @@ -63,23 +76,28 @@ EXPORT_SYMBOL_GPL(fs_kobj);   * It should be taken for write in all cases where the vfsmount   * tree or hash is modified or when a vfsmount structure is modified.   
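The two boot parameters above size the new mount and mountpoint hash tables; the resulting mask/shift pairs are consumed by the m_hash()/mp_hash() helpers in the next hunk. A minimal userspace sketch of that arithmetic, assuming a 64-byte cache line and a hypothetical mhash_entries=4096 (all identifiers below are illustrative, not kernel API):

#include <stdio.h>

#define L1_CACHE_BYTES 64	/* assumption: common x86-64 cache line */

/* Round the requested entry count up to a power of two and derive the
 * (mask, shift) pair that the bucket computation consumes.  This mimics
 * the effect of alloc_large_system_hash(), not its implementation. */
static void size_table(unsigned long entries, unsigned long *mask,
		       unsigned int *shift)
{
	unsigned long size = 1;
	unsigned int s = 0;

	while (size < entries) {
		size <<= 1;
		s++;
	}
	*mask = size - 1;
	*shift = s;
}

/* Same folding as m_hash(): scale both pointers down to cache-line
 * granularity, mix the high bits back in, then mask to a bucket. */
static unsigned long bucket(void *mnt, void *dentry,
			    unsigned long mask, unsigned int shift)
{
	unsigned long tmp = (unsigned long)mnt / L1_CACHE_BYTES;

	tmp += (unsigned long)dentry / L1_CACHE_BYTES;
	tmp = tmp + (tmp >> shift);
	return tmp & mask;
}

int main(void)
{
	unsigned long mask;
	unsigned int shift;
	int fake_mnt, fake_dentry;	/* stand-ins for real objects */

	size_table(4096, &mask, &shift);	/* e.g. mhash_entries=4096 */
	printf("mask=%#lx shift=%u bucket=%lu\n", mask, shift,
	       bucket(&fake_mnt, &fake_dentry, mask, shift));
	return 0;
}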
*/ -DEFINE_BRLOCK(vfsmount_lock); +__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) +static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)  {  	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);  	tmp += ((unsigned long)dentry / L1_CACHE_BYTES); -	tmp = tmp + (tmp >> HASH_SHIFT); -	return tmp & (HASH_SIZE - 1); +	tmp = tmp + (tmp >> m_hash_shift); +	return &mount_hashtable[tmp & m_hash_mask];  } -#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) +static inline struct hlist_head *mp_hash(struct dentry *dentry) +{ +	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); +	tmp = tmp + (tmp >> mp_hash_shift); +	return &mountpoint_hashtable[tmp & mp_hash_mask]; +}  /*   * allocation is serialized by namespace_sem, but we need the spinlock to   * serialize with freeing.   */ -static int mnt_alloc_id(struct vfsmount *mnt) +static int mnt_alloc_id(struct mount *mnt)  {  	int res; @@ -96,7 +114,7 @@ retry:  	return res;  } -static void mnt_free_id(struct vfsmount *mnt) +static void mnt_free_id(struct mount *mnt)  {  	int id = mnt->mnt_id;  	spin_lock(&mnt_id_lock); @@ -111,7 +129,7 @@ static void mnt_free_id(struct vfsmount *mnt)   *   * mnt_group_ida is protected by namespace_sem   */ -static int mnt_alloc_group_id(struct vfsmount *mnt) +static int mnt_alloc_group_id(struct mount *mnt)  {  	int res; @@ -130,7 +148,7 @@ static int mnt_alloc_group_id(struct vfsmount *mnt)  /*   * Release a peer group ID   */ -void mnt_release_group_id(struct vfsmount *mnt) +void mnt_release_group_id(struct mount *mnt)  {  	int id = mnt->mnt_group_id;  	ida_remove(&mnt_group_ida, id); @@ -139,9 +157,42 @@ void mnt_release_group_id(struct vfsmount *mnt)  	mnt->mnt_group_id = 0;  } -struct vfsmount *alloc_vfsmnt(const char *name) +/* + * vfsmount lock must be held for read + */ +static inline void mnt_add_count(struct mount *mnt, int n)  { -	struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); +#ifdef CONFIG_SMP +	this_cpu_add(mnt->mnt_pcp->mnt_count, n); +#else +	preempt_disable(); +	mnt->mnt_count += n; +	preempt_enable(); +#endif +} + +/* + * vfsmount lock must be held for write + */ +unsigned int mnt_get_count(struct mount *mnt) +{ +#ifdef CONFIG_SMP +	unsigned int count = 0; +	int cpu; + +	for_each_possible_cpu(cpu) { +		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; +	} + +	return count; +#else +	return mnt->mnt_count; +#endif +} + +static struct mount *alloc_vfsmnt(const char *name) +{ +	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);  	if (mnt) {  		int err; @@ -155,8 +206,18 @@ struct vfsmount *alloc_vfsmnt(const char *name)  				goto out_free_id;  		} -		atomic_set(&mnt->mnt_count, 1); -		INIT_LIST_HEAD(&mnt->mnt_hash); +#ifdef CONFIG_SMP +		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); +		if (!mnt->mnt_pcp) +			goto out_free_devname; + +		this_cpu_add(mnt->mnt_pcp->mnt_count, 1); +#else +		mnt->mnt_count = 1; +		mnt->mnt_writers = 0; +#endif + +		INIT_HLIST_NODE(&mnt->mnt_hash);  		INIT_LIST_HEAD(&mnt->mnt_child);  		INIT_LIST_HEAD(&mnt->mnt_mounts);  		INIT_LIST_HEAD(&mnt->mnt_list); @@ -167,13 +228,6 @@ struct vfsmount *alloc_vfsmnt(const char *name)  #ifdef CONFIG_FSNOTIFY  		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);  #endif -#ifdef CONFIG_SMP -		mnt->mnt_writers = alloc_percpu(int); -		if (!mnt->mnt_writers) -			goto out_free_devname; -#else -		mnt->mnt_writers = 0; -#endif  	}  	return mnt; @@ -217,32 +271,32 @@ int __mnt_is_readonly(struct vfsmount 
*mnt)  }  EXPORT_SYMBOL_GPL(__mnt_is_readonly); -static inline void inc_mnt_writers(struct vfsmount *mnt) +static inline void mnt_inc_writers(struct mount *mnt)  {  #ifdef CONFIG_SMP -	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; +	this_cpu_inc(mnt->mnt_pcp->mnt_writers);  #else  	mnt->mnt_writers++;  #endif  } -static inline void dec_mnt_writers(struct vfsmount *mnt) +static inline void mnt_dec_writers(struct mount *mnt)  {  #ifdef CONFIG_SMP -	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; +	this_cpu_dec(mnt->mnt_pcp->mnt_writers);  #else  	mnt->mnt_writers--;  #endif  } -static unsigned int count_mnt_writers(struct vfsmount *mnt) +static unsigned int mnt_get_writers(struct mount *mnt)  {  #ifdef CONFIG_SMP  	unsigned int count = 0;  	int cpu;  	for_each_possible_cpu(cpu) { -		count += *per_cpu_ptr(mnt->mnt_writers, cpu); +		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;  	}  	return count; @@ -251,37 +305,45 @@ static unsigned int count_mnt_writers(struct vfsmount *mnt)  #endif  } +static int mnt_is_readonly(struct vfsmount *mnt) +{ +	if (mnt->mnt_sb->s_readonly_remount) +		return 1; +	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */ +	smp_rmb(); +	return __mnt_is_readonly(mnt); +} +  /* - * Most r/o checks on a fs are for operations that take - * discrete amounts of time, like a write() or unlink(). - * We must keep track of when those operations start - * (for permission checks) and when they end, so that - * we can determine when writes are able to occur to - * a filesystem. + * Most r/o & frozen checks on a fs are for operations that take discrete + * amounts of time, like a write() or unlink().  We must keep track of when + * those operations start (for permission checks) and when they end, so that we + * can determine when writes are able to occur to a filesystem.   */  /** - * mnt_want_write - get write access to a mount - * @mnt: the mount on which to take a write + * __mnt_want_write - get write access to a mount without freeze protection + * @m: the mount on which to take a write   * - * This tells the low-level filesystem that a write is - * about to be performed to it, and makes sure that - * writes are allowed before returning success.  When - * the write operation is finished, mnt_drop_write() - * must be called.  This is effectively a refcount. + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mnt it read-write) before + * returning success. This operation does not protect against filesystem being + * frozen. When the write operation is finished, __mnt_drop_write() must be + * called. This is effectively a refcount.   */ -int mnt_want_write(struct vfsmount *mnt) +int __mnt_want_write(struct vfsmount *m)  { +	struct mount *mnt = real_mount(m);  	int ret = 0;  	preempt_disable(); -	inc_mnt_writers(mnt); +	mnt_inc_writers(mnt);  	/* -	 * The store to inc_mnt_writers must be visible before we pass +	 * The store to mnt_inc_writers must be visible before we pass  	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our  	 * incremented count after it has set MNT_WRITE_HOLD.  	 */  	smp_mb(); -	while (mnt->mnt_flags & MNT_WRITE_HOLD) +	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)  		cpu_relax();  	/*  	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will @@ -289,13 +351,32 @@ int mnt_want_write(struct vfsmount *mnt)  	 * MNT_WRITE_HOLD is cleared.  	 
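A single-threaded toy model of the per-CPU writer accounting kept by mnt_inc_writers()/mnt_dec_writers() above: increments and decrements stay CPU-local (and may leave individual slots negative), and only the rare slow path pays for summing every slot, as mnt_get_writers() does. NR_CPUS and all names below are illustrative; the fence that makes the sum trustworthy (MNT_WRITE_HOLD) is deliberately elided:

#include <stdio.h>

#define NR_CPUS 4

static long writers[NR_CPUS];

static void inc_writers(int cpu) { writers[cpu]++; }
static void dec_writers(int cpu) { writers[cpu]--; }

/* Slow path only: sum all slots.  Individual slots can be negative
 * when a write is taken on one CPU and dropped on another, but the
 * total is exact once new writers are fenced off. */
static long get_writers(void)
{
	long sum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += writers[cpu];
	return sum;
}

int main(void)
{
	inc_writers(0);
	inc_writers(2);
	dec_writers(1);	/* drop can land on a different CPU */
	printf("active writers: %ld\n", get_writers());
	return 0;
}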
*/  	smp_rmb(); -	if (__mnt_is_readonly(mnt)) { -		dec_mnt_writers(mnt); +	if (mnt_is_readonly(m)) { +		mnt_dec_writers(mnt);  		ret = -EROFS; -		goto out;  	} -out:  	preempt_enable(); + +	return ret; +} + +/** + * mnt_want_write - get write access to a mount + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mount is read-write, filesystem + * is not frozen) before returning success.  When the write operation is + * finished, mnt_drop_write() must be called.  This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *m) +{ +	int ret; + +	sb_start_write(m->mnt_sb); +	ret = __mnt_want_write(m); +	if (ret) +		sb_end_write(m->mnt_sb);  	return ret;  }  EXPORT_SYMBOL_GPL(mnt_want_write); @@ -318,13 +399,28 @@ int mnt_clone_write(struct vfsmount *mnt)  	if (__mnt_is_readonly(mnt))  		return -EROFS;  	preempt_disable(); -	inc_mnt_writers(mnt); +	mnt_inc_writers(real_mount(mnt));  	preempt_enable();  	return 0;  }  EXPORT_SYMBOL_GPL(mnt_clone_write);  /** + * __mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like __mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int __mnt_want_write_file(struct file *file) +{ +	if (!(file->f_mode & FMODE_WRITER)) +		return __mnt_want_write(file->f_path.mnt); +	else +		return mnt_clone_write(file->f_path.mnt); +} + +/**   * mnt_want_write_file - get write access to a file's mount   * @file: the file who's mount on which to take a write   * @@ -333,36 +429,63 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);   */  int mnt_want_write_file(struct file *file)  { -	struct inode *inode = file->f_dentry->d_inode; -	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) -		return mnt_want_write(file->f_path.mnt); -	else -		return mnt_clone_write(file->f_path.mnt); +	int ret; + +	sb_start_write(file->f_path.mnt->mnt_sb); +	ret = __mnt_want_write_file(file); +	if (ret) +		sb_end_write(file->f_path.mnt->mnt_sb); +	return ret;  }  EXPORT_SYMBOL_GPL(mnt_want_write_file);  /** - * mnt_drop_write - give up write access to a mount + * __mnt_drop_write - give up write access to a mount   * @mnt: the mount on which to give up write access   *   * Tells the low-level filesystem that we are done   * performing writes to it.  Must be matched with - * mnt_want_write() call above. + * __mnt_want_write() call above.   */ -void mnt_drop_write(struct vfsmount *mnt) +void __mnt_drop_write(struct vfsmount *mnt)  {  	preempt_disable(); -	dec_mnt_writers(mnt); +	mnt_dec_writers(real_mount(mnt));  	preempt_enable();  } + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done performing writes to it and + * also allows filesystem to be frozen again.  Must be matched with + * mnt_want_write() call above. 
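A toy model of the pairing discipline described above, under the assumption that freeze protection is just a second counter taken before the read-only check and released on failure; struct toy_mount and its fields are invented for illustration:

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

struct toy_mount {
	bool readonly;
	long writers;		/* what mnt_want_write() refcounts */
	long freeze_holders;	/* stand-in for sb_start_write() */
};

static int toy_want_write(struct toy_mount *m)
{
	m->freeze_holders++;		/* sb_start_write() */
	if (m->readonly) {
		m->freeze_holders--;	/* sb_end_write() on failure */
		return -EROFS;
	}
	m->writers++;
	return 0;
}

/* Every successful toy_want_write() must be matched by exactly one
 * toy_drop_write(), which also re-allows freezing. */
static void toy_drop_write(struct toy_mount *m)
{
	m->writers--;
	m->freeze_holders--;		/* sb_end_write() */
}

int main(void)
{
	struct toy_mount m = { .readonly = false };

	if (toy_want_write(&m) == 0) {
		/* ... perform the write ... */
		toy_drop_write(&m);
	}
	printf("writers=%ld freeze_holders=%ld\n",
	       m.writers, m.freeze_holders);
	return 0;
}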
+ */ +void mnt_drop_write(struct vfsmount *mnt) +{ +	__mnt_drop_write(mnt); +	sb_end_write(mnt->mnt_sb); +}  EXPORT_SYMBOL_GPL(mnt_drop_write); -static int mnt_make_readonly(struct vfsmount *mnt) +void __mnt_drop_write_file(struct file *file) +{ +	__mnt_drop_write(file->f_path.mnt); +} + +void mnt_drop_write_file(struct file *file) +{ +	mnt_drop_write(file->f_path.mnt); +} +EXPORT_SYMBOL(mnt_drop_write_file); + +static int mnt_make_readonly(struct mount *mnt)  {  	int ret = 0; -	br_write_lock(vfsmount_lock); -	mnt->mnt_flags |= MNT_WRITE_HOLD; +	lock_mount_hash(); +	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;  	/*  	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store  	 * should be visible before we do. @@ -385,87 +508,210 @@ static int mnt_make_readonly(struct vfsmount *mnt)  	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while  	 * we're counting up here.  	 */ -	if (count_mnt_writers(mnt) > 0) +	if (mnt_get_writers(mnt) > 0)  		ret = -EBUSY;  	else -		mnt->mnt_flags |= MNT_READONLY; +		mnt->mnt.mnt_flags |= MNT_READONLY;  	/*  	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers  	 * that become unheld will see MNT_READONLY.  	 */  	smp_wmb(); -	mnt->mnt_flags &= ~MNT_WRITE_HOLD; -	br_write_unlock(vfsmount_lock); +	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; +	unlock_mount_hash();  	return ret;  } -static void __mnt_unmake_readonly(struct vfsmount *mnt) +static void __mnt_unmake_readonly(struct mount *mnt)  { -	br_write_lock(vfsmount_lock); -	mnt->mnt_flags &= ~MNT_READONLY; -	br_write_unlock(vfsmount_lock); +	lock_mount_hash(); +	mnt->mnt.mnt_flags &= ~MNT_READONLY; +	unlock_mount_hash();  } -void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) +int sb_prepare_remount_readonly(struct super_block *sb)  { -	mnt->mnt_sb = sb; -	mnt->mnt_root = dget(sb->s_root); -} +	struct mount *mnt; +	int err = 0; -EXPORT_SYMBOL(simple_set_mnt); +	/* Racy optimization.  
Recheck the counter under MNT_WRITE_HOLD */ +	if (atomic_long_read(&sb->s_remove_count)) +		return -EBUSY; + +	lock_mount_hash(); +	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { +		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { +			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; +			smp_mb(); +			if (mnt_get_writers(mnt) > 0) { +				err = -EBUSY; +				break; +			} +		} +	} +	if (!err && atomic_long_read(&sb->s_remove_count)) +		err = -EBUSY; -void free_vfsmnt(struct vfsmount *mnt) +	if (!err) { +		sb->s_readonly_remount = 1; +		smp_wmb(); +	} +	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { +		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) +			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; +	} +	unlock_mount_hash(); + +	return err; +} + +static void free_vfsmnt(struct mount *mnt)  {  	kfree(mnt->mnt_devname); -	mnt_free_id(mnt);  #ifdef CONFIG_SMP -	free_percpu(mnt->mnt_writers); +	free_percpu(mnt->mnt_pcp);  #endif  	kmem_cache_free(mnt_cache, mnt);  } +static void delayed_free_vfsmnt(struct rcu_head *head) +{ +	free_vfsmnt(container_of(head, struct mount, mnt_rcu)); +} + +/* call under rcu_read_lock */ +bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +{ +	struct mount *mnt; +	if (read_seqretry(&mount_lock, seq)) +		return false; +	if (bastard == NULL) +		return true; +	mnt = real_mount(bastard); +	mnt_add_count(mnt, 1); +	if (likely(!read_seqretry(&mount_lock, seq))) +		return true; +	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { +		mnt_add_count(mnt, -1); +		return false; +	} +	rcu_read_unlock(); +	mntput(bastard); +	rcu_read_lock(); +	return false; +} +  /* - * find the first or last mount at @dentry on vfsmount @mnt depending on - * @dir. If @dir is set return the first mount else return the last mount. - * vfsmount_lock must be held for read or write. + * find the first mount at @dentry on vfsmount @mnt. + * call under rcu_read_lock()   */ -struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, -			      int dir) +struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)  { -	struct list_head *head = mount_hashtable + hash(mnt, dentry); -	struct list_head *tmp = head; -	struct vfsmount *p, *found = NULL; +	struct hlist_head *head = m_hash(mnt, dentry); +	struct mount *p; -	for (;;) { -		tmp = dir ? tmp->next : tmp->prev; -		p = NULL; -		if (tmp == head) -			break; -		p = list_entry(tmp, struct vfsmount, mnt_hash); -		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { -			found = p; +	hlist_for_each_entry_rcu(p, head, mnt_hash) +		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) +			return p; +	return NULL; +} + +/* + * find the last mount at @dentry on vfsmount @mnt. + * mount_lock must be held. + */ +struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) +{ +	struct mount *p, *res; +	res = p = __lookup_mnt(mnt, dentry); +	if (!p) +		goto out; +	hlist_for_each_entry_continue(p, mnt_hash) { +		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)  			break; -		} +		res = p;  	} -	return found; +out: +	return res;  }  /* - * lookup_mnt increments the ref count before returning - * the vfsmount struct. + * lookup_mnt - Return the first child mount mounted at path + * + * "First" means first mounted chronologically.  
If you create the + * following mounts: + * + * mount /dev/sda1 /mnt + * mount /dev/sda2 /mnt + * mount /dev/sda3 /mnt + * + * Then lookup_mnt() on the base /mnt dentry in the root mount will + * return successively the root dentry and vfsmount of /dev/sda1, then + * /dev/sda2, then /dev/sda3, then NULL. + * + * lookup_mnt takes a reference to the found vfsmount.   */  struct vfsmount *lookup_mnt(struct path *path)  { -	struct vfsmount *child_mnt; +	struct mount *child_mnt; +	struct vfsmount *m; +	unsigned seq; + +	rcu_read_lock(); +	do { +		seq = read_seqbegin(&mount_lock); +		child_mnt = __lookup_mnt(path->mnt, path->dentry); +		m = child_mnt ? &child_mnt->mnt : NULL; +	} while (!legitimize_mnt(m, seq)); +	rcu_read_unlock(); +	return m; +} + +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ +	struct hlist_head *chain = mp_hash(dentry); +	struct mountpoint *mp; +	int ret; -	br_read_lock(vfsmount_lock); -	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) -		mntget(child_mnt); -	br_read_unlock(vfsmount_lock); -	return child_mnt; +	hlist_for_each_entry(mp, chain, m_hash) { +		if (mp->m_dentry == dentry) { +			/* might be worth a WARN_ON() */ +			if (d_unlinked(dentry)) +				return ERR_PTR(-ENOENT); +			mp->m_count++; +			return mp; +		} +	} + +	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); +	if (!mp) +		return ERR_PTR(-ENOMEM); + +	ret = d_set_mounted(dentry); +	if (ret) { +		kfree(mp); +		return ERR_PTR(ret); +	} + +	mp->m_dentry = dentry; +	mp->m_count = 1; +	hlist_add_head(&mp->m_hash, chain); +	return mp; +} + +static void put_mountpoint(struct mountpoint *mp) +{ +	if (!--mp->m_count) { +		struct dentry *dentry = mp->m_dentry; +		spin_lock(&dentry->d_lock); +		dentry->d_flags &= ~DCACHE_MOUNTED; +		spin_unlock(&dentry->d_lock); +		hlist_del(&mp->m_hash); +		kfree(mp); +	}  } -static inline int check_mnt(struct vfsmount *mnt) +static inline int check_mnt(struct mount *mnt)  {  	return mnt->mnt_ns == current->nsproxy->mnt_ns;  } @@ -495,46 +741,51 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)  /*   * vfsmount lock must be held for write   */ -static void detach_mnt(struct vfsmount *mnt, struct path *old_path) +static void detach_mnt(struct mount *mnt, struct path *old_path)  {  	old_path->dentry = mnt->mnt_mountpoint; -	old_path->mnt = mnt->mnt_parent; +	old_path->mnt = &mnt->mnt_parent->mnt;  	mnt->mnt_parent = mnt; -	mnt->mnt_mountpoint = mnt->mnt_root; +	mnt->mnt_mountpoint = mnt->mnt.mnt_root;  	list_del_init(&mnt->mnt_child); -	list_del_init(&mnt->mnt_hash); -	old_path->dentry->d_mounted--; +	hlist_del_init_rcu(&mnt->mnt_hash); +	put_mountpoint(mnt->mnt_mp); +	mnt->mnt_mp = NULL;  }  /*   * vfsmount lock must be held for write   */ -void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, -			struct vfsmount *child_mnt) +void mnt_set_mountpoint(struct mount *mnt, +			struct mountpoint *mp, +			struct mount *child_mnt)  { -	child_mnt->mnt_parent = mntget(mnt); -	child_mnt->mnt_mountpoint = dget(dentry); -	dentry->d_mounted++; +	mp->m_count++; +	mnt_add_count(mnt, 1);	/* essentially, that's mntget */ +	child_mnt->mnt_mountpoint = dget(mp->m_dentry); +	child_mnt->mnt_parent = mnt; +	child_mnt->mnt_mp = mp;  }  /*   * vfsmount lock must be held for write   */ -static void attach_mnt(struct vfsmount *mnt, struct path *path) +static void attach_mnt(struct mount *mnt, +			struct mount *parent, +			struct mountpoint *mp)  { -	mnt_set_mountpoint(path->mnt, path->dentry, mnt); -	list_add_tail(&mnt->mnt_hash, mount_hashtable + -			
hash(path->mnt, path->dentry)); -	list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); +	mnt_set_mountpoint(parent, mp, mnt); +	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); +	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);  }  /*   * vfsmount lock must be held for write   */ -static void commit_tree(struct vfsmount *mnt) +static void commit_tree(struct mount *mnt, struct mount *shadows)  { -	struct vfsmount *parent = mnt->mnt_parent; -	struct vfsmount *m; +	struct mount *parent = mnt->mnt_parent; +	struct mount *m;  	LIST_HEAD(head);  	struct mnt_namespace *n = parent->mnt_ns; @@ -543,15 +794,19 @@ static void commit_tree(struct vfsmount *mnt)  	list_add_tail(&head, &mnt->mnt_list);  	list_for_each_entry(m, &head, mnt_list)  		m->mnt_ns = n; +  	list_splice(&head, n->list.prev); -	list_add_tail(&mnt->mnt_hash, mount_hashtable + -				hash(parent, mnt->mnt_mountpoint)); +	if (shadows) +		hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); +	else +		hlist_add_head_rcu(&mnt->mnt_hash, +				m_hash(&parent->mnt, mnt->mnt_mountpoint));  	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);  	touch_mnt_namespace(n);  } -static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) +static struct mount *next_mnt(struct mount *p, struct mount *root)  {  	struct list_head *next = p->mnt_mounts.next;  	if (next == &p->mnt_mounts) { @@ -564,75 +819,157 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)  			p = p->mnt_parent;  		}  	} -	return list_entry(next, struct vfsmount, mnt_child); +	return list_entry(next, struct mount, mnt_child);  } -static struct vfsmount *skip_mnt_tree(struct vfsmount *p) +static struct mount *skip_mnt_tree(struct mount *p)  {  	struct list_head *prev = p->mnt_mounts.prev;  	while (prev != &p->mnt_mounts) { -		p = list_entry(prev, struct vfsmount, mnt_child); +		p = list_entry(prev, struct mount, mnt_child);  		prev = p->mnt_mounts.prev;  	}  	return p;  } -static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, +struct vfsmount * +vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +{ +	struct mount *mnt; +	struct dentry *root; + +	if (!type) +		return ERR_PTR(-ENODEV); + +	mnt = alloc_vfsmnt(name); +	if (!mnt) +		return ERR_PTR(-ENOMEM); + +	if (flags & MS_KERNMOUNT) +		mnt->mnt.mnt_flags = MNT_INTERNAL; + +	root = mount_fs(type, flags, name, data); +	if (IS_ERR(root)) { +		mnt_free_id(mnt); +		free_vfsmnt(mnt); +		return ERR_CAST(root); +	} + +	mnt->mnt.mnt_root = root; +	mnt->mnt.mnt_sb = root->d_sb; +	mnt->mnt_mountpoint = mnt->mnt.mnt_root; +	mnt->mnt_parent = mnt; +	lock_mount_hash(); +	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); +	unlock_mount_hash(); +	return &mnt->mnt; +} +EXPORT_SYMBOL_GPL(vfs_kern_mount); + +static struct mount *clone_mnt(struct mount *old, struct dentry *root,  					int flag)  { -	struct super_block *sb = old->mnt_sb; -	struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); +	struct super_block *sb = old->mnt.mnt_sb; +	struct mount *mnt; +	int err; -	if (mnt) { -		if (flag & (CL_SLAVE | CL_PRIVATE)) -			mnt->mnt_group_id = 0; /* not a peer of original */ -		else -			mnt->mnt_group_id = old->mnt_group_id; - -		if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { -			int err = mnt_alloc_group_id(mnt); -			if (err) -				goto out_free; -		} +	mnt = alloc_vfsmnt(old->mnt_devname); +	if (!mnt) +		return ERR_PTR(-ENOMEM); -		mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD; -		
atomic_inc(&sb->s_active); -		mnt->mnt_sb = sb; -		mnt->mnt_root = dget(root); -		mnt->mnt_mountpoint = mnt->mnt_root; -		mnt->mnt_parent = mnt; - -		if (flag & CL_SLAVE) { -			list_add(&mnt->mnt_slave, &old->mnt_slave_list); -			mnt->mnt_master = old; -			CLEAR_MNT_SHARED(mnt); -		} else if (!(flag & CL_PRIVATE)) { -			if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) -				list_add(&mnt->mnt_share, &old->mnt_share); -			if (IS_MNT_SLAVE(old)) -				list_add(&mnt->mnt_slave, &old->mnt_slave); -			mnt->mnt_master = old->mnt_master; -		} -		if (flag & CL_MAKE_SHARED) -			set_mnt_shared(mnt); - -		/* stick the duplicate mount on the same expiry list -		 * as the original if that was on one */ -		if (flag & CL_EXPIRE) { -			if (!list_empty(&old->mnt_expire)) -				list_add(&mnt->mnt_expire, &old->mnt_expire); -		} +	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) +		mnt->mnt_group_id = 0; /* not a peer of original */ +	else +		mnt->mnt_group_id = old->mnt_group_id; + +	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { +		err = mnt_alloc_group_id(mnt); +		if (err) +			goto out_free;  	} + +	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); +	/* Don't allow unprivileged users to change mount flags */ +	if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) +		mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + +	/* Don't allow unprivileged users to reveal what is under a mount */ +	if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) +		mnt->mnt.mnt_flags |= MNT_LOCKED; + +	atomic_inc(&sb->s_active); +	mnt->mnt.mnt_sb = sb; +	mnt->mnt.mnt_root = dget(root); +	mnt->mnt_mountpoint = mnt->mnt.mnt_root; +	mnt->mnt_parent = mnt; +	lock_mount_hash(); +	list_add_tail(&mnt->mnt_instance, &sb->s_mounts); +	unlock_mount_hash(); + +	if ((flag & CL_SLAVE) || +	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { +		list_add(&mnt->mnt_slave, &old->mnt_slave_list); +		mnt->mnt_master = old; +		CLEAR_MNT_SHARED(mnt); +	} else if (!(flag & CL_PRIVATE)) { +		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) +			list_add(&mnt->mnt_share, &old->mnt_share); +		if (IS_MNT_SLAVE(old)) +			list_add(&mnt->mnt_slave, &old->mnt_slave); +		mnt->mnt_master = old->mnt_master; +	} +	if (flag & CL_MAKE_SHARED) +		set_mnt_shared(mnt); + +	/* stick the duplicate mount on the same expiry list +	 * as the original if that was on one */ +	if (flag & CL_EXPIRE) { +		if (!list_empty(&old->mnt_expire)) +			list_add(&mnt->mnt_expire, &old->mnt_expire); +	} +  	return mnt;   out_free: +	mnt_free_id(mnt);  	free_vfsmnt(mnt); -	return NULL; +	return ERR_PTR(err);  } -static inline void __mntput(struct vfsmount *mnt) +static void mntput_no_expire(struct mount *mnt)  { -	struct super_block *sb = mnt->mnt_sb; +put_again: +	rcu_read_lock(); +	mnt_add_count(mnt, -1); +	if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ +		rcu_read_unlock(); +		return; +	} +	lock_mount_hash(); +	if (mnt_get_count(mnt)) { +		rcu_read_unlock(); +		unlock_mount_hash(); +		return; +	} +	if (unlikely(mnt->mnt_pinned)) { +		mnt_add_count(mnt, mnt->mnt_pinned + 1); +		mnt->mnt_pinned = 0; +		rcu_read_unlock(); +		unlock_mount_hash(); +		acct_auto_close_mnt(&mnt->mnt); +		goto put_again; +	} +	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { +		rcu_read_unlock(); +		unlock_mount_hash(); +		return; +	} +	mnt->mnt.mnt_flags |= MNT_DOOMED; +	rcu_read_unlock(); + +	list_del(&mnt->mnt_instance); +	unlock_mount_hash(); +  	/*  	 * This probably indicates that somebody messed  	 * up a mnt_want/drop_write() pair.  
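The shape of the mntput_no_expire() fast path that begins above, as a single-threaded sketch: dropping a reference on a mount still attached to a namespace takes no lock at all, and only a detached mount falls through to the locked path that rechecks the total count and marks the mount doomed. RCU and the per-CPU sum are elided, and every name is a stand-in:

#include <stdio.h>
#include <stdbool.h>

struct toy_mount {
	long count;
	bool attached;	/* stand-in for mnt->mnt_ns != NULL */
	bool doomed;
};

static void toy_mntput(struct toy_mount *m)
{
	m->count--;
	if (m->attached)
		return;			/* fast path: not the last ref */

	/* lock_mount_hash() would go here */
	if (m->count == 0 && !m->doomed) {
		m->doomed = true;	/* MNT_DOOMED: we own the teardown */
		printf("tearing down mount\n");
	}
}

int main(void)
{
	struct toy_mount m = { .count = 2, .attached = false };

	toy_mntput(&m);	/* 2 -> 1: someone else still holds a ref */
	toy_mntput(&m);	/* 1 -> 0: teardown runs exactly once */
	return 0;
}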
If this @@ -640,58 +977,55 @@ static inline void __mntput(struct vfsmount *mnt)  	 * to make r/w->r/o transitions.  	 */  	/* -	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements -	 * provides barriers, so count_mnt_writers() below is safe.  AV +	 * The locking used to deal with mnt_count decrement provides barriers, +	 * so mnt_get_writers() below is safe.  	 */ -	WARN_ON(count_mnt_writers(mnt)); -	fsnotify_vfsmount_delete(mnt); -	dput(mnt->mnt_root); -	free_vfsmnt(mnt); -	deactivate_super(sb); +	WARN_ON(mnt_get_writers(mnt)); +	fsnotify_vfsmount_delete(&mnt->mnt); +	dput(mnt->mnt.mnt_root); +	deactivate_super(mnt->mnt.mnt_sb); +	mnt_free_id(mnt); +	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);  } -void mntput_no_expire(struct vfsmount *mnt) +void mntput(struct vfsmount *mnt)  { -repeat: -	if (atomic_add_unless(&mnt->mnt_count, -1, 1)) -		return; -	br_write_lock(vfsmount_lock); -	if (!atomic_dec_and_test(&mnt->mnt_count)) { -		br_write_unlock(vfsmount_lock); -		return; -	} -	if (likely(!mnt->mnt_pinned)) { -		br_write_unlock(vfsmount_lock); -		__mntput(mnt); -		return; +	if (mnt) { +		struct mount *m = real_mount(mnt); +		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */ +		if (unlikely(m->mnt_expiry_mark)) +			m->mnt_expiry_mark = 0; +		mntput_no_expire(m);  	} -	atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); -	mnt->mnt_pinned = 0; -	br_write_unlock(vfsmount_lock); -	acct_auto_close_mnt(mnt); -	goto repeat;  } -EXPORT_SYMBOL(mntput_no_expire); +EXPORT_SYMBOL(mntput); -void mnt_pin(struct vfsmount *mnt) +struct vfsmount *mntget(struct vfsmount *mnt)  { -	br_write_lock(vfsmount_lock); -	mnt->mnt_pinned++; -	br_write_unlock(vfsmount_lock); +	if (mnt) +		mnt_add_count(real_mount(mnt), 1); +	return mnt;  } +EXPORT_SYMBOL(mntget); +void mnt_pin(struct vfsmount *mnt) +{ +	lock_mount_hash(); +	real_mount(mnt)->mnt_pinned++; +	unlock_mount_hash(); +}  EXPORT_SYMBOL(mnt_pin); -void mnt_unpin(struct vfsmount *mnt) +void mnt_unpin(struct vfsmount *m)  { -	br_write_lock(vfsmount_lock); +	struct mount *mnt = real_mount(m); +	lock_mount_hash();  	if (mnt->mnt_pinned) { -		atomic_inc(&mnt->mnt_count); +		mnt_add_count(mnt, 1);  		mnt->mnt_pinned--;  	} -	br_write_unlock(vfsmount_lock); +	unlock_mount_hash();  } -  EXPORT_SYMBOL(mnt_unpin);  static inline void mangle(struct seq_file *m, const char *s) @@ -705,12 +1039,12 @@ static inline void mangle(struct seq_file *m, const char *s)   *   * See also save_mount_options().   */ -int generic_show_options(struct seq_file *m, struct vfsmount *mnt) +int generic_show_options(struct seq_file *m, struct dentry *root)  {  	const char *options;  	rcu_read_lock(); -	options = rcu_dereference(mnt->mnt_sb->s_options); +	options = rcu_dereference(root->d_sb->s_options);  	if (options != NULL && options[0]) {  		seq_putc(m, ','); @@ -754,20 +1088,35 @@ void replace_mount_options(struct super_block *sb, char *options)  EXPORT_SYMBOL(replace_mount_options);  #ifdef CONFIG_PROC_FS -/* iterator */ +/* iterator; we want it to have access to namespace_sem, thus here... 
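The teardown above ends with call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt) rather than an immediate free, so lockless readers that still hold a pointer never see freed memory. A toy version of that deferred-free pattern, where the grace period is reduced to draining a queue:

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct rcu_head_toy {
	struct rcu_head_toy *next;
	void (*func)(struct rcu_head_toy *);
};

static struct rcu_head_toy *pending;

/* Queue the object and return immediately, as mntput does. */
static void call_rcu_toy(struct rcu_head_toy *head,
			 void (*func)(struct rcu_head_toy *))
{
	head->func = func;
	head->next = pending;
	pending = head;
}

/* Stand-in for "after a grace period": no reader can still hold a
 * pointer, so run the queued callbacks. */
static void reclaim_after_grace_period(void)
{
	while (pending) {
		struct rcu_head_toy *h = pending;

		pending = h->next;
		h->func(h);
	}
}

struct mount_toy {
	int id;
	struct rcu_head_toy rcu;
};

static void free_mount_toy(struct rcu_head_toy *h)
{
	/* container_of() by hand */
	struct mount_toy *m = (void *)((char *)h -
				       offsetof(struct mount_toy, rcu));

	printf("freeing mount %d after grace period\n", m->id);
	free(m);
}

int main(void)
{
	struct mount_toy *m = malloc(sizeof(*m));

	if (!m)
		return 1;
	m->id = 42;
	call_rcu_toy(&m->rcu, free_mount_toy);	/* "mntput" returns here */
	reclaim_after_grace_period();		/* later: readers all done */
	return 0;
}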
*/  static void *m_start(struct seq_file *m, loff_t *pos)  { -	struct proc_mounts *p = m->private; +	struct proc_mounts *p = proc_mounts(m);  	down_read(&namespace_sem); -	return seq_list_start(&p->ns->list, *pos); +	if (p->cached_event == p->ns->event) { +		void *v = p->cached_mount; +		if (*pos == p->cached_index) +			return v; +		if (*pos == p->cached_index + 1) { +			v = seq_list_next(v, &p->ns->list, &p->cached_index); +			return p->cached_mount = v; +		} +	} + +	p->cached_event = p->ns->event; +	p->cached_mount = seq_list_start(&p->ns->list, *pos); +	p->cached_index = *pos; +	return p->cached_mount;  }  static void *m_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct proc_mounts *p = m->private; +	struct proc_mounts *p = proc_mounts(m); -	return seq_list_next(v, &p->ns->list, pos); +	p->cached_mount = seq_list_next(v, &p->ns->list, pos); +	p->cached_index = *pos; +	return p->cached_mount;  }  static void m_stop(struct seq_file *m, void *v) @@ -775,200 +1124,18 @@ static void m_stop(struct seq_file *m, void *v)  	up_read(&namespace_sem);  } -int mnt_had_events(struct proc_mounts *p) +static int m_show(struct seq_file *m, void *v)  { -	struct mnt_namespace *ns = p->ns; -	int res = 0; - -	br_read_lock(vfsmount_lock); -	if (p->event != ns->event) { -		p->event = ns->event; -		res = 1; -	} -	br_read_unlock(vfsmount_lock); - -	return res; -} - -struct proc_fs_info { -	int flag; -	const char *str; -}; - -static int show_sb_opts(struct seq_file *m, struct super_block *sb) -{ -	static const struct proc_fs_info fs_info[] = { -		{ MS_SYNCHRONOUS, ",sync" }, -		{ MS_DIRSYNC, ",dirsync" }, -		{ MS_MANDLOCK, ",mand" }, -		{ 0, NULL } -	}; -	const struct proc_fs_info *fs_infop; - -	for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { -		if (sb->s_flags & fs_infop->flag) -			seq_puts(m, fs_infop->str); -	} - -	return security_sb_show_options(m, sb); -} - -static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) -{ -	static const struct proc_fs_info mnt_info[] = { -		{ MNT_NOSUID, ",nosuid" }, -		{ MNT_NODEV, ",nodev" }, -		{ MNT_NOEXEC, ",noexec" }, -		{ MNT_NOATIME, ",noatime" }, -		{ MNT_NODIRATIME, ",nodiratime" }, -		{ MNT_RELATIME, ",relatime" }, -		{ 0, NULL } -	}; -	const struct proc_fs_info *fs_infop; - -	for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { -		if (mnt->mnt_flags & fs_infop->flag) -			seq_puts(m, fs_infop->str); -	} -} - -static void show_type(struct seq_file *m, struct super_block *sb) -{ -	mangle(m, sb->s_type->name); -	if (sb->s_subtype && sb->s_subtype[0]) { -		seq_putc(m, '.'); -		mangle(m, sb->s_subtype); -	} -} - -static int show_vfsmnt(struct seq_file *m, void *v) -{ -	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); -	int err = 0; -	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - -	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); -	seq_putc(m, ' '); -	seq_path(m, &mnt_path, " \t\n\\"); -	seq_putc(m, ' '); -	show_type(m, mnt->mnt_sb); -	seq_puts(m, __mnt_is_readonly(mnt) ? 
" ro" : " rw"); -	err = show_sb_opts(m, mnt->mnt_sb); -	if (err) -		goto out; -	show_mnt_opts(m, mnt); -	if (mnt->mnt_sb->s_op->show_options) -		err = mnt->mnt_sb->s_op->show_options(m, mnt); -	seq_puts(m, " 0 0\n"); -out: -	return err; +	struct proc_mounts *p = proc_mounts(m); +	struct mount *r = list_entry(v, struct mount, mnt_list); +	return p->show(m, &r->mnt);  }  const struct seq_operations mounts_op = {  	.start	= m_start,  	.next	= m_next,  	.stop	= m_stop, -	.show	= show_vfsmnt -}; - -static int show_mountinfo(struct seq_file *m, void *v) -{ -	struct proc_mounts *p = m->private; -	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); -	struct super_block *sb = mnt->mnt_sb; -	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; -	struct path root = p->root; -	int err = 0; - -	seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, -		   MAJOR(sb->s_dev), MINOR(sb->s_dev)); -	seq_dentry(m, mnt->mnt_root, " \t\n\\"); -	seq_putc(m, ' '); -	seq_path_root(m, &mnt_path, &root, " \t\n\\"); -	if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { -		/* -		 * Mountpoint is outside root, discard that one.  Ugly, -		 * but less so than trying to do that in iterator in a -		 * race-free way (due to renames). -		 */ -		return SEQ_SKIP; -	} -	seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); -	show_mnt_opts(m, mnt); - -	/* Tagged fields ("foo:X" or "bar") */ -	if (IS_MNT_SHARED(mnt)) -		seq_printf(m, " shared:%i", mnt->mnt_group_id); -	if (IS_MNT_SLAVE(mnt)) { -		int master = mnt->mnt_master->mnt_group_id; -		int dom = get_dominating_id(mnt, &p->root); -		seq_printf(m, " master:%i", master); -		if (dom && dom != master) -			seq_printf(m, " propagate_from:%i", dom); -	} -	if (IS_MNT_UNBINDABLE(mnt)) -		seq_puts(m, " unbindable"); - -	/* Filesystem specific data */ -	seq_puts(m, " - "); -	show_type(m, sb); -	seq_putc(m, ' '); -	mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); -	seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); -	err = show_sb_opts(m, sb); -	if (err) -		goto out; -	if (sb->s_op->show_options) -		err = sb->s_op->show_options(m, mnt); -	seq_putc(m, '\n'); -out: -	return err; -} - -const struct seq_operations mountinfo_op = { -	.start	= m_start, -	.next	= m_next, -	.stop	= m_stop, -	.show	= show_mountinfo, -}; - -static int show_vfsstat(struct seq_file *m, void *v) -{ -	struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); -	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; -	int err = 0; - -	/* device */ -	if (mnt->mnt_devname) { -		seq_puts(m, "device "); -		mangle(m, mnt->mnt_devname); -	} else -		seq_puts(m, "no device"); - -	/* mount point */ -	seq_puts(m, " mounted on "); -	seq_path(m, &mnt_path, " \t\n\\"); -	seq_putc(m, ' '); - -	/* file system type */ -	seq_puts(m, "with fstype "); -	show_type(m, mnt->mnt_sb); - -	/* optional statistics */ -	if (mnt->mnt_sb->s_op->show_stats) { -		seq_putc(m, ' '); -		err = mnt->mnt_sb->s_op->show_stats(m, mnt); -	} - -	seq_putc(m, '\n'); -	return err; -} - -const struct seq_operations mountstats_op = { -	.start	= m_start, -	.next	= m_next, -	.stop	= m_stop, -	.show	= show_vfsstat, +	.show	= m_show,  };  #endif  /* CONFIG_PROC_FS */ @@ -980,18 +1147,21 @@ const struct seq_operations mountstats_op = {   * open files, pwds, chroots or sub mounts that are   * busy.   
*/ -int may_umount_tree(struct vfsmount *mnt) +int may_umount_tree(struct vfsmount *m)  { +	struct mount *mnt = real_mount(m);  	int actual_refs = 0;  	int minimum_refs = 0; -	struct vfsmount *p; +	struct mount *p; +	BUG_ON(!m); -	br_read_lock(vfsmount_lock); +	/* write lock needed for mnt_get_count */ +	lock_mount_hash();  	for (p = mnt; p; p = next_mnt(p, mnt)) { -		actual_refs += atomic_read(&p->mnt_count); +		actual_refs += mnt_get_count(p);  		minimum_refs += 2;  	} -	br_read_unlock(vfsmount_lock); +	unlock_mount_hash();  	if (actual_refs > minimum_refs)  		return 0; @@ -1018,77 +1188,105 @@ int may_umount(struct vfsmount *mnt)  {  	int ret = 1;  	down_read(&namespace_sem); -	br_read_lock(vfsmount_lock); -	if (propagate_mount_busy(mnt, 2)) +	lock_mount_hash(); +	if (propagate_mount_busy(real_mount(mnt), 2))  		ret = 0; -	br_read_unlock(vfsmount_lock); +	unlock_mount_hash();  	up_read(&namespace_sem);  	return ret;  }  EXPORT_SYMBOL(may_umount); -void release_mounts(struct list_head *head) +static HLIST_HEAD(unmounted);	/* protected by namespace_sem */ + +static void namespace_unlock(void)  { -	struct vfsmount *mnt; -	while (!list_empty(head)) { -		mnt = list_first_entry(head, struct vfsmount, mnt_hash); -		list_del_init(&mnt->mnt_hash); -		if (mnt->mnt_parent != mnt) { -			struct dentry *dentry; -			struct vfsmount *m; - -			br_write_lock(vfsmount_lock); -			dentry = mnt->mnt_mountpoint; -			m = mnt->mnt_parent; -			mnt->mnt_mountpoint = mnt->mnt_root; -			mnt->mnt_parent = mnt; -			m->mnt_ghosts--; -			br_write_unlock(vfsmount_lock); -			dput(dentry); -			mntput(m); -		} -		mntput(mnt); +	struct mount *mnt; +	struct hlist_head head = unmounted; + +	if (likely(hlist_empty(&head))) { +		up_write(&namespace_sem); +		return;  	} + +	head.first->pprev = &head.first; +	INIT_HLIST_HEAD(&unmounted); + +	up_write(&namespace_sem); + +	synchronize_rcu(); + +	while (!hlist_empty(&head)) { +		mnt = hlist_entry(head.first, struct mount, mnt_hash); +		hlist_del_init(&mnt->mnt_hash); +		if (mnt->mnt_ex_mountpoint.mnt) +			path_put(&mnt->mnt_ex_mountpoint); +		mntput(&mnt->mnt); +	} +} + +static inline void namespace_lock(void) +{ +	down_write(&namespace_sem);  }  /* - * vfsmount lock must be held for write + * mount_lock must be held   * namespace_sem must be held for write + * how = 0 => just this tree, don't propagate + * how = 1 => propagate; we know that nobody else has reference to any victims + * how = 2 => lazy umount   */ -void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) +void umount_tree(struct mount *mnt, int how)  { -	struct vfsmount *p; +	HLIST_HEAD(tmp_list); +	struct mount *p; +	struct mount *last = NULL; -	for (p = mnt; p; p = next_mnt(p, mnt)) -		list_move(&p->mnt_hash, kill); +	for (p = mnt; p; p = next_mnt(p, mnt)) { +		hlist_del_init_rcu(&p->mnt_hash); +		hlist_add_head(&p->mnt_hash, &tmp_list); +	} -	if (propagate) -		propagate_umount(kill); +	if (how) +		propagate_umount(&tmp_list); -	list_for_each_entry(p, kill, mnt_hash) { +	hlist_for_each_entry(p, &tmp_list, mnt_hash) {  		list_del_init(&p->mnt_expire);  		list_del_init(&p->mnt_list);  		__touch_mnt_namespace(p->mnt_ns);  		p->mnt_ns = NULL; +		if (how < 2) +			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;  		list_del_init(&p->mnt_child); -		if (p->mnt_parent != p) { -			p->mnt_parent->mnt_ghosts++; -			p->mnt_mountpoint->d_mounted--; +		if (mnt_has_parent(p)) { +			put_mountpoint(p->mnt_mp); +			/* move the reference to mountpoint into ->mnt_ex_mountpoint */ +			p->mnt_ex_mountpoint.dentry = 
p->mnt_mountpoint; +			p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; +			p->mnt_mountpoint = p->mnt.mnt_root; +			p->mnt_parent = p; +			p->mnt_mp = NULL;  		}  		change_mnt_propagation(p, MS_PRIVATE); +		last = p; +	} +	if (last) { +		last->mnt_hash.next = unmounted.first; +		unmounted.first = tmp_list.first; +		unmounted.first->pprev = &unmounted.first;  	}  } -static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); +static void shrink_submounts(struct mount *mnt); -static int do_umount(struct vfsmount *mnt, int flags) +static int do_umount(struct mount *mnt, int flags)  { -	struct super_block *sb = mnt->mnt_sb; +	struct super_block *sb = mnt->mnt.mnt_sb;  	int retval; -	LIST_HEAD(umount_list); -	retval = security_sb_umount(mnt, flags); +	retval = security_sb_umount(&mnt->mnt, flags);  	if (retval)  		return retval; @@ -1099,12 +1297,20 @@ static int do_umount(struct vfsmount *mnt, int flags)  	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]  	 */  	if (flags & MNT_EXPIRE) { -		if (mnt == current->fs->root.mnt || +		if (&mnt->mnt == current->fs->root.mnt ||  		    flags & (MNT_FORCE | MNT_DETACH))  			return -EINVAL; -		if (atomic_read(&mnt->mnt_count) != 2) +		/* +		 * probably don't strictly need the lock here if we examined +		 * all race cases, but it's a slowpath. +		 */ +		lock_mount_hash(); +		if (mnt_get_count(mnt) != 2) { +			unlock_mount_hash();  			return -EBUSY; +		} +		unlock_mount_hash();  		if (!xchg(&mnt->mnt_expiry_mark, 1))  			return -EAGAIN; @@ -1133,7 +1339,7 @@ static int do_umount(struct vfsmount *mnt, int flags)  	 * /reboot - static binary that would close all descriptors and  	 * call reboot(9). Then init(8) could umount root and exec /reboot.  	 */ -	if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { +	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {  		/*  		 * Special case for "unmounting" root ...  		 * we just try to remount it readonly. @@ -1145,25 +1351,36 @@ static int do_umount(struct vfsmount *mnt, int flags)  		return retval;  	} -	down_write(&namespace_sem); -	br_write_lock(vfsmount_lock); +	namespace_lock(); +	lock_mount_hash();  	event++; -	if (!(flags & MNT_DETACH)) -		shrink_submounts(mnt, &umount_list); - -	retval = -EBUSY; -	if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { +	if (flags & MNT_DETACH) {  		if (!list_empty(&mnt->mnt_list)) -			umount_tree(mnt, 1, &umount_list); +			umount_tree(mnt, 2);  		retval = 0; +	} else { +		shrink_submounts(mnt); +		retval = -EBUSY; +		if (!propagate_mount_busy(mnt, 2)) { +			if (!list_empty(&mnt->mnt_list)) +				umount_tree(mnt, 1); +			retval = 0; +		}  	} -	br_write_unlock(vfsmount_lock); -	up_write(&namespace_sem); -	release_mounts(&umount_list); +	unlock_mount_hash(); +	namespace_unlock();  	return retval;  } +/*  + * Is the caller allowed to modify his namespace? + */ +static inline bool may_mount(void) +{ +	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); +} +  /*   * Now umount can handle mount points as well as block devices.   * This is important for filesystems which use unnamed block devices. 
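namespace_unlock() above splices the victims onto a private list while namespace_sem is still held, drops the lock, waits out a grace period, and only then drops the final references. The control flow in miniature, with locking and synchronize_rcu() reduced to comments and all names invented:

#include <stdio.h>

#define MAXV 8

static int unmounted[MAXV];	/* protected by "namespace_sem" */
static int nunmounted;

static void umount_collect(int id)	/* called with the lock held */
{
	unmounted[nunmounted++] = id;
}

static void namespace_unlock_toy(void)
{
	int local[MAXV], n = nunmounted;

	for (int i = 0; i < n; i++)	/* steal the global list */
		local[i] = unmounted[i];
	nunmounted = 0;

	/* up_write(&namespace_sem); synchronize_rcu(); would go here */

	for (int i = 0; i < n; i++)	/* mntput() outside the lock */
		printf("final mntput on mount %d\n", local[i]);
}

int main(void)
{
	umount_collect(3);
	umount_collect(5);
	namespace_unlock_toy();
	return 0;
}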
@@ -1175,33 +1392,36 @@ static int do_umount(struct vfsmount *mnt, int flags)  SYSCALL_DEFINE2(umount, char __user *, name, int, flags)  {  	struct path path; +	struct mount *mnt;  	int retval;  	int lookup_flags = 0;  	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))  		return -EINVAL; +	if (!may_mount()) +		return -EPERM; +  	if (!(flags & UMOUNT_NOFOLLOW))  		lookup_flags |= LOOKUP_FOLLOW; -	retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); +	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);  	if (retval)  		goto out; +	mnt = real_mount(path.mnt);  	retval = -EINVAL;  	if (path.dentry != path.mnt->mnt_root)  		goto dput_and_out; -	if (!check_mnt(path.mnt)) +	if (!check_mnt(mnt))  		goto dput_and_out; - -	retval = -EPERM; -	if (!capable(CAP_SYS_ADMIN)) +	if (mnt->mnt.mnt_flags & MNT_LOCKED)  		goto dput_and_out; -	retval = do_umount(path.mnt, flags); +	retval = do_umount(mnt, flags);  dput_and_out:  	/* we mustn't call path_put() as that would clear mnt_expiry_mark */  	dput(path.dentry); -	mntput_no_expire(path.mnt); +	mntput_no_expire(mnt);  out:  	return retval;  } @@ -1218,45 +1438,67 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)  #endif -static int mount_is_safe(struct path *path) +static bool is_mnt_ns_file(struct dentry *dentry)  { -	if (capable(CAP_SYS_ADMIN)) -		return 0; -	return -EPERM; -#ifdef notyet -	if (S_ISLNK(path->dentry->d_inode->i_mode)) -		return -EPERM; -	if (path->dentry->d_inode->i_mode & S_ISVTX) { -		if (current_uid() != path->dentry->d_inode->i_uid) -			return -EPERM; -	} -	if (inode_permission(path->dentry->d_inode, MAY_WRITE)) -		return -EPERM; -	return 0; -#endif +	/* Is this a proxy for a mount namespace? */ +	struct inode *inode = dentry->d_inode; +	struct proc_ns *ei; + +	if (!proc_ns_inode(inode)) +		return false; + +	ei = get_proc_ns(inode); +	if (ei->ns_ops != &mntns_operations) +		return false; + +	return true; +} + +static bool mnt_ns_loop(struct dentry *dentry) +{ +	/* Could bind mounting the mount namespace inode cause a +	 * mount namespace loop? 
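The comparison mnt_ns_loop() makes just below reduces to ordering namespace creation sequence numbers: a namespace file may only be bind-mounted if the namespace it names is strictly newer than the mounter's, so reference cycles cannot form. In miniature:

#include <stdio.h>
#include <stdbool.h>

/* Each mount namespace gets a monotonically increasing ->seq at
 * creation.  Allowing references only "downward in age" makes a
 * cycle of namespace references impossible. */
static bool would_loop(unsigned long long current_ns_seq,
		       unsigned long long target_ns_seq)
{
	return current_ns_seq >= target_ns_seq;
}

int main(void)
{
	printf("%d\n", would_loop(4, 7));	/* 0: newer target, allowed */
	printf("%d\n", would_loop(7, 4));	/* 1: would create a loop  */
	return 0;
}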
+	 */ +	struct mnt_namespace *mnt_ns; +	if (!is_mnt_ns_file(dentry)) +		return false; + +	mnt_ns = get_proc_ns(dentry->d_inode)->ns; +	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;  } -struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, +struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,  					int flag)  { -	struct vfsmount *res, *p, *q, *r, *s; -	struct path path; +	struct mount *res, *p, *q, *r, *parent; + +	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) +		return ERR_PTR(-EINVAL); -	if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) -		return NULL; +	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) +		return ERR_PTR(-EINVAL);  	res = q = clone_mnt(mnt, dentry, flag); -	if (!q) -		goto Enomem; +	if (IS_ERR(q)) +		return q; + +	q->mnt.mnt_flags &= ~MNT_LOCKED;  	q->mnt_mountpoint = mnt->mnt_mountpoint;  	p = mnt;  	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { +		struct mount *s;  		if (!is_subdir(r->mnt_mountpoint, dentry))  			continue;  		for (s = r; s; s = next_mnt(s, r)) { -			if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { +			if (!(flag & CL_COPY_UNBINDABLE) && +			    IS_MNT_UNBINDABLE(s)) { +				s = skip_mnt_tree(s); +				continue; +			} +			if (!(flag & CL_COPY_MNT_NS_FILE) && +			    is_mnt_ns_file(s->mnt.mnt_root)) {  				s = skip_mnt_tree(s);  				continue;  			} @@ -1265,67 +1507,67 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,  				q = q->mnt_parent;  			}  			p = s; -			path.mnt = q; -			path.dentry = p->mnt_mountpoint; -			q = clone_mnt(p, p->mnt_root, flag); -			if (!q) -				goto Enomem; -			br_write_lock(vfsmount_lock); +			parent = q; +			q = clone_mnt(p, p->mnt.mnt_root, flag); +			if (IS_ERR(q)) +				goto out; +			lock_mount_hash();  			list_add_tail(&q->mnt_list, &res->mnt_list); -			attach_mnt(q, &path); -			br_write_unlock(vfsmount_lock); +			attach_mnt(q, parent, p->mnt_mp); +			unlock_mount_hash();  		}  	}  	return res; -Enomem: +out:  	if (res) { -		LIST_HEAD(umount_list); -		br_write_lock(vfsmount_lock); -		umount_tree(res, 0, &umount_list); -		br_write_unlock(vfsmount_lock); -		release_mounts(&umount_list); +		lock_mount_hash(); +		umount_tree(res, 0); +		unlock_mount_hash();  	} -	return NULL; +	return q;  } +/* Caller should check returned pointer for errors */ +  struct vfsmount *collect_mounts(struct path *path)  { -	struct vfsmount *tree; -	down_write(&namespace_sem); -	tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); -	up_write(&namespace_sem); -	return tree; +	struct mount *tree; +	namespace_lock(); +	tree = copy_tree(real_mount(path->mnt), path->dentry, +			 CL_COPY_ALL | CL_PRIVATE); +	namespace_unlock(); +	if (IS_ERR(tree)) +		return ERR_CAST(tree); +	return &tree->mnt;  }  void drop_collected_mounts(struct vfsmount *mnt)  { -	LIST_HEAD(umount_list); -	down_write(&namespace_sem); -	br_write_lock(vfsmount_lock); -	umount_tree(mnt, 0, &umount_list); -	br_write_unlock(vfsmount_lock); -	up_write(&namespace_sem); -	release_mounts(&umount_list); +	namespace_lock(); +	lock_mount_hash(); +	umount_tree(real_mount(mnt), 0); +	unlock_mount_hash(); +	namespace_unlock();  }  int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,  		   struct vfsmount *root)  { -	struct vfsmount *mnt; +	struct mount *mnt;  	int res = f(root, arg);  	if (res)  		return res; -	list_for_each_entry(mnt, &root->mnt_list, mnt_list) { -		res = f(mnt, arg); +	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { +		res = f(&mnt->mnt, 
arg);  		if (res)  			return res;  	}  	return 0;  } -static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) +static void cleanup_group_ids(struct mount *mnt, struct mount *end)  { -	struct vfsmount *p; +	struct mount *p;  	for (p = mnt; p != end; p = next_mnt(p, mnt)) {  		if (p->mnt_group_id && !IS_MNT_SHARED(p)) @@ -1333,9 +1575,9 @@ static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)  	}  } -static int invent_group_ids(struct vfsmount *mnt, bool recurse) +static int invent_group_ids(struct mount *mnt, bool recurse)  { -	struct vfsmount *p; +	struct mount *p;  	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {  		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { @@ -1413,74 +1655,107 @@ static int invent_group_ids(struct vfsmount *mnt, bool recurse)   * Must be called without spinlocks held, since this function can sleep   * in allocations.   */ -static int attach_recursive_mnt(struct vfsmount *source_mnt, -			struct path *path, struct path *parent_path) -{ -	LIST_HEAD(tree_list); -	struct vfsmount *dest_mnt = path->mnt; -	struct dentry *dest_dentry = path->dentry; -	struct vfsmount *child, *p; +static int attach_recursive_mnt(struct mount *source_mnt, +			struct mount *dest_mnt, +			struct mountpoint *dest_mp, +			struct path *parent_path) +{ +	HLIST_HEAD(tree_list); +	struct mount *child, *p; +	struct hlist_node *n;  	int err;  	if (IS_MNT_SHARED(dest_mnt)) {  		err = invent_group_ids(source_mnt, true);  		if (err)  			goto out; -	} -	err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); -	if (err) -		goto out_cleanup_ids; - -	br_write_lock(vfsmount_lock); - -	if (IS_MNT_SHARED(dest_mnt)) { +		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); +		lock_mount_hash(); +		if (err) +			goto out_cleanup_ids;  		for (p = source_mnt; p; p = next_mnt(p, source_mnt))  			set_mnt_shared(p); +	} else { +		lock_mount_hash();  	}  	if (parent_path) {  		detach_mnt(source_mnt, parent_path); -		attach_mnt(source_mnt, path); -		touch_mnt_namespace(parent_path->mnt->mnt_ns); +		attach_mnt(source_mnt, dest_mnt, dest_mp); +		touch_mnt_namespace(source_mnt->mnt_ns);  	} else { -		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); -		commit_tree(source_mnt); +		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); +		commit_tree(source_mnt, NULL);  	} -	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { -		list_del_init(&child->mnt_hash); -		commit_tree(child); +	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { +		struct mount *q; +		hlist_del_init(&child->mnt_hash); +		q = __lookup_mnt_last(&child->mnt_parent->mnt, +				      child->mnt_mountpoint); +		commit_tree(child, q);  	} -	br_write_unlock(vfsmount_lock); +	unlock_mount_hash();  	return 0;   out_cleanup_ids: -	if (IS_MNT_SHARED(dest_mnt)) -		cleanup_group_ids(source_mnt, NULL); +	while (!hlist_empty(&tree_list)) { +		child = hlist_entry(tree_list.first, struct mount, mnt_hash); +		umount_tree(child, 0); +	} +	unlock_mount_hash(); +	cleanup_group_ids(source_mnt, NULL);   out:  	return err;  } -static int graft_tree(struct vfsmount *mnt, struct path *path) +static struct mountpoint *lock_mount(struct path *path)  { -	int err; -	if (mnt->mnt_sb->s_flags & MS_NOUSER) +	struct vfsmount *mnt; +	struct dentry *dentry = path->dentry; +retry: +	mutex_lock(&dentry->d_inode->i_mutex); +	if (unlikely(cant_mount(dentry))) { +		mutex_unlock(&dentry->d_inode->i_mutex); +		return ERR_PTR(-ENOENT); +	} +	namespace_lock(); +	mnt = lookup_mnt(path); +	if (likely(!mnt)) { +		
struct mountpoint *mp = new_mountpoint(dentry); +		if (IS_ERR(mp)) { +			namespace_unlock(); +			mutex_unlock(&dentry->d_inode->i_mutex); +			return mp; +		} +		return mp; +	} +	namespace_unlock(); +	mutex_unlock(&path->dentry->d_inode->i_mutex); +	path_put(path); +	path->mnt = mnt; +	dentry = path->dentry = dget(mnt->mnt_root); +	goto retry; +} + +static void unlock_mount(struct mountpoint *where) +{ +	struct dentry *dentry = where->m_dentry; +	put_mountpoint(where); +	namespace_unlock(); +	mutex_unlock(&dentry->d_inode->i_mutex); +} + +static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) +{ +	if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)  		return -EINVAL; -	if (S_ISDIR(path->dentry->d_inode->i_mode) != -	      S_ISDIR(mnt->mnt_root->d_inode->i_mode)) +	if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != +	      S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))  		return -ENOTDIR; -	err = -ENOENT; -	mutex_lock(&path->dentry->d_inode->i_mutex); -	if (cant_mount(path->dentry)) -		goto out_unlock; - -	if (!d_unlinked(path->dentry)) -		err = attach_recursive_mnt(mnt, path, NULL); -out_unlock: -	mutex_unlock(&path->dentry->d_inode->i_mutex); -	return err; +	return attach_recursive_mnt(mnt, p, mp, NULL);  }  /* @@ -1489,7 +1764,7 @@ out_unlock:  static int flags_to_propagation_type(int flags)  { -	int type = flags & ~MS_REC; +	int type = flags & ~(MS_REC | MS_SILENT);  	/* Fail if any non-propagation flags are set */  	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) @@ -1505,14 +1780,12 @@ static int flags_to_propagation_type(int flags)   */  static int do_change_type(struct path *path, int flag)  { -	struct vfsmount *m, *mnt = path->mnt; +	struct mount *m; +	struct mount *mnt = real_mount(path->mnt);  	int recurse = flag & MS_REC;  	int type;  	int err = 0; -	if (!capable(CAP_SYS_ADMIN)) -		return -EPERM; -  	if (path->dentry != path->mnt->mnt_root)  		return -EINVAL; @@ -1520,69 +1793,95 @@ static int do_change_type(struct path *path, int flag)  	if (!type)  		return -EINVAL; -	down_write(&namespace_sem); +	namespace_lock();  	if (type == MS_SHARED) {  		err = invent_group_ids(mnt, recurse);  		if (err)  			goto out_unlock;  	} -	br_write_lock(vfsmount_lock); +	lock_mount_hash();  	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))  		change_mnt_propagation(m, type); -	br_write_unlock(vfsmount_lock); +	unlock_mount_hash();   out_unlock: -	up_write(&namespace_sem); +	namespace_unlock();  	return err;  } +static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ +	struct mount *child; +	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { +		if (!is_subdir(child->mnt_mountpoint, dentry)) +			continue; + +		if (child->mnt.mnt_flags & MNT_LOCKED) +			return true; +	} +	return false; +} +  /*   * do loopback mount.   
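A control-flow sketch of lock_mount() above: once the locks are taken, the dentry is rechecked for a mount that raced in; if one is found, the locks are dropped and the path is re-targeted at that mount's root before retrying, so the new mount always lands on top of the current stack. Everything below is a stand-in, not kernel API:

#include <stdio.h>

static int mounted_here;	/* stand-in for lookup_mnt() finding one */

static void lock_mount_toy(void)
{
	int depth = 0;
retry:
	/* mutex_lock(&dentry->d_inode->i_mutex); namespace_lock(); */
	if (mounted_here > 0) {
		mounted_here--;	/* follow to the covering mount's root */
		depth++;
		/* drop both locks, retarget path, and try again */
		goto retry;
	}
	printf("mountpoint pinned after following %d mounts\n", depth);
}

int main(void)
{
	mounted_here = 2;	/* two mounts already stacked here */
	lock_mount_toy();
	return 0;
}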
 */
-static int do_loopback(struct path *path, char *old_name,
+static int do_loopback(struct path *path, const char *old_name,
 				int recurse)
 {
 	struct path old_path;
-	struct vfsmount *mnt = NULL;
-	int err = mount_is_safe(path);
-	if (err)
-		return err;
+	struct mount *mnt = NULL, *old, *parent;
+	struct mountpoint *mp;
+	int err;
 	if (!old_name || !*old_name)
 		return -EINVAL;
-	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
+	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
 	if (err)
 		return err;
 
-	down_write(&namespace_sem);
 	err = -EINVAL;
-	if (IS_MNT_UNBINDABLE(old_path.mnt))
-		goto out;
+	if (mnt_ns_loop(old_path.dentry))
+		goto out;
 
-	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
+	mp = lock_mount(path);
+	err = PTR_ERR(mp);
+	if (IS_ERR(mp))
 		goto out;
 
-	err = -ENOMEM;
+	old = real_mount(old_path.mnt);
+	parent = real_mount(path->mnt);
+
+	err = -EINVAL;
+	if (IS_MNT_UNBINDABLE(old))
+		goto out2;
+
+	if (!check_mnt(parent) || !check_mnt(old))
+		goto out2;
+
+	if (!recurse && has_locked_children(old, old_path.dentry))
+		goto out2;
+
 	if (recurse)
-		mnt = copy_tree(old_path.mnt, old_path.dentry, 0);
+		mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
 	else
-		mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);
+		mnt = clone_mnt(old, old_path.dentry, 0);
 
-	if (!mnt)
-		goto out;
+	if (IS_ERR(mnt)) {
+		err = PTR_ERR(mnt);
+		goto out2;
+	}
 
-	err = graft_tree(mnt, path);
-	if (err) {
-		LIST_HEAD(umount_list);
+	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
 
-		br_write_lock(vfsmount_lock);
-		umount_tree(mnt, 0, &umount_list);
-		br_write_unlock(vfsmount_lock);
-		release_mounts(&umount_list);
+	err = graft_tree(mnt, parent, mp);
+	if (err) {
+		lock_mount_hash();
+		umount_tree(mnt, 0);
+		unlock_mount_hash();
 	}
-
+out2:
+	unlock_mount(mp);
out:
-	up_write(&namespace_sem);
 	path_put(&old_path);
 	return err;
 }
@@ -1597,10 +1896,13 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
 	if (readonly_request == __mnt_is_readonly(mnt))
 		return 0;
 
+	if (mnt->mnt_flags & MNT_LOCK_READONLY)
+		return -EPERM;
+
 	if (readonly_request)
-		error = mnt_make_readonly(mnt);
+		error = mnt_make_readonly(real_mount(mnt));
 	else
-		__mnt_unmake_readonly(mnt);
+		__mnt_unmake_readonly(real_mount(mnt));
 	return error;
 }
@@ -1614,39 +1916,39 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 {
 	int err;
 	struct super_block *sb = path->mnt->mnt_sb;
+	struct mount *mnt = real_mount(path->mnt);
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (!check_mnt(path->mnt))
+	if (!check_mnt(mnt))
 		return -EINVAL;
 
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
+	err = security_sb_remount(sb, data);
+	if (err)
+		return err;
+
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
 		err = change_mount_flags(path->mnt, flags);
+	else if (!capable(CAP_SYS_ADMIN))
+		err = -EPERM;
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
-		br_write_lock(vfsmount_lock);
-		mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
-		path->mnt->mnt_flags = mnt_flags;
-		br_write_unlock(vfsmount_lock);
+		lock_mount_hash();
+		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
+		mnt->mnt.mnt_flags = mnt_flags;
+		touch_mnt_namespace(mnt->mnt_ns);
+		unlock_mount_hash();
 	}
 	up_write(&sb->s_umount);
-	if (!err) {
-		br_write_lock(vfsmount_lock);
-		touch_mnt_namespace(path->mnt->mnt_ns);
-		br_write_unlock(vfsmount_lock);
-	}
 	return err;
 }
 
-static inline int tree_contains_unbindable(struct vfsmount *mnt)
+static inline int tree_contains_unbindable(struct mount *mnt)
 {
-	struct vfsmount *p;
+	struct mount *p;
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		if (IS_MNT_UNBINDABLE(p))
 			return 1;
@@ -1654,40 +1956,39 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt)
 	return 0;
 }
 
-static int do_move_mount(struct path *path, char *old_name)
+static int do_move_mount(struct path *path, const char *old_name)
 {
 	struct path old_path, parent_path;
-	struct vfsmount *p;
-	int err = 0;
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	struct mount *p;
+	struct mount *old;
+	struct mountpoint *mp;
+	int err;
 
 	if (!old_name || !*old_name)
 		return -EINVAL;
 	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
 	if (err)
 		return err;
 
-	down_write(&namespace_sem);
-	while (d_mountpoint(path->dentry) &&
-	       follow_down(path))
-		;
-	err = -EINVAL;
-	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
+	mp = lock_mount(path);
+	err = PTR_ERR(mp);
+	if (IS_ERR(mp))
 		goto out;
 
-	err = -ENOENT;
-	mutex_lock(&path->dentry->d_inode->i_mutex);
-	if (cant_mount(path->dentry))
+	old = real_mount(old_path.mnt);
+	p = real_mount(path->mnt);
+
+	err = -EINVAL;
+	if (!check_mnt(p) || !check_mnt(old))
 		goto out1;
 
-	if (d_unlinked(path->dentry))
+	if (old->mnt.mnt_flags & MNT_LOCKED)
 		goto out1;
 
 	err = -EINVAL;
 	if (old_path.dentry != old_path.mnt->mnt_root)
 		goto out1;
 
-	if (old_path.mnt == old_path.mnt->mnt_parent)
+	if (!mnt_has_parent(old))
 		goto out1;
 
 	if (S_ISDIR(path->dentry->d_inode->i_mode) !=
@@ -1696,108 +1997,194 @@ static int do_move_mount(struct path *path, char *old_name)
 	/*
 	 * Don't move a mount residing in a shared parent.
 	 */
-	if (old_path.mnt->mnt_parent &&
-	    IS_MNT_SHARED(old_path.mnt->mnt_parent))
+	if (IS_MNT_SHARED(old->mnt_parent))
 		goto out1;
 	/*
 	 * Don't move a mount tree containing unbindable mounts to a destination
 	 * mount which is shared.
 	 */
-	if (IS_MNT_SHARED(path->mnt) &&
-	    tree_contains_unbindable(old_path.mnt))
+	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
 		goto out1;
 	err = -ELOOP;
-	for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent)
-		if (p == old_path.mnt)
+	for (; mnt_has_parent(p); p = p->mnt_parent)
+		if (p == old)
 			goto out1;
 
-	err = attach_recursive_mnt(old_path.mnt, path, &parent_path);
+	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
 	if (err)
 		goto out1;
 
 	/* if the mount is moved, it should no longer be expire
 	 * automatically */
-	list_del_init(&old_path.mnt->mnt_expire);
+	list_del_init(&old->mnt_expire);
out1:
-	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	unlock_mount(mp);
out:
-	up_write(&namespace_sem);
 	if (!err)
 		path_put(&parent_path);
 	path_put(&old_path);
 	return err;
 }
 
-/*
- * create a new mount for userspace and request it to be added into the
- * namespace's tree
- */
-static int do_new_mount(struct path *path, char *type, int flags,
-			int mnt_flags, char *name, void *data)
+static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
 {
-	struct vfsmount *mnt;
-
-	if (!type)
-		return -EINVAL;
-
-	/* we need capabilities... */
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	int err;
+	const char *subtype = strchr(fstype, '.');
+	if (subtype) {
+		subtype++;
+		err = -EINVAL;
+		if (!subtype[0])
+			goto err;
+	} else
+		subtype = "";
 
-	mnt = do_kern_mount(type, flags, name, data);
-	if (IS_ERR(mnt))
-		return PTR_ERR(mnt);
+	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!mnt->mnt_sb->s_subtype)
+		goto err;
+	return mnt;
 
-	return do_add_mount(mnt, path, mnt_flags, NULL);
+ err:
+	mntput(mnt);
+	return ERR_PTR(err);
 }
 
 /*
  * add a mount into a namespace's mount tree
- * - provide the option of adding the new mount to an expiration list
  */
-int do_add_mount(struct vfsmount *newmnt, struct path *path,
-		 int mnt_flags, struct list_head *fslist)
+static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 {
+	struct mountpoint *mp;
+	struct mount *parent;
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+	mnt_flags &= ~MNT_INTERNAL_FLAGS;
 
-	down_write(&namespace_sem);
-	/* Something was mounted here while we slept */
-	while (d_mountpoint(path->dentry) &&
-	       follow_down(path))
-		;
+	mp = lock_mount(path);
+	if (IS_ERR(mp))
+		return PTR_ERR(mp);
+
+	parent = real_mount(path->mnt);
 	err = -EINVAL;
-	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
-		goto unlock;
+	if (unlikely(!check_mnt(parent))) {
+		/* that's acceptable only for automounts done in private ns */
+		if (!(mnt_flags & MNT_SHRINKABLE))
+			goto unlock;
+		/* ... and for those we'd better have mountpoint still alive */
+		if (!parent->mnt_ns)
+			goto unlock;
+	}
 
 	/* Refuse the same filesystem on the same mount point */
 	err = -EBUSY;
-	if (path->mnt->mnt_sb == newmnt->mnt_sb &&
+	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
 	    path->mnt->mnt_root == path->dentry)
 		goto unlock;
 
 	err = -EINVAL;
-	if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
+	if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
 		goto unlock;
 
-	newmnt->mnt_flags = mnt_flags;
-	if ((err = graft_tree(newmnt, path)))
-		goto unlock;
+	newmnt->mnt.mnt_flags = mnt_flags;
+	err = graft_tree(newmnt, parent, mp);
 
-	if (fslist) /* add to the specified expiration list */
-		list_add_tail(&newmnt->mnt_expire, fslist);
+unlock:
+	unlock_mount(mp);
+	return err;
+}
 
-	up_write(&namespace_sem);
-	return 0;
+/*
+ * create a new mount for userspace and request it to be added into the
+ * namespace's tree
+ */
+static int do_new_mount(struct path *path, const char *fstype, int flags,
+			int mnt_flags, const char *name, void *data)
+{
+	struct file_system_type *type;
+	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
+	struct vfsmount *mnt;
+	int err;
 
-unlock:
-	up_write(&namespace_sem);
-	mntput(newmnt);
+	if (!fstype)
+		return -EINVAL;
+
+	type = get_fs_type(fstype);
+	if (!type)
+		return -ENODEV;
+
+	if (user_ns != &init_user_ns) {
+		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
+			put_filesystem(type);
+			return -EPERM;
+		}
+		/* Only in special cases allow devices from mounts
+		 * created outside the initial user namespace.
+		 */
+		if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
+			flags |= MS_NODEV;
+			mnt_flags |= MNT_NODEV;
+		}
+	}
+
+	mnt = vfs_kern_mount(type, flags, name, data);
+	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
+	    !mnt->mnt_sb->s_subtype)
+		mnt = fs_set_subtype(mnt, fstype);
+
+	put_filesystem(type);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	err = do_add_mount(real_mount(mnt), path, mnt_flags);
+	if (err)
+		mntput(mnt);
+	return err;
+}
+
+int finish_automount(struct vfsmount *m, struct path *path)
+{
+	struct mount *mnt = real_mount(m);
+	int err;
+	/* The new mount record should have at least 2 refs to prevent it being
+	 * expired before we get a chance to add it
+	 */
+	BUG_ON(mnt_get_count(mnt) < 2);
+
+	if (m->mnt_sb == path->mnt->mnt_sb &&
+	    m->mnt_root == path->dentry) {
+		err = -ELOOP;
+		goto fail;
+	}
+
+	err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
+	if (!err)
+		return 0;
+fail:
+	/* remove m from any expiration list it may be on */
+	if (!list_empty(&mnt->mnt_expire)) {
+		namespace_lock();
+		list_del_init(&mnt->mnt_expire);
+		namespace_unlock();
+	}
+	mntput(m);
+	mntput(m);
 	return err;
 }
-EXPORT_SYMBOL_GPL(do_add_mount);
 
+/**
+ * mnt_set_expiry - Put a mount on an expiration list
+ * @mnt: The mount to list.
+ * @expiry_list: The list to add the mount to.
+ */
+void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
+{
+	namespace_lock();
+
+	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
+
+	namespace_unlock();
+}
+EXPORT_SYMBOL(mnt_set_expiry);
 
 /*
  * process a list of expirable mountpoints with the intent of discarding any
@@ -1806,15 +2193,14 @@ EXPORT_SYMBOL_GPL(do_add_mount);
  */
 void mark_mounts_for_expiry(struct list_head *mounts)
 {
-	struct vfsmount *mnt, *next;
+	struct mount *mnt, *next;
 	LIST_HEAD(graveyard);
-	LIST_HEAD(umounts);
 
 	if (list_empty(mounts))
 		return;
 
-	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
+	namespace_lock();
+	lock_mount_hash();
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
@@ -1829,14 +2215,12 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		list_move(&mnt->mnt_expire, &graveyard);
 	}
 	while (!list_empty(&graveyard)) {
-		mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire);
+		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
 		touch_mnt_namespace(mnt->mnt_ns);
-		umount_tree(mnt, 1, &umounts);
+		umount_tree(mnt, 1);
 	}
-	br_write_unlock(vfsmount_lock);
-	up_write(&namespace_sem);
-
-	release_mounts(&umounts);
+	unlock_mount_hash();
+	namespace_unlock();
 }
 
 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
@@ -1847,9 +2231,9 @@ EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
  * search the list of submounts for a given mountpoint, and move any
  * shrinkable submounts to the 'graveyard' list.
 */
-static int select_submounts(struct vfsmount *parent, struct list_head *graveyard)
+static int select_submounts(struct mount *parent, struct list_head *graveyard)
 {
-	struct vfsmount *this_parent = parent;
+	struct mount *this_parent = parent;
 	struct list_head *next;
 	int found = 0;
 
@@ -1858,10 +2242,10 @@ repeat:
 resume:
 	while (next != &this_parent->mnt_mounts) {
 		struct list_head *tmp = next;
-		struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child);
+		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
 
 		next = tmp->next;
-		if (!(mnt->mnt_flags & MNT_SHRINKABLE))
+		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
 			continue;
 		/*
 		 * Descend a level if the d_mounts list is non-empty.
@@ -1891,20 +2275,20 @@ resume:
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
- * vfsmount_lock must be held for write
+ * mount_lock must be held for write
 */
-static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
+static void shrink_submounts(struct mount *mnt)
 {
 	LIST_HEAD(graveyard);
-	struct vfsmount *m;
+	struct mount *m;
 
 	/* extract submounts of 'mountpoint' from the expiration list */
 	while (select_submounts(mnt, &graveyard)) {
 		while (!list_empty(&graveyard)) {
-			m = list_first_entry(&graveyard, struct vfsmount,
+			m = list_first_entry(&graveyard, struct mount,
 						mnt_expire);
 			touch_mnt_namespace(m->mnt_ns);
-			umount_tree(m, 1, umounts);
+			umount_tree(m, 1);
 		}
 	}
 }
@@ -2001,8 +2385,8 @@ int copy_mount_string(const void __user *data, char **where)
  * Therefore, if this magic number is present, it carries no information
  * and must be discarded.
  */
-long do_mount(char *dev_name, char *dir_name, char *type_page,
-		  unsigned long flags, void *data_page)
+long do_mount(const char *dev_name, const char *dir_name,
+		const char *type_page, unsigned long flags, void *data_page)
 {
 	struct path path;
 	int retval = 0;
@@ -2027,6 +2411,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 
 	retval = security_sb_mount(dev_name, &path,
 				   type_page, flags, data_page);
+	if (!retval && !may_mount())
+		retval = -EPERM;
 	if (retval)
 		goto dput_out;
 
@@ -2071,72 +2457,109 @@ dput_out:
 	return retval;
 }
 
-static struct mnt_namespace *alloc_mnt_ns(void)
+static void free_mnt_ns(struct mnt_namespace *ns)
+{
+	proc_free_inum(ns->proc_inum);
+	put_user_ns(ns->user_ns);
+	kfree(ns);
+}
+
+/*
+ * Assign a sequence number so we can detect when we attempt to bind
+ * mount a reference to an older mount namespace into the current
+ * mount namespace, preventing reference counting loops.  A 64bit
+ * number incrementing at 10Ghz will take 12,427 years to wrap which
+ * is effectively never, so we can ignore the possibility.
+ */
+static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
+
+static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
 {
 	struct mnt_namespace *new_ns;
+	int ret;
 
 	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
 	if (!new_ns)
 		return ERR_PTR(-ENOMEM);
+	ret = proc_alloc_inum(&new_ns->proc_inum);
+	if (ret) {
+		kfree(new_ns);
+		return ERR_PTR(ret);
+	}
+	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
 	atomic_set(&new_ns->count, 1);
 	new_ns->root = NULL;
 	INIT_LIST_HEAD(&new_ns->list);
 	init_waitqueue_head(&new_ns->poll);
 	new_ns->event = 0;
+	new_ns->user_ns = get_user_ns(user_ns);
 	return new_ns;
 }
 
-/*
- * Allocate a new namespace structure and populate it with contents
- * copied from the namespace of the passed in task structure.
- */
-static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
-		struct fs_struct *fs)
+struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
+		struct user_namespace *user_ns, struct fs_struct *new_fs)
 {
 	struct mnt_namespace *new_ns;
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
-	struct vfsmount *p, *q;
+	struct mount *p, *q;
+	struct mount *old;
+	struct mount *new;
+	int copy_flags;
+
+	BUG_ON(!ns);
 
-	new_ns = alloc_mnt_ns();
+	if (likely(!(flags & CLONE_NEWNS))) {
+		get_mnt_ns(ns);
+		return ns;
+	}
+
+	old = ns->root;
+
+	new_ns = alloc_mnt_ns(user_ns);
 	if (IS_ERR(new_ns))
 		return new_ns;
 
-	down_write(&namespace_sem);
+	namespace_lock();
 	/* First pass: copy the tree topology */
-	new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root,
-					CL_COPY_ALL | CL_EXPIRE);
-	if (!new_ns->root) {
-		up_write(&namespace_sem);
-		kfree(new_ns);
-		return ERR_PTR(-ENOMEM);
+	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
+	if (user_ns != ns->user_ns)
+		copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
+	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
+	if (IS_ERR(new)) {
+		namespace_unlock();
+		free_mnt_ns(new_ns);
+		return ERR_CAST(new);
 	}
-	br_write_lock(vfsmount_lock);
-	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-	br_write_unlock(vfsmount_lock);
+	new_ns->root = new;
+	list_add_tail(&new_ns->list, &new->mnt_list);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
 	 * as belonging to new namespace.  We have already acquired a private
 	 * fs_struct, so tsk->fs->lock is not needed.
	 */
-	p = mnt_ns->root;
-	q = new_ns->root;
+	p = old;
+	q = new;
 	while (p) {
 		q->mnt_ns = new_ns;
-		if (fs) {
-			if (p == fs->root.mnt) {
-				rootmnt = p;
-				fs->root.mnt = mntget(q);
+		if (new_fs) {
+			if (&p->mnt == new_fs->root.mnt) {
+				new_fs->root.mnt = mntget(&q->mnt);
+				rootmnt = &p->mnt;
 			}
-			if (p == fs->pwd.mnt) {
-				pwdmnt = p;
-				fs->pwd.mnt = mntget(q);
+			if (&p->mnt == new_fs->pwd.mnt) {
+				new_fs->pwd.mnt = mntget(&q->mnt);
+				pwdmnt = &p->mnt;
 			}
 		}
-		p = next_mnt(p, mnt_ns->root);
-		q = next_mnt(q, new_ns->root);
+		p = next_mnt(p, old);
+		q = next_mnt(q, new);
+		if (!q)
+			break;
+		while (p->mnt.mnt_root != q->mnt.mnt_root)
+			p = next_mnt(p, old);
 	}
-	up_write(&namespace_sem);
+	namespace_unlock();
 
 	if (rootmnt)
 		mntput(rootmnt);
@@ -2146,47 +2569,60 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	return new_ns;
 }
 
-struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
-		struct fs_struct *new_fs)
-{
-	struct mnt_namespace *new_ns;
-
-	BUG_ON(!ns);
-	get_mnt_ns(ns);
-
-	if (!(flags & CLONE_NEWNS))
-		return ns;
-
-	new_ns = dup_mnt_ns(ns, new_fs);
-
-	put_mnt_ns(ns);
-	return new_ns;
-}
-
 /**
  * create_mnt_ns - creates a private namespace and adds a root filesystem
  * @mnt: pointer to the new root filesystem mountpoint
  */
-struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt)
+static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
 {
-	struct mnt_namespace *new_ns;
-
-	new_ns = alloc_mnt_ns();
+	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
 	if (!IS_ERR(new_ns)) {
+		struct mount *mnt = real_mount(m);
 		mnt->mnt_ns = new_ns;
 		new_ns->root = mnt;
-		list_add(&new_ns->list, &new_ns->root->mnt_list);
+		list_add(&mnt->mnt_list, &new_ns->list);
+	} else {
+		mntput(m);
 	}
 	return new_ns;
 }
-EXPORT_SYMBOL(create_mnt_ns);
+
+struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
+{
+	struct mnt_namespace *ns;
+	struct super_block *s;
+	struct path path;
+	int err;
+
+	ns = create_mnt_ns(mnt);
+	if (IS_ERR(ns))
+		return ERR_CAST(ns);
+
+	err = vfs_path_lookup(mnt->mnt_root, mnt,
+			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
+
+	put_mnt_ns(ns);
+
+	if (err)
+		return ERR_PTR(err);
+
+	/* trade a vfsmount reference for active sb one */
+	s = path.mnt->mnt_sb;
+	atomic_inc(&s->s_active);
+	mntput(path.mnt);
+	/* lock the sucker */
+	down_write(&s->s_umount);
+	/* ... and return the root of (sub)tree on it */
+	return path.dentry;
+}
+EXPORT_SYMBOL(mount_subtree);
 
 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 		char __user *, type, unsigned long, flags, void __user *, data)
 {
 	int ret;
 	char *kernel_type;
-	char *kernel_dir;
+	struct filename *kernel_dir;
 	char *kernel_dev;
 	unsigned long data_page;
 
@@ -2208,7 +2644,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 	if (ret < 0)
 		goto out_data;
 
-	ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags,
+	ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags,
 		(void *) data_page);
 
 	free_page(data_page);
@@ -2223,6 +2659,31 @@ out_type:
 }
 
 /*
+ * Return true if path is reachable from root
+ *
+ * namespace_sem or mount_lock is held
+ */
+bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
+			 const struct path *root)
+{
+	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
+		dentry = mnt->mnt_mountpoint;
+		mnt = mnt->mnt_parent;
+	}
+	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
+}
+
+int path_is_under(struct path *path1, struct path *path2)
+{
+	int res;
+	read_seqlock_excl(&mount_lock);
+	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
+	read_sequnlock_excl(&mount_lock);
+	return res;
+}
+EXPORT_SYMBOL(path_is_under);
+
+/*
  * pivot_root Semantics:
  * Moves the root file system of the current process to the directory put_old,
  * makes new_root as the new root file system of the current process, and sets
@@ -2250,99 +2711,94 @@ out_type:
 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		const char __user *, put_old)
 {
-	struct vfsmount *tmp;
 	struct path new, old, parent_path, root_parent, root;
+	struct mount *new_mnt, *root_mnt, *old_mnt;
+	struct mountpoint *old_mp, *root_mp;
 	int error;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!may_mount())
 		return -EPERM;
 
 	error = user_path_dir(new_root, &new);
 	if (error)
 		goto out0;
-	error = -EINVAL;
-	if (!check_mnt(new.mnt))
-		goto out1;
 
 	error = user_path_dir(put_old, &old);
 	if (error)
 		goto out1;
 
 	error = security_sb_pivotroot(&old, &new);
-	if (error) {
-		path_put(&old);
-		goto out1;
-	}
+	if (error)
+		goto out2;
 
 	get_fs_root(current->fs, &root);
-	down_write(&namespace_sem);
-	mutex_lock(&old.dentry->d_inode->i_mutex);
+	old_mp = lock_mount(&old);
+	error = PTR_ERR(old_mp);
+	if (IS_ERR(old_mp))
+		goto out3;
+
 	error = -EINVAL;
-	if (IS_MNT_SHARED(old.mnt) ||
-		IS_MNT_SHARED(new.mnt->mnt_parent) ||
-		IS_MNT_SHARED(root.mnt->mnt_parent))
-		goto out2;
-	if (!check_mnt(root.mnt))
-		goto out2;
+	new_mnt = real_mount(new.mnt);
+	root_mnt = real_mount(root.mnt);
+	old_mnt = real_mount(old.mnt);
+	if (IS_MNT_SHARED(old_mnt) ||
+		IS_MNT_SHARED(new_mnt->mnt_parent) ||
+		IS_MNT_SHARED(root_mnt->mnt_parent))
+		goto out4;
+	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
+		goto out4;
+	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
+		goto out4;
 	error = -ENOENT;
-	if (cant_mount(old.dentry))
-		goto out2;
 	if (d_unlinked(new.dentry))
-		goto out2;
-	if (d_unlinked(old.dentry))
-		goto out2;
+		goto out4;
 	error = -EBUSY;
-	if (new.mnt == root.mnt ||
-	    old.mnt == root.mnt)
-		goto out2; /* loop, on the same file system  */
+	if (new_mnt == root_mnt || old_mnt == root_mnt)
+		goto out4; /* loop, on the same file system  */
 	error = -EINVAL;
 	if (root.mnt->mnt_root != root.dentry)
-		goto out2; /* not a mountpoint */
-	if (root.mnt->mnt_parent == root.mnt)
-		goto out2; /* not attached */
+		goto out4; /* not a mountpoint */
+	if (!mnt_has_parent(root_mnt))
+		goto out4; /* not attached */
+	root_mp = root_mnt->mnt_mp;
 	if (new.mnt->mnt_root != new.dentry)
-		goto out2; /* not a mountpoint */
-	if (new.mnt->mnt_parent == new.mnt)
-		goto out2; /* not attached */
+		goto out4; /* not a mountpoint */
+	if (!mnt_has_parent(new_mnt))
+		goto out4; /* not attached */
 	/* make sure we can reach put_old from new_root */
-	tmp = old.mnt;
-	br_write_lock(vfsmount_lock);
-	if (tmp != new.mnt) {
-		for (;;) {
-			if (tmp->mnt_parent == tmp)
-				goto out3; /* already mounted on put_old */
-			if (tmp->mnt_parent == new.mnt)
-				break;
-			tmp = tmp->mnt_parent;
-		}
-		if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
-			goto out3;
-	} else if (!is_subdir(old.dentry, new.dentry))
-		goto out3;
-	detach_mnt(new.mnt, &parent_path);
-	detach_mnt(root.mnt, &root_parent);
+	if (!is_path_reachable(old_mnt, old.dentry, &new))
+		goto out4;
+	root_mp->m_count++; /* pin it so it won't go away */
+	lock_mount_hash();
+	detach_mnt(new_mnt, &parent_path);
+	detach_mnt(root_mnt, &root_parent);
+	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
+		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
+		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
+	}
 	/* mount old root on put_old */
-	attach_mnt(root.mnt, &old);
+	attach_mnt(root_mnt, old_mnt, old_mp);
 	/* mount new_root on / */
-	attach_mnt(new.mnt, &root_parent);
+	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
-	br_write_unlock(vfsmount_lock);
+	unlock_mount_hash();
 	chroot_fs_refs(&root, &new);
+	put_mountpoint(root_mp);
 	error = 0;
-	path_put(&root_parent);
-	path_put(&parent_path);
-out2:
-	mutex_unlock(&old.dentry->d_inode->i_mutex);
-	up_write(&namespace_sem);
+out4:
+	unlock_mount(old_mp);
+	if (!error) {
+		path_put(&root_parent);
+		path_put(&parent_path);
+	}
+out3:
 	path_put(&root);
+out2:
 	path_put(&old);
out1:
 	path_put(&new);
out0:
 	return error;
-out3:
-	br_write_unlock(vfsmount_lock);
-	goto out2;
 }
 
 static void __init init_mount_tree(void)
@@ -2350,10 +2806,16 @@ static void __init init_mount_tree(void)
 	struct vfsmount *mnt;
 	struct mnt_namespace *ns;
 	struct path root;
+	struct file_system_type *type;
 
-	mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
+	type = get_fs_type("rootfs");
+	if (!type)
+		panic("Can't find rootfs type");
+	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
+	put_filesystem(type);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
+
 	ns = create_mnt_ns(mnt);
 	if (IS_ERR(ns))
 		panic("Can't allocate initial namespace");
@@ -2361,8 +2823,8 @@ static void __init init_mount_tree(void)
 	init_task.nsproxy->mnt_ns = ns;
 	get_mnt_ns(ns);
 
-	root.mnt = ns->root;
-	root.dentry = ns->root->mnt_root;
+	root.mnt = mnt;
+	root.dentry = mnt->mnt_root;
 
 	set_fs_pwd(current->fs, &root);
 	set_fs_root(current->fs, &root);
@@ -2373,22 +2835,29 @@ void __init mnt_init(void)
 	unsigned u;
 	int err;
 
-	init_rwsem(&namespace_sem);
-
-	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
+	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
 			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
 
-	mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
-
-	if (!mount_hashtable)
+	mount_hashtable = alloc_large_system_hash("Mount-cache",
+				sizeof(struct hlist_head),
+				mhash_entries, 19,
+				0,
+				&m_hash_shift, &m_hash_mask, 0, 0);
+	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
+				sizeof(struct hlist_head),
+				mphash_entries, 19,
+				0,
+				&mp_hash_shift, &mp_hash_mask, 0, 0);
+
+	if (!mount_hashtable || !mountpoint_hashtable)
 		panic("Failed to allocate mount hash table\n");
 
-	printk("Mount-cache hash table entries: %lu\n", HASH_SIZE);
-
-	for (u = 0; u < HASH_SIZE; u++)
-		INIT_LIST_HEAD(&mount_hashtable[u]);
+	for (u = 0; u <= m_hash_mask; u++)
+		INIT_HLIST_HEAD(&mount_hashtable[u]);
+	for (u = 0; u <= mp_hash_mask; u++)
+		INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
 
-	br_lock_init(vfsmount_lock);
+	kernfs_init();
 
 	err = sysfs_init();
 	if (err)
@@ -2403,16 +2872,166 @@ void __init mnt_init(void)
 
 void put_mnt_ns(struct mnt_namespace *ns)
 {
-	LIST_HEAD(umount_list);
-
 	if (!atomic_dec_and_test(&ns->count))
 		return;
-	down_write(&namespace_sem);
-	br_write_lock(vfsmount_lock);
-	umount_tree(ns->root, 0, &umount_list);
-	br_write_unlock(vfsmount_lock);
-	up_write(&namespace_sem);
-	release_mounts(&umount_list);
-	kfree(ns);
+	drop_collected_mounts(&ns->root->mnt);
+	free_mnt_ns(ns);
 }
-EXPORT_SYMBOL(put_mnt_ns);
+
+struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
+{
+	struct vfsmount *mnt;
+	mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
+	if (!IS_ERR(mnt)) {
+		/*
+		 * it is a longterm mount, don't release mnt until
+		 * we unmount before file sys is unregistered
+		*/
+		real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
+	}
+	return mnt;
+}
+EXPORT_SYMBOL_GPL(kern_mount_data);
+
+void kern_unmount(struct vfsmount *mnt)
+{
+	/* release long term mount so mount point can be released */
+	if (!IS_ERR_OR_NULL(mnt)) {
+		real_mount(mnt)->mnt_ns = NULL;
+		synchronize_rcu();	/* yecchhh... */
+		mntput(mnt);
+	}
+}
+EXPORT_SYMBOL(kern_unmount);
+
+bool our_mnt(struct vfsmount *mnt)
+{
+	return check_mnt(real_mount(mnt));
+}
+
+bool current_chrooted(void)
+{
+	/* Does the current process have a non-standard root */
+	struct path ns_root;
+	struct path fs_root;
+	bool chrooted;
+
+	/* Find the namespace root */
+	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
+	ns_root.dentry = ns_root.mnt->mnt_root;
+	path_get(&ns_root);
+	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
+		;
+
+	get_fs_root(current->fs, &fs_root);
+
+	chrooted = !path_equal(&fs_root, &ns_root);
+
+	path_put(&fs_root);
+	path_put(&ns_root);
+
+	return chrooted;
+}
+
+bool fs_fully_visible(struct file_system_type *type)
+{
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct mount *mnt;
+	bool visible = false;
+
+	if (unlikely(!ns))
+		return false;
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		struct mount *child;
+		if (mnt->mnt.mnt_sb->s_type != type)
+			continue;
+
+		/* This mount is not fully visible if there are any child mounts
+		 * that cover anything except for empty directories.
+		 */
+		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
+			struct inode *inode = child->mnt_mountpoint->d_inode;
+			if (!S_ISDIR(inode->i_mode))
+				goto next;
+			if (inode->i_nlink > 2)
+				goto next;
+		}
+		visible = true;
+		goto found;
+	next:	;
+	}
+found:
+	up_read(&namespace_sem);
+	return visible;
+}
+
+static void *mntns_get(struct task_struct *task)
+{
+	struct mnt_namespace *ns = NULL;
+	struct nsproxy *nsproxy;
+
+	rcu_read_lock();
+	nsproxy = task_nsproxy(task);
+	if (nsproxy) {
+		ns = nsproxy->mnt_ns;
+		get_mnt_ns(ns);
+	}
+	rcu_read_unlock();
+
+	return ns;
+}
+
+static void mntns_put(void *ns)
+{
+	put_mnt_ns(ns);
+}
+
+static int mntns_install(struct nsproxy *nsproxy, void *ns)
+{
+	struct fs_struct *fs = current->fs;
+	struct mnt_namespace *mnt_ns = ns;
+	struct path root;
+
+	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
+	    !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (fs->users != 1)
+		return -EINVAL;
+
+	get_mnt_ns(mnt_ns);
+	put_mnt_ns(nsproxy->mnt_ns);
+	nsproxy->mnt_ns = mnt_ns;
+
+	/* Find the root */
+	root.mnt    = &mnt_ns->root->mnt;
+	root.dentry = mnt_ns->root->mnt.mnt_root;
+	path_get(&root);
+	while(d_mountpoint(root.dentry) && follow_down_one(&root))
+		;
+
+	/* Update the pwd and root */
+	set_fs_pwd(fs, &root);
+	set_fs_root(fs, &root);
+
+	path_put(&root);
+	return 0;
+}
+
+static unsigned int mntns_inum(void *ns)
+{
+	struct mnt_namespace *mnt_ns = ns;
+	return mnt_ns->proc_inum;
+}
+
+const struct proc_ns_operations mntns_operations = {
+	.name		= "mnt",
+	.type		= CLONE_NEWNS,
+	.get		= mntns_get,
+	.put		= mntns_put,
+	.install	= mntns_install,
+	.inum		= mntns_inum,
+};
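For readers tracing the syscall entry points this diff reworks, here is a hypothetical userspace demo (not part of the patch): mount(2) with MS_BIND reaches do_loopback(), a bare propagation flag reaches do_change_type(), and MS_REMOUNT|MS_BIND reaches change_mount_flags(). The paths "/mnt/src" and "/mnt/dst" are placeholders; root privileges are assumed.

	/* bind-propagation-demo.c: drives do_loopback(), do_change_type()
	 * and change_mount_flags() through the mount(2) paths shown above. */
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* MS_BIND (optionally | MS_REC) ends up in do_loopback() */
		if (mount("/mnt/src", "/mnt/dst", NULL, MS_BIND, NULL) < 0) {
			perror("bind mount");
			return EXIT_FAILURE;
		}
		/* exactly one of MS_SHARED/MS_PRIVATE/MS_SLAVE/MS_UNBINDABLE,
		 * with no other flags, ends up in do_change_type() */
		if (mount(NULL, "/mnt/dst", NULL, MS_SHARED, NULL) < 0) {
			perror("make shared");
			return EXIT_FAILURE;
		}
		/* MS_REMOUNT|MS_BIND changes only per-mount flags and is
		 * handled by change_mount_flags() */
		if (mount(NULL, "/mnt/dst", NULL,
			  MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) < 0) {
			perror("remount read-only");
			return EXIT_FAILURE;
		}
		return EXIT_SUCCESS;
	}

Likewise, a sketch of the sequence a container init might use against the pivot_root(2) implementation above; "/newroot" and its "oldroot" subdirectory are hypothetical and must already exist, and the caller's root must not be a shared mount.

	/* pivotroot-demo.c: pivot into /newroot, then detach the old root */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mount.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		/* new_root must itself be a mount point, hence the self-bind */
		if (mount("/newroot", "/newroot", NULL, MS_BIND, NULL) < 0) {
			perror("bind");
			return EXIT_FAILURE;
		}
		if (chdir("/newroot") < 0) {
			perror("chdir");
			return EXIT_FAILURE;
		}
		/* put_old must be reachable from new_root, which the syscall
		 * verifies via is_path_reachable() above */
		if (syscall(SYS_pivot_root, ".", "oldroot") < 0) {
			perror("pivot_root");
			return EXIT_FAILURE;
		}
		if (chroot(".") < 0) {
			perror("chroot");
			return EXIT_FAILURE;
		}
		/* detach the old root once nothing uses it */
		if (umount2("/oldroot", MNT_DETACH) < 0)
			perror("umount2");
		return EXIT_SUCCESS;
	}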
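Finally, mntns_install() above is the kernel side of setns(2) when a task joins an existing mount namespace. A minimal sketch (hypothetical PID argument; needs CAP_SYS_ADMIN over the target namespace plus CAP_SYS_CHROOT, as the ns_capable() checks above require):

	/* joinmntns.c: open /proc/<pid>/ns/mnt and join it with setns(2) */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		char path[64];
		int fd;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <pid>\n", argv[0]);
			return EXIT_FAILURE;
		}
		snprintf(path, sizeof(path), "/proc/%s/ns/mnt", argv[1]);
		fd = open(path, O_RDONLY);
		if (fd < 0) {
			perror("open");
			return EXIT_FAILURE;
		}
		/* CLONE_NEWNS makes setns() insist the fd refers to a mount
		 * namespace; on success the kernel lands in mntns_install(),
		 * which also resets root and cwd to the namespace root. */
		if (setns(fd, CLONE_NEWNS) < 0) {
			perror("setns");
			close(fd);
			return EXIT_FAILURE;
		}
		close(fd);
		execl("/bin/sh", "sh", (char *)NULL);
		perror("execl");
		return EXIT_FAILURE;
	}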