Diffstat (limited to 'fs/namespace.c')
 fs/namespace.c | 2424
 1 file changed, 1568 insertions(+), 856 deletions(-)
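For orientation before the diff body: the central API change below splits mnt_want_write() into a raw __mnt_want_write() and a freeze-aware outer helper that brackets it with sb_start_write()/sb_end_write(). A minimal, hedged caller-side sketch of the pairing; the helper name touch_victim is hypothetical, everything else uses only interfaces that appear in this patch:

#include <linux/mount.h>
#include <linux/path.h>

/*
 * Hedged sketch, not part of the patch: shows how a caller pairs
 * mnt_want_write() with mnt_drop_write() after this change.
 * mnt_want_write() takes sb_start_write() plus the per-mount writer
 * count; mnt_drop_write() releases both in the opposite order.
 */
static int touch_victim(struct path *path)
{
	int err;

	err = mnt_want_write(path->mnt);	/* -EROFS if the mount is read-only */
	if (err)
		return err;

	/* ... modify something under path->dentry ... */

	mnt_drop_write(path->mnt);
	return 0;
}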
diff --git a/fs/namespace.c b/fs/namespace.c index 7d70d63ceb2..182bc41cd88 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -9,87 +9,119 @@ */ #include <linux/syscalls.h> -#include <linux/slab.h> -#include <linux/sched.h> -#include <linux/smp_lock.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/acct.h> +#include <linux/export.h> #include <linux/capability.h> -#include <linux/cpumask.h> -#include <linux/module.h> -#include <linux/sysfs.h> -#include <linux/seq_file.h> #include <linux/mnt_namespace.h> +#include <linux/user_namespace.h> #include <linux/namei.h> -#include <linux/nsproxy.h> #include <linux/security.h> -#include <linux/mount.h> -#include <linux/ramfs.h> -#include <linux/log2.h> #include <linux/idr.h> -#include <linux/fs_struct.h> -#include <asm/uaccess.h> -#include <asm/unistd.h> +#include <linux/acct.h> /* acct_auto_close_mnt */ +#include <linux/init.h> /* init_rootfs */ +#include <linux/fs_struct.h> /* get_fs_root et.al. */ +#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ +#include <linux/uaccess.h> +#include <linux/proc_ns.h> +#include <linux/magic.h> +#include <linux/bootmem.h> #include "pnode.h" #include "internal.h" -#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) -#define HASH_SIZE (1UL << HASH_SHIFT) +static unsigned int m_hash_mask __read_mostly; +static unsigned int m_hash_shift __read_mostly; +static unsigned int mp_hash_mask __read_mostly; +static unsigned int mp_hash_shift __read_mostly; -/* spinlock for vfsmount related operations, inplace of dcache_lock */ -__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); +static __initdata unsigned long mhash_entries; +static int __init set_mhash_entries(char *str) +{ + if (!str) + return 0; + mhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mhash_entries=", set_mhash_entries); + +static __initdata unsigned long mphash_entries; +static int __init set_mphash_entries(char *str) +{ + if (!str) + return 0; + mphash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mphash_entries=", set_mphash_entries); -static int event; +static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); +static DEFINE_SPINLOCK(mnt_id_lock); static int mnt_id_start = 0; static int mnt_group_start = 1; -static struct list_head *mount_hashtable __read_mostly; +static struct hlist_head *mount_hashtable __read_mostly; +static struct hlist_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; -static struct rw_semaphore namespace_sem; +static DECLARE_RWSEM(namespace_sem); /* /sys/fs */ struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); -static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) +/* + * vfsmount lock may be taken for read to prevent changes to the + * vfsmount hash, ie. during mountpoint lookups or walking back + * up the tree. + * + * It should be taken for write in all cases where the vfsmount + * tree or hash is modified or when a vfsmount structure is modified. 
+ */ +__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); + +static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); - tmp = tmp + (tmp >> HASH_SHIFT); - return tmp & (HASH_SIZE - 1); + tmp = tmp + (tmp >> m_hash_shift); + return &mount_hashtable[tmp & m_hash_mask]; } -#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) +static inline struct hlist_head *mp_hash(struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> mp_hash_shift); + return &mountpoint_hashtable[tmp & mp_hash_mask]; +} -/* allocation is serialized by namespace_sem */ -static int mnt_alloc_id(struct vfsmount *mnt) +/* + * allocation is serialized by namespace_sem, but we need the spinlock to + * serialize with freeing. + */ +static int mnt_alloc_id(struct mount *mnt) { int res; retry: ida_pre_get(&mnt_id_ida, GFP_KERNEL); - spin_lock(&vfsmount_lock); + spin_lock(&mnt_id_lock); res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); if (!res) mnt_id_start = mnt->mnt_id + 1; - spin_unlock(&vfsmount_lock); + spin_unlock(&mnt_id_lock); if (res == -EAGAIN) goto retry; return res; } -static void mnt_free_id(struct vfsmount *mnt) +static void mnt_free_id(struct mount *mnt) { int id = mnt->mnt_id; - spin_lock(&vfsmount_lock); + spin_lock(&mnt_id_lock); ida_remove(&mnt_id_ida, id); if (mnt_id_start > id) mnt_id_start = id; - spin_unlock(&vfsmount_lock); + spin_unlock(&mnt_id_lock); } /* @@ -97,7 +129,7 @@ static void mnt_free_id(struct vfsmount *mnt) * * mnt_group_ida is protected by namespace_sem */ -static int mnt_alloc_group_id(struct vfsmount *mnt) +static int mnt_alloc_group_id(struct mount *mnt) { int res; @@ -116,7 +148,7 @@ static int mnt_alloc_group_id(struct vfsmount *mnt) /* * Release a peer group ID */ -void mnt_release_group_id(struct vfsmount *mnt) +void mnt_release_group_id(struct mount *mnt) { int id = mnt->mnt_group_id; ida_remove(&mnt_group_ida, id); @@ -125,9 +157,42 @@ void mnt_release_group_id(struct vfsmount *mnt) mnt->mnt_group_id = 0; } -struct vfsmount *alloc_vfsmnt(const char *name) +/* + * vfsmount lock must be held for read + */ +static inline void mnt_add_count(struct mount *mnt, int n) { - struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); +#ifdef CONFIG_SMP + this_cpu_add(mnt->mnt_pcp->mnt_count, n); +#else + preempt_disable(); + mnt->mnt_count += n; + preempt_enable(); +#endif +} + +/* + * vfsmount lock must be held for write + */ +unsigned int mnt_get_count(struct mount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; + } + + return count; +#else + return mnt->mnt_count; +#endif +} + +static struct mount *alloc_vfsmnt(const char *name) +{ + struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { int err; @@ -141,8 +206,18 @@ struct vfsmount *alloc_vfsmnt(const char *name) goto out_free_id; } - atomic_set(&mnt->mnt_count, 1); - INIT_LIST_HEAD(&mnt->mnt_hash); +#ifdef CONFIG_SMP + mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); + if (!mnt->mnt_pcp) + goto out_free_devname; + + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); +#else + mnt->mnt_count = 1; + mnt->mnt_writers = 0; +#endif + + INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); @@ -150,12 +225,8 @@ struct vfsmount *alloc_vfsmnt(const char 
*name) INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); -#ifdef CONFIG_SMP - mnt->mnt_writers = alloc_percpu(int); - if (!mnt->mnt_writers) - goto out_free_devname; -#else - mnt->mnt_writers = 0; +#ifdef CONFIG_FSNOTIFY + INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif } return mnt; @@ -200,32 +271,32 @@ int __mnt_is_readonly(struct vfsmount *mnt) } EXPORT_SYMBOL_GPL(__mnt_is_readonly); -static inline void inc_mnt_writers(struct vfsmount *mnt) +static inline void mnt_inc_writers(struct mount *mnt) { #ifdef CONFIG_SMP - (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; + this_cpu_inc(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers++; #endif } -static inline void dec_mnt_writers(struct vfsmount *mnt) +static inline void mnt_dec_writers(struct mount *mnt) { #ifdef CONFIG_SMP - (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; + this_cpu_dec(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers--; #endif } -static unsigned int count_mnt_writers(struct vfsmount *mnt) +static unsigned int mnt_get_writers(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { - count += *per_cpu_ptr(mnt->mnt_writers, cpu); + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; } return count; @@ -234,37 +305,45 @@ static unsigned int count_mnt_writers(struct vfsmount *mnt) #endif } +static int mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_sb->s_readonly_remount) + return 1; + /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ + smp_rmb(); + return __mnt_is_readonly(mnt); +} + /* - * Most r/o checks on a fs are for operations that take - * discrete amounts of time, like a write() or unlink(). - * We must keep track of when those operations start - * (for permission checks) and when they end, so that - * we can determine when writes are able to occur to - * a filesystem. + * Most r/o & frozen checks on a fs are for operations that take discrete + * amounts of time, like a write() or unlink(). We must keep track of when + * those operations start (for permission checks) and when they end, so that we + * can determine when writes are able to occur to a filesystem. */ /** - * mnt_want_write - get write access to a mount - * @mnt: the mount on which to take a write + * __mnt_want_write - get write access to a mount without freeze protection + * @m: the mount on which to take a write * - * This tells the low-level filesystem that a write is - * about to be performed to it, and makes sure that - * writes are allowed before returning success. When - * the write operation is finished, mnt_drop_write() - * must be called. This is effectively a refcount. + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mnt it read-write) before + * returning success. This operation does not protect against filesystem being + * frozen. When the write operation is finished, __mnt_drop_write() must be + * called. This is effectively a refcount. */ -int mnt_want_write(struct vfsmount *mnt) +int __mnt_want_write(struct vfsmount *m) { + struct mount *mnt = real_mount(m); int ret = 0; preempt_disable(); - inc_mnt_writers(mnt); + mnt_inc_writers(mnt); /* - * The store to inc_mnt_writers must be visible before we pass + * The store to mnt_inc_writers must be visible before we pass * MNT_WRITE_HOLD loop below, so that the slowpath can see our * incremented count after it has set MNT_WRITE_HOLD. 
*/ smp_mb(); - while (mnt->mnt_flags & MNT_WRITE_HOLD) + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) cpu_relax(); /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will @@ -272,13 +351,32 @@ int mnt_want_write(struct vfsmount *mnt) * MNT_WRITE_HOLD is cleared. */ smp_rmb(); - if (__mnt_is_readonly(mnt)) { - dec_mnt_writers(mnt); + if (mnt_is_readonly(m)) { + mnt_dec_writers(mnt); ret = -EROFS; - goto out; } -out: preempt_enable(); + + return ret; +} + +/** + * mnt_want_write - get write access to a mount + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mount is read-write, filesystem + * is not frozen) before returning success. When the write operation is + * finished, mnt_drop_write() must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *m) +{ + int ret; + + sb_start_write(m->mnt_sb); + ret = __mnt_want_write(m); + if (ret) + sb_end_write(m->mnt_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); @@ -301,13 +399,28 @@ int mnt_clone_write(struct vfsmount *mnt) if (__mnt_is_readonly(mnt)) return -EROFS; preempt_disable(); - inc_mnt_writers(mnt); + mnt_inc_writers(real_mount(mnt)); preempt_enable(); return 0; } EXPORT_SYMBOL_GPL(mnt_clone_write); /** + * __mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like __mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int __mnt_want_write_file(struct file *file) +{ + if (!(file->f_mode & FMODE_WRITER)) + return __mnt_want_write(file->f_path.mnt); + else + return mnt_clone_write(file->f_path.mnt); +} + +/** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * @@ -316,36 +429,63 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); */ int mnt_want_write_file(struct file *file) { - struct inode *inode = file->f_dentry->d_inode; - if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) - return mnt_want_write(file->f_path.mnt); - else - return mnt_clone_write(file->f_path.mnt); + int ret; + + sb_start_write(file->f_path.mnt->mnt_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file->f_path.mnt->mnt_sb); + return ret; } EXPORT_SYMBOL_GPL(mnt_want_write_file); /** - * mnt_drop_write - give up write access to a mount + * __mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with - * mnt_want_write() call above. + * __mnt_want_write() call above. */ -void mnt_drop_write(struct vfsmount *mnt) +void __mnt_drop_write(struct vfsmount *mnt) { preempt_disable(); - dec_mnt_writers(mnt); + mnt_dec_writers(real_mount(mnt)); preempt_enable(); } + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done performing writes to it and + * also allows filesystem to be frozen again. Must be matched with + * mnt_want_write() call above. 
+ */ +void mnt_drop_write(struct vfsmount *mnt) +{ + __mnt_drop_write(mnt); + sb_end_write(mnt->mnt_sb); +} EXPORT_SYMBOL_GPL(mnt_drop_write); -static int mnt_make_readonly(struct vfsmount *mnt) +void __mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write(file->f_path.mnt); +} + +void mnt_drop_write_file(struct file *file) +{ + mnt_drop_write(file->f_path.mnt); +} +EXPORT_SYMBOL(mnt_drop_write_file); + +static int mnt_make_readonly(struct mount *mnt) { int ret = 0; - spin_lock(&vfsmount_lock); - mnt->mnt_flags |= MNT_WRITE_HOLD; + lock_mount_hash(); + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store * should be visible before we do. @@ -368,89 +508,217 @@ static int mnt_make_readonly(struct vfsmount *mnt) * MNT_WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ - if (count_mnt_writers(mnt) > 0) + if (mnt_get_writers(mnt) > 0) ret = -EBUSY; else - mnt->mnt_flags |= MNT_READONLY; + mnt->mnt.mnt_flags |= MNT_READONLY; /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); - mnt->mnt_flags &= ~MNT_WRITE_HOLD; - spin_unlock(&vfsmount_lock); + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + unlock_mount_hash(); return ret; } -static void __mnt_unmake_readonly(struct vfsmount *mnt) +static void __mnt_unmake_readonly(struct mount *mnt) { - spin_lock(&vfsmount_lock); - mnt->mnt_flags &= ~MNT_READONLY; - spin_unlock(&vfsmount_lock); + lock_mount_hash(); + mnt->mnt.mnt_flags &= ~MNT_READONLY; + unlock_mount_hash(); } -void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) +int sb_prepare_remount_readonly(struct super_block *sb) { - mnt->mnt_sb = sb; - mnt->mnt_root = dget(sb->s_root); -} + struct mount *mnt; + int err = 0; + + /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + if (atomic_long_read(&sb->s_remove_count)) + return -EBUSY; + + lock_mount_hash(); + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + smp_mb(); + if (mnt_get_writers(mnt) > 0) { + err = -EBUSY; + break; + } + } + } + if (!err && atomic_long_read(&sb->s_remove_count)) + err = -EBUSY; + + if (!err) { + sb->s_readonly_remount = 1; + smp_wmb(); + } + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + } + unlock_mount_hash(); -EXPORT_SYMBOL(simple_set_mnt); + return err; +} -void free_vfsmnt(struct vfsmount *mnt) +static void free_vfsmnt(struct mount *mnt) { kfree(mnt->mnt_devname); - mnt_free_id(mnt); #ifdef CONFIG_SMP - free_percpu(mnt->mnt_writers); + free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } +static void delayed_free_vfsmnt(struct rcu_head *head) +{ + free_vfsmnt(container_of(head, struct mount, mnt_rcu)); +} + +/* call under rcu_read_lock */ +bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +{ + struct mount *mnt; + if (read_seqretry(&mount_lock, seq)) + return false; + if (bastard == NULL) + return true; + mnt = real_mount(bastard); + mnt_add_count(mnt, 1); + if (likely(!read_seqretry(&mount_lock, seq))) + return true; + if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { + mnt_add_count(mnt, -1); + return false; + } + rcu_read_unlock(); + mntput(bastard); + rcu_read_lock(); + return false; +} + /* - * find the first or last mount at @dentry on vfsmount @mnt depending on - * @dir. 
If @dir is set return the first mount else return the last mount. + * find the first mount at @dentry on vfsmount @mnt. + * call under rcu_read_lock() */ -struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, - int dir) +struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { - struct list_head *head = mount_hashtable + hash(mnt, dentry); - struct list_head *tmp = head; - struct vfsmount *p, *found = NULL; + struct hlist_head *head = m_hash(mnt, dentry); + struct mount *p; - for (;;) { - tmp = dir ? tmp->next : tmp->prev; - p = NULL; - if (tmp == head) - break; - p = list_entry(tmp, struct vfsmount, mnt_hash); - if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { - found = p; + hlist_for_each_entry_rcu(p, head, mnt_hash) + if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) + return p; + return NULL; +} + +/* + * find the last mount at @dentry on vfsmount @mnt. + * mount_lock must be held. + */ +struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) +{ + struct mount *p, *res; + res = p = __lookup_mnt(mnt, dentry); + if (!p) + goto out; + hlist_for_each_entry_continue(p, mnt_hash) { + if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) break; - } + res = p; } - return found; +out: + return res; } /* - * lookup_mnt increments the ref count before returning - * the vfsmount struct. + * lookup_mnt - Return the first child mount mounted at path + * + * "First" means first mounted chronologically. If you create the + * following mounts: + * + * mount /dev/sda1 /mnt + * mount /dev/sda2 /mnt + * mount /dev/sda3 /mnt + * + * Then lookup_mnt() on the base /mnt dentry in the root mount will + * return successively the root dentry and vfsmount of /dev/sda1, then + * /dev/sda2, then /dev/sda3, then NULL. + * + * lookup_mnt takes a reference to the found vfsmount. */ struct vfsmount *lookup_mnt(struct path *path) { - struct vfsmount *child_mnt; - spin_lock(&vfsmount_lock); - if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) - mntget(child_mnt); - spin_unlock(&vfsmount_lock); - return child_mnt; + struct mount *child_mnt; + struct vfsmount *m; + unsigned seq; + + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + child_mnt = __lookup_mnt(path->mnt, path->dentry); + m = child_mnt ? 
&child_mnt->mnt : NULL; + } while (!legitimize_mnt(m, seq)); + rcu_read_unlock(); + return m; +} + +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct hlist_head *chain = mp_hash(dentry); + struct mountpoint *mp; + int ret; + + hlist_for_each_entry(mp, chain, m_hash) { + if (mp->m_dentry == dentry) { + /* might be worth a WARN_ON() */ + if (d_unlinked(dentry)) + return ERR_PTR(-ENOENT); + mp->m_count++; + return mp; + } + } + + mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!mp) + return ERR_PTR(-ENOMEM); + + ret = d_set_mounted(dentry); + if (ret) { + kfree(mp); + return ERR_PTR(ret); + } + + mp->m_dentry = dentry; + mp->m_count = 1; + hlist_add_head(&mp->m_hash, chain); + return mp; +} + +static void put_mountpoint(struct mountpoint *mp) +{ + if (!--mp->m_count) { + struct dentry *dentry = mp->m_dentry; + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); + hlist_del(&mp->m_hash); + kfree(mp); + } } -static inline int check_mnt(struct vfsmount *mnt) +static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } +/* + * vfsmount lock must be held for write + */ static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { @@ -459,6 +727,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns) } } +/* + * vfsmount lock must be held for write + */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { @@ -467,40 +738,54 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) } } -static void detach_mnt(struct vfsmount *mnt, struct path *old_path) +/* + * vfsmount lock must be held for write + */ +static void detach_mnt(struct mount *mnt, struct path *old_path) { old_path->dentry = mnt->mnt_mountpoint; - old_path->mnt = mnt->mnt_parent; + old_path->mnt = &mnt->mnt_parent->mnt; mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); - list_del_init(&mnt->mnt_hash); - old_path->dentry->d_mounted--; + hlist_del_init_rcu(&mnt->mnt_hash); + put_mountpoint(mnt->mnt_mp); + mnt->mnt_mp = NULL; } -void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, - struct vfsmount *child_mnt) +/* + * vfsmount lock must be held for write + */ +void mnt_set_mountpoint(struct mount *mnt, + struct mountpoint *mp, + struct mount *child_mnt) { - child_mnt->mnt_parent = mntget(mnt); - child_mnt->mnt_mountpoint = dget(dentry); - dentry->d_mounted++; + mp->m_count++; + mnt_add_count(mnt, 1); /* essentially, that's mntget */ + child_mnt->mnt_mountpoint = dget(mp->m_dentry); + child_mnt->mnt_parent = mnt; + child_mnt->mnt_mp = mp; } -static void attach_mnt(struct vfsmount *mnt, struct path *path) +/* + * vfsmount lock must be held for write + */ +static void attach_mnt(struct mount *mnt, + struct mount *parent, + struct mountpoint *mp) { - mnt_set_mountpoint(path->mnt, path->dentry, mnt); - list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(path->mnt, path->dentry)); - list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); + mnt_set_mountpoint(parent, mp, mnt); + hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /* - * the caller must hold vfsmount_lock + * vfsmount lock must be held for write */ -static void commit_tree(struct vfsmount *mnt) +static void commit_tree(struct mount *mnt, struct mount *shadows) { - struct vfsmount *parent = mnt->mnt_parent; - struct 
vfsmount *m; + struct mount *parent = mnt->mnt_parent; + struct mount *m; LIST_HEAD(head); struct mnt_namespace *n = parent->mnt_ns; @@ -509,15 +794,19 @@ static void commit_tree(struct vfsmount *mnt) list_add_tail(&head, &mnt->mnt_list); list_for_each_entry(m, &head, mnt_list) m->mnt_ns = n; + list_splice(&head, n->list.prev); - list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(parent, mnt->mnt_mountpoint)); + if (shadows) + hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + else + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); touch_mnt_namespace(n); } -static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) +static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { @@ -530,75 +819,157 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) p = p->mnt_parent; } } - return list_entry(next, struct vfsmount, mnt_child); + return list_entry(next, struct mount, mnt_child); } -static struct vfsmount *skip_mnt_tree(struct vfsmount *p) +static struct mount *skip_mnt_tree(struct mount *p) { struct list_head *prev = p->mnt_mounts.prev; while (prev != &p->mnt_mounts) { - p = list_entry(prev, struct vfsmount, mnt_child); + p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev; } return p; } -static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, +struct vfsmount * +vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +{ + struct mount *mnt; + struct dentry *root; + + if (!type) + return ERR_PTR(-ENODEV); + + mnt = alloc_vfsmnt(name); + if (!mnt) + return ERR_PTR(-ENOMEM); + + if (flags & MS_KERNMOUNT) + mnt->mnt.mnt_flags = MNT_INTERNAL; + + root = mount_fs(type, flags, name, data); + if (IS_ERR(root)) { + mnt_free_id(mnt); + free_vfsmnt(mnt); + return ERR_CAST(root); + } + + mnt->mnt.mnt_root = root; + mnt->mnt.mnt_sb = root->d_sb; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; + lock_mount_hash(); + list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); + unlock_mount_hash(); + return &mnt->mnt; +} +EXPORT_SYMBOL_GPL(vfs_kern_mount); + +static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { - struct super_block *sb = old->mnt_sb; - struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); + struct super_block *sb = old->mnt.mnt_sb; + struct mount *mnt; + int err; - if (mnt) { - if (flag & (CL_SLAVE | CL_PRIVATE)) - mnt->mnt_group_id = 0; /* not a peer of original */ - else - mnt->mnt_group_id = old->mnt_group_id; - - if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { - int err = mnt_alloc_group_id(mnt); - if (err) - goto out_free; - } + mnt = alloc_vfsmnt(old->mnt_devname); + if (!mnt) + return ERR_PTR(-ENOMEM); - mnt->mnt_flags = old->mnt_flags; - atomic_inc(&sb->s_active); - mnt->mnt_sb = sb; - mnt->mnt_root = dget(root); - mnt->mnt_mountpoint = mnt->mnt_root; - mnt->mnt_parent = mnt; - - if (flag & CL_SLAVE) { - list_add(&mnt->mnt_slave, &old->mnt_slave_list); - mnt->mnt_master = old; - CLEAR_MNT_SHARED(mnt); - } else if (!(flag & CL_PRIVATE)) { - if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old)) - list_add(&mnt->mnt_share, &old->mnt_share); - if (IS_MNT_SLAVE(old)) - list_add(&mnt->mnt_slave, &old->mnt_slave); - mnt->mnt_master = old->mnt_master; - } - if (flag & CL_MAKE_SHARED) - set_mnt_shared(mnt); - - /* stick the duplicate mount on the 
same expiry list - * as the original if that was on one */ - if (flag & CL_EXPIRE) { - if (!list_empty(&old->mnt_expire)) - list_add(&mnt->mnt_expire, &old->mnt_expire); - } + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) + mnt->mnt_group_id = 0; /* not a peer of original */ + else + mnt->mnt_group_id = old->mnt_group_id; + + if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { + err = mnt_alloc_group_id(mnt); + if (err) + goto out_free; + } + + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); + /* Don't allow unprivileged users to change mount flags */ + if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + + /* Don't allow unprivileged users to reveal what is under a mount */ + if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) + mnt->mnt.mnt_flags |= MNT_LOCKED; + + atomic_inc(&sb->s_active); + mnt->mnt.mnt_sb = sb; + mnt->mnt.mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; + lock_mount_hash(); + list_add_tail(&mnt->mnt_instance, &sb->s_mounts); + unlock_mount_hash(); + + if ((flag & CL_SLAVE) || + ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { + list_add(&mnt->mnt_slave, &old->mnt_slave_list); + mnt->mnt_master = old; + CLEAR_MNT_SHARED(mnt); + } else if (!(flag & CL_PRIVATE)) { + if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) + list_add(&mnt->mnt_share, &old->mnt_share); + if (IS_MNT_SLAVE(old)) + list_add(&mnt->mnt_slave, &old->mnt_slave); + mnt->mnt_master = old->mnt_master; + } + if (flag & CL_MAKE_SHARED) + set_mnt_shared(mnt); + + /* stick the duplicate mount on the same expiry list + * as the original if that was on one */ + if (flag & CL_EXPIRE) { + if (!list_empty(&old->mnt_expire)) + list_add(&mnt->mnt_expire, &old->mnt_expire); } + return mnt; out_free: + mnt_free_id(mnt); free_vfsmnt(mnt); - return NULL; + return ERR_PTR(err); } -static inline void __mntput(struct vfsmount *mnt) +static void mntput_no_expire(struct mount *mnt) { - struct super_block *sb = mnt->mnt_sb; +put_again: + rcu_read_lock(); + mnt_add_count(mnt, -1); + if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ + rcu_read_unlock(); + return; + } + lock_mount_hash(); + if (mnt_get_count(mnt)) { + rcu_read_unlock(); + unlock_mount_hash(); + return; + } + if (unlikely(mnt->mnt_pinned)) { + mnt_add_count(mnt, mnt->mnt_pinned + 1); + mnt->mnt_pinned = 0; + rcu_read_unlock(); + unlock_mount_hash(); + acct_auto_close_mnt(&mnt->mnt); + goto put_again; + } + if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { + rcu_read_unlock(); + unlock_mount_hash(); + return; + } + mnt->mnt.mnt_flags |= MNT_DOOMED; + rcu_read_unlock(); + + list_del(&mnt->mnt_instance); + unlock_mount_hash(); + /* * This probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this @@ -606,54 +977,55 @@ static inline void __mntput(struct vfsmount *mnt) * to make r/w->r/o transitions. */ /* - * atomic_dec_and_lock() used to deal with ->mnt_count decrements - * provides barriers, so count_mnt_writers() below is safe. AV + * The locking used to deal with mnt_count decrement provides barriers, + * so mnt_get_writers() below is safe. 
*/ - WARN_ON(count_mnt_writers(mnt)); - dput(mnt->mnt_root); - free_vfsmnt(mnt); - deactivate_super(sb); + WARN_ON(mnt_get_writers(mnt)); + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } -void mntput_no_expire(struct vfsmount *mnt) +void mntput(struct vfsmount *mnt) { -repeat: - if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { - if (likely(!mnt->mnt_pinned)) { - spin_unlock(&vfsmount_lock); - __mntput(mnt); - return; - } - atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); - mnt->mnt_pinned = 0; - spin_unlock(&vfsmount_lock); - acct_auto_close_mnt(mnt); - security_sb_umount_close(mnt); - goto repeat; + if (mnt) { + struct mount *m = real_mount(mnt); + /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ + if (unlikely(m->mnt_expiry_mark)) + m->mnt_expiry_mark = 0; + mntput_no_expire(m); } } +EXPORT_SYMBOL(mntput); -EXPORT_SYMBOL(mntput_no_expire); +struct vfsmount *mntget(struct vfsmount *mnt) +{ + if (mnt) + mnt_add_count(real_mount(mnt), 1); + return mnt; +} +EXPORT_SYMBOL(mntget); void mnt_pin(struct vfsmount *mnt) { - spin_lock(&vfsmount_lock); - mnt->mnt_pinned++; - spin_unlock(&vfsmount_lock); + lock_mount_hash(); + real_mount(mnt)->mnt_pinned++; + unlock_mount_hash(); } - EXPORT_SYMBOL(mnt_pin); -void mnt_unpin(struct vfsmount *mnt) +void mnt_unpin(struct vfsmount *m) { - spin_lock(&vfsmount_lock); + struct mount *mnt = real_mount(m); + lock_mount_hash(); if (mnt->mnt_pinned) { - atomic_inc(&mnt->mnt_count); + mnt_add_count(mnt, 1); mnt->mnt_pinned--; } - spin_unlock(&vfsmount_lock); + unlock_mount_hash(); } - EXPORT_SYMBOL(mnt_unpin); static inline void mangle(struct seq_file *m, const char *s) @@ -667,12 +1039,12 @@ static inline void mangle(struct seq_file *m, const char *s) * * See also save_mount_options(). */ -int generic_show_options(struct seq_file *m, struct vfsmount *mnt) +int generic_show_options(struct seq_file *m, struct dentry *root) { const char *options; rcu_read_lock(); - options = rcu_dereference(mnt->mnt_sb->s_options); + options = rcu_dereference(root->d_sb->s_options); if (options != NULL && options[0]) { seq_putc(m, ','); @@ -716,20 +1088,35 @@ void replace_mount_options(struct super_block *sb, char *options) EXPORT_SYMBOL(replace_mount_options); #ifdef CONFIG_PROC_FS -/* iterator */ +/* iterator; we want it to have access to namespace_sem, thus here... 
*/ static void *m_start(struct seq_file *m, loff_t *pos) { - struct proc_mounts *p = m->private; + struct proc_mounts *p = proc_mounts(m); down_read(&namespace_sem); - return seq_list_start(&p->ns->list, *pos); + if (p->cached_event == p->ns->event) { + void *v = p->cached_mount; + if (*pos == p->cached_index) + return v; + if (*pos == p->cached_index + 1) { + v = seq_list_next(v, &p->ns->list, &p->cached_index); + return p->cached_mount = v; + } + } + + p->cached_event = p->ns->event; + p->cached_mount = seq_list_start(&p->ns->list, *pos); + p->cached_index = *pos; + return p->cached_mount; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = m->private; + struct proc_mounts *p = proc_mounts(m); - return seq_list_next(v, &p->ns->list, pos); + p->cached_mount = seq_list_next(v, &p->ns->list, pos); + p->cached_index = *pos; + return p->cached_mount; } static void m_stop(struct seq_file *m, void *v) @@ -737,186 +1124,18 @@ static void m_stop(struct seq_file *m, void *v) up_read(&namespace_sem); } -struct proc_fs_info { - int flag; - const char *str; -}; - -static int show_sb_opts(struct seq_file *m, struct super_block *sb) -{ - static const struct proc_fs_info fs_info[] = { - { MS_SYNCHRONOUS, ",sync" }, - { MS_DIRSYNC, ",dirsync" }, - { MS_MANDLOCK, ",mand" }, - { 0, NULL } - }; - const struct proc_fs_info *fs_infop; - - for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { - if (sb->s_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } - - return security_sb_show_options(m, sb); -} - -static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) -{ - static const struct proc_fs_info mnt_info[] = { - { MNT_NOSUID, ",nosuid" }, - { MNT_NODEV, ",nodev" }, - { MNT_NOEXEC, ",noexec" }, - { MNT_NOATIME, ",noatime" }, - { MNT_NODIRATIME, ",nodiratime" }, - { MNT_RELATIME, ",relatime" }, - { MNT_STRICTATIME, ",strictatime" }, - { 0, NULL } - }; - const struct proc_fs_info *fs_infop; - - for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { - if (mnt->mnt_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } -} - -static void show_type(struct seq_file *m, struct super_block *sb) +static int m_show(struct seq_file *m, void *v) { - mangle(m, sb->s_type->name); - if (sb->s_subtype && sb->s_subtype[0]) { - seq_putc(m, '.'); - mangle(m, sb->s_subtype); - } -} - -static int show_vfsmnt(struct seq_file *m, void *v) -{ - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); - seq_putc(m, ' '); - seq_path(m, &mnt_path, " \t\n\\"); - seq_putc(m, ' '); - show_type(m, mnt->mnt_sb); - seq_puts(m, __mnt_is_readonly(mnt) ? 
" ro" : " rw"); - err = show_sb_opts(m, mnt->mnt_sb); - if (err) - goto out; - show_mnt_opts(m, mnt); - if (mnt->mnt_sb->s_op->show_options) - err = mnt->mnt_sb->s_op->show_options(m, mnt); - seq_puts(m, " 0 0\n"); -out: - return err; + struct proc_mounts *p = proc_mounts(m); + struct mount *r = list_entry(v, struct mount, mnt_list); + return p->show(m, &r->mnt); } const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_vfsmnt -}; - -static int show_mountinfo(struct seq_file *m, void *v) -{ - struct proc_mounts *p = m->private; - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - struct super_block *sb = mnt->mnt_sb; - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - struct path root = p->root; - int err = 0; - - seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, - MAJOR(sb->s_dev), MINOR(sb->s_dev)); - seq_dentry(m, mnt->mnt_root, " \t\n\\"); - seq_putc(m, ' '); - seq_path_root(m, &mnt_path, &root, " \t\n\\"); - if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { - /* - * Mountpoint is outside root, discard that one. Ugly, - * but less so than trying to do that in iterator in a - * race-free way (due to renames). - */ - return SEQ_SKIP; - } - seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); - show_mnt_opts(m, mnt); - - /* Tagged fields ("foo:X" or "bar") */ - if (IS_MNT_SHARED(mnt)) - seq_printf(m, " shared:%i", mnt->mnt_group_id); - if (IS_MNT_SLAVE(mnt)) { - int master = mnt->mnt_master->mnt_group_id; - int dom = get_dominating_id(mnt, &p->root); - seq_printf(m, " master:%i", master); - if (dom && dom != master) - seq_printf(m, " propagate_from:%i", dom); - } - if (IS_MNT_UNBINDABLE(mnt)) - seq_puts(m, " unbindable"); - - /* Filesystem specific data */ - seq_puts(m, " - "); - show_type(m, sb); - seq_putc(m, ' '); - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); - seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); - err = show_sb_opts(m, sb); - if (err) - goto out; - if (sb->s_op->show_options) - err = sb->s_op->show_options(m, mnt); - seq_putc(m, '\n'); -out: - return err; -} - -const struct seq_operations mountinfo_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_mountinfo, -}; - -static int show_vfsstat(struct seq_file *m, void *v) -{ - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - int err = 0; - - /* device */ - if (mnt->mnt_devname) { - seq_puts(m, "device "); - mangle(m, mnt->mnt_devname); - } else - seq_puts(m, "no device"); - - /* mount point */ - seq_puts(m, " mounted on "); - seq_path(m, &mnt_path, " \t\n\\"); - seq_putc(m, ' '); - - /* file system type */ - seq_puts(m, "with fstype "); - show_type(m, mnt->mnt_sb); - - /* optional statistics */ - if (mnt->mnt_sb->s_op->show_stats) { - seq_putc(m, ' '); - err = mnt->mnt_sb->s_op->show_stats(m, mnt); - } - - seq_putc(m, '\n'); - return err; -} - -const struct seq_operations mountstats_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_vfsstat, + .show = m_show, }; #endif /* CONFIG_PROC_FS */ @@ -928,18 +1147,21 @@ const struct seq_operations mountstats_op = { * open files, pwds, chroots or sub mounts that are * busy. 
*/ -int may_umount_tree(struct vfsmount *mnt) +int may_umount_tree(struct vfsmount *m) { + struct mount *mnt = real_mount(m); int actual_refs = 0; int minimum_refs = 0; - struct vfsmount *p; + struct mount *p; + BUG_ON(!m); - spin_lock(&vfsmount_lock); + /* write lock needed for mnt_get_count */ + lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { - actual_refs += atomic_read(&p->mnt_count); + actual_refs += mnt_get_count(p); minimum_refs += 2; } - spin_unlock(&vfsmount_lock); + unlock_mount_hash(); if (actual_refs > minimum_refs) return 0; @@ -965,71 +1187,106 @@ EXPORT_SYMBOL(may_umount_tree); int may_umount(struct vfsmount *mnt) { int ret = 1; - spin_lock(&vfsmount_lock); - if (propagate_mount_busy(mnt, 2)) + down_read(&namespace_sem); + lock_mount_hash(); + if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; - spin_unlock(&vfsmount_lock); + unlock_mount_hash(); + up_read(&namespace_sem); return ret; } EXPORT_SYMBOL(may_umount); -void release_mounts(struct list_head *head) +static HLIST_HEAD(unmounted); /* protected by namespace_sem */ + +static void namespace_unlock(void) { - struct vfsmount *mnt; - while (!list_empty(head)) { - mnt = list_first_entry(head, struct vfsmount, mnt_hash); - list_del_init(&mnt->mnt_hash); - if (mnt->mnt_parent != mnt) { - struct dentry *dentry; - struct vfsmount *m; - spin_lock(&vfsmount_lock); - dentry = mnt->mnt_mountpoint; - m = mnt->mnt_parent; - mnt->mnt_mountpoint = mnt->mnt_root; - mnt->mnt_parent = mnt; - m->mnt_ghosts--; - spin_unlock(&vfsmount_lock); - dput(dentry); - mntput(m); - } - mntput(mnt); + struct mount *mnt; + struct hlist_head head = unmounted; + + if (likely(hlist_empty(&head))) { + up_write(&namespace_sem); + return; + } + + head.first->pprev = &head.first; + INIT_HLIST_HEAD(&unmounted); + + up_write(&namespace_sem); + + synchronize_rcu(); + + while (!hlist_empty(&head)) { + mnt = hlist_entry(head.first, struct mount, mnt_hash); + hlist_del_init(&mnt->mnt_hash); + if (mnt->mnt_ex_mountpoint.mnt) + path_put(&mnt->mnt_ex_mountpoint); + mntput(&mnt->mnt); } } -void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) +static inline void namespace_lock(void) { - struct vfsmount *p; + down_write(&namespace_sem); +} - for (p = mnt; p; p = next_mnt(p, mnt)) - list_move(&p->mnt_hash, kill); +/* + * mount_lock must be held + * namespace_sem must be held for write + * how = 0 => just this tree, don't propagate + * how = 1 => propagate; we know that nobody else has reference to any victims + * how = 2 => lazy umount + */ +void umount_tree(struct mount *mnt, int how) +{ + HLIST_HEAD(tmp_list); + struct mount *p; + struct mount *last = NULL; - if (propagate) - propagate_umount(kill); + for (p = mnt; p; p = next_mnt(p, mnt)) { + hlist_del_init_rcu(&p->mnt_hash); + hlist_add_head(&p->mnt_hash, &tmp_list); + } + + if (how) + propagate_umount(&tmp_list); - list_for_each_entry(p, kill, mnt_hash) { + hlist_for_each_entry(p, &tmp_list, mnt_hash) { list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; + if (how < 2) + p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; list_del_init(&p->mnt_child); - if (p->mnt_parent != p) { - p->mnt_parent->mnt_ghosts++; - p->mnt_mountpoint->d_mounted--; + if (mnt_has_parent(p)) { + put_mountpoint(p->mnt_mp); + /* move the reference to mountpoint into ->mnt_ex_mountpoint */ + p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; + p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; + p->mnt_mountpoint = p->mnt.mnt_root; + p->mnt_parent = p; + p->mnt_mp = 
NULL; } change_mnt_propagation(p, MS_PRIVATE); + last = p; + } + if (last) { + last->mnt_hash.next = unmounted.first; + unmounted.first = tmp_list.first; + unmounted.first->pprev = &unmounted.first; } } -static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); +static void shrink_submounts(struct mount *mnt); -static int do_umount(struct vfsmount *mnt, int flags) +static int do_umount(struct mount *mnt, int flags) { - struct super_block *sb = mnt->mnt_sb; + struct super_block *sb = mnt->mnt.mnt_sb; int retval; - LIST_HEAD(umount_list); - retval = security_sb_umount(mnt, flags); + retval = security_sb_umount(&mnt->mnt, flags); if (retval) return retval; @@ -1040,12 +1297,20 @@ static int do_umount(struct vfsmount *mnt, int flags) * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] */ if (flags & MNT_EXPIRE) { - if (mnt == current->fs->root.mnt || + if (&mnt->mnt == current->fs->root.mnt || flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; - if (atomic_read(&mnt->mnt_count) != 2) + /* + * probably don't strictly need the lock here if we examined + * all race cases, but it's a slowpath. + */ + lock_mount_hash(); + if (mnt_get_count(mnt) != 2) { + unlock_mount_hash(); return -EBUSY; + } + unlock_mount_hash(); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; @@ -1074,7 +1339,7 @@ static int do_umount(struct vfsmount *mnt, int flags) * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ - if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { + if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. @@ -1086,27 +1351,36 @@ static int do_umount(struct vfsmount *mnt, int flags) return retval; } - down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + namespace_lock(); + lock_mount_hash(); event++; - if (!(flags & MNT_DETACH)) - shrink_submounts(mnt, &umount_list); - - retval = -EBUSY; - if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { + if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 1, &umount_list); + umount_tree(mnt, 2); retval = 0; + } else { + shrink_submounts(mnt); + retval = -EBUSY; + if (!propagate_mount_busy(mnt, 2)) { + if (!list_empty(&mnt->mnt_list)) + umount_tree(mnt, 1); + retval = 0; + } } - spin_unlock(&vfsmount_lock); - if (retval) - security_sb_umount_busy(mnt); - up_write(&namespace_sem); - release_mounts(&umount_list); + unlock_mount_hash(); + namespace_unlock(); return retval; } +/* + * Is the caller allowed to modify his namespace? + */ +static inline bool may_mount(void) +{ + return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); +} + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. 
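The refcounting behind mntget()/mntput() and the mnt_get_count() check in do_umount() above relies on per-CPU counters: increments only touch the local CPU's slot, but an exact total has to sum every possible CPU while writers are excluded, which is why the patch reads the count under lock_mount_hash(). A reduced, hedged sketch of that general pattern; the pcp_ref names are hypothetical and not part of the patch:

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

/*
 * Hedged sketch of the per-CPU refcount shape used via mnt_pcp above.
 * Adjusting the count is lock-free; totalling it is only meaningful
 * while concurrent writers are held off.
 */
struct pcp_ref {
	int __percpu *count;
};

static int pcp_ref_init(struct pcp_ref *r)
{
	r->count = alloc_percpu(int);
	return r->count ? 0 : -ENOMEM;
}

static inline void pcp_ref_add(struct pcp_ref *r, int n)
{
	this_cpu_add(*r->count, n);		/* local CPU slot only */
}

static int pcp_ref_total(struct pcp_ref *r)
{
	int cpu, sum = 0;

	for_each_possible_cpu(cpu)		/* caller excludes writers */
		sum += *per_cpu_ptr(r->count, cpu);
	return sum;
}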
@@ -1118,26 +1392,36 @@ static int do_umount(struct vfsmount *mnt, int flags) SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { struct path path; + struct mount *mnt; int retval; + int lookup_flags = 0; + + if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) + return -EINVAL; + + if (!may_mount()) + return -EPERM; + + if (!(flags & UMOUNT_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; - retval = user_path(name, &path); + retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; + mnt = real_mount(path.mnt); retval = -EINVAL; if (path.dentry != path.mnt->mnt_root) goto dput_and_out; - if (!check_mnt(path.mnt)) + if (!check_mnt(mnt)) goto dput_and_out; - - retval = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (mnt->mnt.mnt_flags & MNT_LOCKED) goto dput_and_out; - retval = do_umount(path.mnt, flags); + retval = do_umount(mnt, flags); dput_and_out: /* we mustn't call path_put() as that would clear mnt_expiry_mark */ dput(path.dentry); - mntput_no_expire(path.mnt); + mntput_no_expire(mnt); out: return retval; } @@ -1154,45 +1438,67 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) #endif -static int mount_is_safe(struct path *path) +static bool is_mnt_ns_file(struct dentry *dentry) { - if (capable(CAP_SYS_ADMIN)) - return 0; - return -EPERM; -#ifdef notyet - if (S_ISLNK(path->dentry->d_inode->i_mode)) - return -EPERM; - if (path->dentry->d_inode->i_mode & S_ISVTX) { - if (current_uid() != path->dentry->d_inode->i_uid) - return -EPERM; - } - if (inode_permission(path->dentry->d_inode, MAY_WRITE)) - return -EPERM; - return 0; -#endif + /* Is this a proxy for a mount namespace? */ + struct inode *inode = dentry->d_inode; + struct proc_ns *ei; + + if (!proc_ns_inode(inode)) + return false; + + ei = get_proc_ns(inode); + if (ei->ns_ops != &mntns_operations) + return false; + + return true; +} + +static bool mnt_ns_loop(struct dentry *dentry) +{ + /* Could bind mounting the mount namespace inode cause a + * mount namespace loop? 
+ */ + struct mnt_namespace *mnt_ns; + if (!is_mnt_ns_file(dentry)) + return false; + + mnt_ns = get_proc_ns(dentry->d_inode)->ns; + return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } -struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, +struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { - struct vfsmount *res, *p, *q, *r, *s; - struct path path; + struct mount *res, *p, *q, *r, *parent; + + if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) + return ERR_PTR(-EINVAL); - if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) - return NULL; + if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) + return ERR_PTR(-EINVAL); res = q = clone_mnt(mnt, dentry, flag); - if (!q) - goto Enomem; + if (IS_ERR(q)) + return q; + + q->mnt.mnt_flags &= ~MNT_LOCKED; q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { + struct mount *s; if (!is_subdir(r->mnt_mountpoint, dentry)) continue; for (s = r; s; s = next_mnt(s, r)) { - if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { + if (!(flag & CL_COPY_UNBINDABLE) && + IS_MNT_UNBINDABLE(s)) { + s = skip_mnt_tree(s); + continue; + } + if (!(flag & CL_COPY_MNT_NS_FILE) && + is_mnt_ns_file(s->mnt.mnt_root)) { s = skip_mnt_tree(s); continue; } @@ -1201,52 +1507,67 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, q = q->mnt_parent; } p = s; - path.mnt = q; - path.dentry = p->mnt_mountpoint; - q = clone_mnt(p, p->mnt_root, flag); - if (!q) - goto Enomem; - spin_lock(&vfsmount_lock); + parent = q; + q = clone_mnt(p, p->mnt.mnt_root, flag); + if (IS_ERR(q)) + goto out; + lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, &path); - spin_unlock(&vfsmount_lock); + attach_mnt(q, parent, p->mnt_mp); + unlock_mount_hash(); } } return res; -Enomem: +out: if (res) { - LIST_HEAD(umount_list); - spin_lock(&vfsmount_lock); - umount_tree(res, 0, &umount_list); - spin_unlock(&vfsmount_lock); - release_mounts(&umount_list); + lock_mount_hash(); + umount_tree(res, 0); + unlock_mount_hash(); } - return NULL; + return q; } +/* Caller should check returned pointer for errors */ + struct vfsmount *collect_mounts(struct path *path) { - struct vfsmount *tree; - down_write(&namespace_sem); - tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); - up_write(&namespace_sem); - return tree; + struct mount *tree; + namespace_lock(); + tree = copy_tree(real_mount(path->mnt), path->dentry, + CL_COPY_ALL | CL_PRIVATE); + namespace_unlock(); + if (IS_ERR(tree)) + return ERR_CAST(tree); + return &tree->mnt; } void drop_collected_mounts(struct vfsmount *mnt) { - LIST_HEAD(umount_list); - down_write(&namespace_sem); - spin_lock(&vfsmount_lock); - umount_tree(mnt, 0, &umount_list); - spin_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); + namespace_lock(); + lock_mount_hash(); + umount_tree(real_mount(mnt), 0); + unlock_mount_hash(); + namespace_unlock(); +} + +int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + struct vfsmount *root) +{ + struct mount *mnt; + int res = f(root, arg); + if (res) + return res; + list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { + res = f(&mnt->mnt, arg); + if (res) + return res; + } + return 0; } -static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) +static void cleanup_group_ids(struct mount *mnt, struct mount *end) { - struct vfsmount *p; + struct mount *p; for (p = mnt; p != end; p = next_mnt(p, 
mnt)) { if (p->mnt_group_id && !IS_MNT_SHARED(p)) @@ -1254,9 +1575,9 @@ static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) } } -static int invent_group_ids(struct vfsmount *mnt, bool recurse) +static int invent_group_ids(struct mount *mnt, bool recurse) { - struct vfsmount *p; + struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { @@ -1334,80 +1655,124 @@ static int invent_group_ids(struct vfsmount *mnt, bool recurse) * Must be called without spinlocks held, since this function can sleep * in allocations. */ -static int attach_recursive_mnt(struct vfsmount *source_mnt, - struct path *path, struct path *parent_path) -{ - LIST_HEAD(tree_list); - struct vfsmount *dest_mnt = path->mnt; - struct dentry *dest_dentry = path->dentry; - struct vfsmount *child, *p; +static int attach_recursive_mnt(struct mount *source_mnt, + struct mount *dest_mnt, + struct mountpoint *dest_mp, + struct path *parent_path) +{ + HLIST_HEAD(tree_list); + struct mount *child, *p; + struct hlist_node *n; int err; if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; - } - err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); - if (err) - goto out_cleanup_ids; - - if (IS_MNT_SHARED(dest_mnt)) { + err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + lock_mount_hash(); + if (err) + goto out_cleanup_ids; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); + } else { + lock_mount_hash(); } - - spin_lock(&vfsmount_lock); if (parent_path) { detach_mnt(source_mnt, parent_path); - attach_mnt(source_mnt, path); - touch_mnt_namespace(parent_path->mnt->mnt_ns); + attach_mnt(source_mnt, dest_mnt, dest_mp); + touch_mnt_namespace(source_mnt->mnt_ns); } else { - mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); - commit_tree(source_mnt); + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + commit_tree(source_mnt, NULL); } - list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { - list_del_init(&child->mnt_hash); - commit_tree(child); + hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { + struct mount *q; + hlist_del_init(&child->mnt_hash); + q = __lookup_mnt_last(&child->mnt_parent->mnt, + child->mnt_mountpoint); + commit_tree(child, q); } - spin_unlock(&vfsmount_lock); + unlock_mount_hash(); + return 0; out_cleanup_ids: - if (IS_MNT_SHARED(dest_mnt)) - cleanup_group_ids(source_mnt, NULL); + while (!hlist_empty(&tree_list)) { + child = hlist_entry(tree_list.first, struct mount, mnt_hash); + umount_tree(child, 0); + } + unlock_mount_hash(); + cleanup_group_ids(source_mnt, NULL); out: return err; } -static int graft_tree(struct vfsmount *mnt, struct path *path) +static struct mountpoint *lock_mount(struct path *path) { - int err; - if (mnt->mnt_sb->s_flags & MS_NOUSER) + struct vfsmount *mnt; + struct dentry *dentry = path->dentry; +retry: + mutex_lock(&dentry->d_inode->i_mutex); + if (unlikely(cant_mount(dentry))) { + mutex_unlock(&dentry->d_inode->i_mutex); + return ERR_PTR(-ENOENT); + } + namespace_lock(); + mnt = lookup_mnt(path); + if (likely(!mnt)) { + struct mountpoint *mp = new_mountpoint(dentry); + if (IS_ERR(mp)) { + namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); + return mp; + } + return mp; + } + namespace_unlock(); + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); + path->mnt = mnt; + dentry = path->dentry = dget(mnt->mnt_root); + goto retry; +} + +static void unlock_mount(struct mountpoint *where) +{ 
+ struct dentry *dentry = where->m_dentry; + put_mountpoint(where); + namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); +} + +static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) +{ + if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) return -EINVAL; - if (S_ISDIR(path->dentry->d_inode->i_mode) != - S_ISDIR(mnt->mnt_root->d_inode->i_mode)) + if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != + S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) return -ENOTDIR; - err = -ENOENT; - mutex_lock(&path->dentry->d_inode->i_mutex); - if (IS_DEADDIR(path->dentry->d_inode)) - goto out_unlock; + return attach_recursive_mnt(mnt, p, mp, NULL); +} - err = security_sb_check_sb(mnt, path); - if (err) - goto out_unlock; +/* + * Sanity check the flags to change_mnt_propagation. + */ - err = -ENOENT; - if (!d_unlinked(path->dentry)) - err = attach_recursive_mnt(mnt, path, NULL); -out_unlock: - mutex_unlock(&path->dentry->d_inode->i_mutex); - if (!err) - security_sb_post_addmount(mnt, path); - return err; +static int flags_to_propagation_type(int flags) +{ + int type = flags & ~(MS_REC | MS_SILENT); + + /* Fail if any non-propagation flags are set */ + if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return 0; + /* Only one propagation flag should be set */ + if (!is_power_of_2(type)) + return 0; + return type; } /* @@ -1415,79 +1780,108 @@ out_unlock: */ static int do_change_type(struct path *path, int flag) { - struct vfsmount *m, *mnt = path->mnt; + struct mount *m; + struct mount *mnt = real_mount(path->mnt); int recurse = flag & MS_REC; - int type = flag & ~MS_REC; + int type; int err = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (path->dentry != path->mnt->mnt_root) return -EINVAL; - down_write(&namespace_sem); + type = flags_to_propagation_type(flag); + if (!type) + return -EINVAL; + + namespace_lock(); if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) goto out_unlock; } - spin_lock(&vfsmount_lock); + lock_mount_hash(); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - spin_unlock(&vfsmount_lock); + unlock_mount_hash(); out_unlock: - up_write(&namespace_sem); + namespace_unlock(); return err; } +static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + struct mount *child; + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, dentry)) + continue; + + if (child->mnt.mnt_flags & MNT_LOCKED) + return true; + } + return false; +} + /* * do loopback mount. 
*/ -static int do_loopback(struct path *path, char *old_name, +static int do_loopback(struct path *path, const char *old_name, int recurse) { struct path old_path; - struct vfsmount *mnt = NULL; - int err = mount_is_safe(path); - if (err) - return err; + struct mount *mnt = NULL, *old, *parent; + struct mountpoint *mp; + int err; if (!old_name || !*old_name) return -EINVAL; - err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); + err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); if (err) return err; - down_write(&namespace_sem); err = -EINVAL; - if (IS_MNT_UNBINDABLE(old_path.mnt)) - goto out; + if (mnt_ns_loop(old_path.dentry)) + goto out; - if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) goto out; - err = -ENOMEM; + old = real_mount(old_path.mnt); + parent = real_mount(path->mnt); + + err = -EINVAL; + if (IS_MNT_UNBINDABLE(old)) + goto out2; + + if (!check_mnt(parent) || !check_mnt(old)) + goto out2; + + if (!recurse && has_locked_children(old, old_path.dentry)) + goto out2; + if (recurse) - mnt = copy_tree(old_path.mnt, old_path.dentry, 0); + mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); else - mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); + mnt = clone_mnt(old, old_path.dentry, 0); - if (!mnt) - goto out; + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out2; + } - err = graft_tree(mnt, path); + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + + err = graft_tree(mnt, parent, mp); if (err) { - LIST_HEAD(umount_list); - spin_lock(&vfsmount_lock); - umount_tree(mnt, 0, &umount_list); - spin_unlock(&vfsmount_lock); - release_mounts(&umount_list); + lock_mount_hash(); + umount_tree(mnt, 0); + unlock_mount_hash(); } - +out2: + unlock_mount(mp); out: - up_write(&namespace_sem); path_put(&old_path); return err; } @@ -1502,10 +1896,13 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; + if (mnt->mnt_flags & MNT_LOCK_READONLY) + return -EPERM; + if (readonly_request) - error = mnt_make_readonly(mnt); + error = mnt_make_readonly(real_mount(mnt)); else - __mnt_unmake_readonly(mnt); + __mnt_unmake_readonly(real_mount(mnt)); return error; } @@ -1519,37 +1916,39 @@ static int do_remount(struct path *path, int flags, int mnt_flags, { int err; struct super_block *sb = path->mnt->mnt_sb; + struct mount *mnt = real_mount(path->mnt); - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!check_mnt(path->mnt)) + if (!check_mnt(mnt)) return -EINVAL; if (path->dentry != path->mnt->mnt_root) return -EINVAL; + err = security_sb_remount(sb, data); + if (err) + return err; + down_write(&sb->s_umount); if (flags & MS_BIND) err = change_mount_flags(path->mnt, flags); + else if (!capable(CAP_SYS_ADMIN)) + err = -EPERM; else err = do_remount_sb(sb, flags, data, 0); - if (!err) - path->mnt->mnt_flags = mnt_flags; - up_write(&sb->s_umount); if (!err) { - security_sb_post_remount(path->mnt, flags, data); - - spin_lock(&vfsmount_lock); - touch_mnt_namespace(path->mnt->mnt_ns); - spin_unlock(&vfsmount_lock); + lock_mount_hash(); + mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; + mnt->mnt.mnt_flags = mnt_flags; + touch_mnt_namespace(mnt->mnt_ns); + unlock_mount_hash(); } + up_write(&sb->s_umount); return err; } -static inline int tree_contains_unbindable(struct vfsmount *mnt) +static inline int tree_contains_unbindable(struct mount *mnt) { - struct vfsmount *p; + struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { if 
(IS_MNT_UNBINDABLE(p)) return 1; @@ -1557,40 +1956,39 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt) return 0; } -static int do_move_mount(struct path *path, char *old_name) +static int do_move_mount(struct path *path, const char *old_name) { struct path old_path, parent_path; - struct vfsmount *p; - int err = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + struct mount *p; + struct mount *old; + struct mountpoint *mp; + int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; - down_write(&namespace_sem); - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; - err = -EINVAL; - if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) goto out; - err = -ENOENT; - mutex_lock(&path->dentry->d_inode->i_mutex); - if (IS_DEADDIR(path->dentry->d_inode)) + old = real_mount(old_path.mnt); + p = real_mount(path->mnt); + + err = -EINVAL; + if (!check_mnt(p) || !check_mnt(old)) goto out1; - if (d_unlinked(path->dentry)) + if (old->mnt.mnt_flags & MNT_LOCKED) goto out1; err = -EINVAL; if (old_path.dentry != old_path.mnt->mnt_root) goto out1; - if (old_path.mnt == old_path.mnt->mnt_parent) + if (!mnt_has_parent(old)) goto out1; if (S_ISDIR(path->dentry->d_inode->i_mode) != @@ -1599,108 +1997,194 @@ static int do_move_mount(struct path *path, char *old_name) /* * Don't move a mount residing in a shared parent. */ - if (old_path.mnt->mnt_parent && - IS_MNT_SHARED(old_path.mnt->mnt_parent)) + if (IS_MNT_SHARED(old->mnt_parent)) goto out1; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ - if (IS_MNT_SHARED(path->mnt) && - tree_contains_unbindable(old_path.mnt)) + if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) goto out1; err = -ELOOP; - for (p = path->mnt; p->mnt_parent != p; p = p->mnt_parent) - if (p == old_path.mnt) + for (; mnt_has_parent(p); p = p->mnt_parent) + if (p == old) goto out1; - err = attach_recursive_mnt(old_path.mnt, path, &parent_path); + err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); if (err) goto out1; /* if the mount is moved, it should no longer be expire * automatically */ - list_del_init(&old_path.mnt->mnt_expire); + list_del_init(&old->mnt_expire); out1: - mutex_unlock(&path->dentry->d_inode->i_mutex); + unlock_mount(mp); out: - up_write(&namespace_sem); if (!err) path_put(&parent_path); path_put(&old_path); return err; } -/* - * create a new mount for userspace and request it to be added into the - * namespace's tree - */ -static int do_new_mount(struct path *path, char *type, int flags, - int mnt_flags, char *name, void *data) +static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) { - struct vfsmount *mnt; - - if (!type) - return -EINVAL; - - /* we need capabilities... 
*/ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + int err; + const char *subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + err = -EINVAL; + if (!subtype[0]) + goto err; + } else + subtype = ""; - lock_kernel(); - mnt = do_kern_mount(type, flags, name, data); - unlock_kernel(); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); + mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); + err = -ENOMEM; + if (!mnt->mnt_sb->s_subtype) + goto err; + return mnt; - return do_add_mount(mnt, path, mnt_flags, NULL); + err: + mntput(mnt); + return ERR_PTR(err); } /* * add a mount into a namespace's mount tree - * - provide the option of adding the new mount to an expiration list */ -int do_add_mount(struct vfsmount *newmnt, struct path *path, - int mnt_flags, struct list_head *fslist) +static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) { + struct mountpoint *mp; + struct mount *parent; int err; - down_write(&namespace_sem); - /* Something was mounted here while we slept */ - while (d_mountpoint(path->dentry) && - follow_down(path)) - ; + mnt_flags &= ~MNT_INTERNAL_FLAGS; + + mp = lock_mount(path); + if (IS_ERR(mp)) + return PTR_ERR(mp); + + parent = real_mount(path->mnt); err = -EINVAL; - if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) - goto unlock; + if (unlikely(!check_mnt(parent))) { + /* that's acceptable only for automounts done in private ns */ + if (!(mnt_flags & MNT_SHRINKABLE)) + goto unlock; + /* ... and for those we'd better have mountpoint still alive */ + if (!parent->mnt_ns) + goto unlock; + } /* Refuse the same filesystem on the same mount point */ err = -EBUSY; - if (path->mnt->mnt_sb == newmnt->mnt_sb && + if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path->mnt->mnt_root == path->dentry) goto unlock; err = -EINVAL; - if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) + if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) goto unlock; - newmnt->mnt_flags = mnt_flags; - if ((err = graft_tree(newmnt, path))) - goto unlock; + newmnt->mnt.mnt_flags = mnt_flags; + err = graft_tree(newmnt, parent, mp); - if (fslist) /* add to the specified expiration list */ - list_add_tail(&newmnt->mnt_expire, fslist); +unlock: + unlock_mount(mp); + return err; +} - up_write(&namespace_sem); - return 0; +/* + * create a new mount for userspace and request it to be added into the + * namespace's tree + */ +static int do_new_mount(struct path *path, const char *fstype, int flags, + int mnt_flags, const char *name, void *data) +{ + struct file_system_type *type; + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct vfsmount *mnt; + int err; -unlock: - up_write(&namespace_sem); - mntput(newmnt); + if (!fstype) + return -EINVAL; + + type = get_fs_type(fstype); + if (!type) + return -ENODEV; + + if (user_ns != &init_user_ns) { + if (!(type->fs_flags & FS_USERNS_MOUNT)) { + put_filesystem(type); + return -EPERM; + } + /* Only in special cases allow devices from mounts + * created outside the initial user namespace. 
+ */ + if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { + flags |= MS_NODEV; + mnt_flags |= MNT_NODEV; + } + } + + mnt = vfs_kern_mount(type, flags, name, data); + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && + !mnt->mnt_sb->s_subtype) + mnt = fs_set_subtype(mnt, fstype); + + put_filesystem(type); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + err = do_add_mount(real_mount(mnt), path, mnt_flags); + if (err) + mntput(mnt); + return err; +} + +int finish_automount(struct vfsmount *m, struct path *path) +{ + struct mount *mnt = real_mount(m); + int err; + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(mnt) < 2); + + if (m->mnt_sb == path->mnt->mnt_sb && + m->mnt_root == path->dentry) { + err = -ELOOP; + goto fail; + } + + err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (!err) + return 0; +fail: + /* remove m from any expiration list it may be on */ + if (!list_empty(&mnt->mnt_expire)) { + namespace_lock(); + list_del_init(&mnt->mnt_expire); + namespace_unlock(); + } + mntput(m); + mntput(m); return err; } -EXPORT_SYMBOL_GPL(do_add_mount); +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. + */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + namespace_lock(); + + list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); + + namespace_unlock(); +} +EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any @@ -1709,15 +2193,14 @@ EXPORT_SYMBOL_GPL(do_add_mount); */ void mark_mounts_for_expiry(struct list_head *mounts) { - struct vfsmount *mnt, *next; + struct mount *mnt, *next; LIST_HEAD(graveyard); - LIST_HEAD(umounts); if (list_empty(mounts)) return; - down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + namespace_lock(); + lock_mount_hash(); /* extract from the expiration list every vfsmount that matches the * following criteria: @@ -1732,14 +2215,12 @@ void mark_mounts_for_expiry(struct list_head *mounts) list_move(&mnt->mnt_expire, &graveyard); } while (!list_empty(&graveyard)) { - mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire); + mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); - umount_tree(mnt, 1, &umounts); + umount_tree(mnt, 1); } - spin_unlock(&vfsmount_lock); - up_write(&namespace_sem); - - release_mounts(&umounts); + unlock_mount_hash(); + namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); @@ -1750,9 +2231,9 @@ EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ -static int select_submounts(struct vfsmount *parent, struct list_head *graveyard) +static int select_submounts(struct mount *parent, struct list_head *graveyard) { - struct vfsmount *this_parent = parent; + struct mount *this_parent = parent; struct list_head *next; int found = 0; @@ -1761,10 +2242,10 @@ repeat: resume: while (next != &this_parent->mnt_mounts) { struct list_head *tmp = next; - struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child); + struct mount *mnt = list_entry(tmp, struct mount, mnt_child); next = tmp->next; - if (!(mnt->mnt_flags & MNT_SHRINKABLE)) + if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) continue; /* * Descend a level if the d_mounts list is non-empty. 
@@ -1793,19 +2274,21 @@ resume: /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint + * + * mount_lock must be held for write */ -static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) +static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); - struct vfsmount *m; + struct mount *m; /* extract submounts of 'mountpoint' from the expiration list */ while (select_submounts(mnt, &graveyard)) { while (!list_empty(&graveyard)) { - m = list_first_entry(&graveyard, struct vfsmount, + m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); - umount_tree(m, 1, umounts); + umount_tree(m, 1); } } } @@ -1902,8 +2385,8 @@ int copy_mount_string(const void __user *data, char **where) * Therefore, if this magic number is present, it carries no information * and must be discarded. */ -long do_mount(char *dev_name, char *dir_name, char *type_page, - unsigned long flags, void *data_page) +long do_mount(const char *dev_name, const char *dir_name, + const char *type_page, unsigned long flags, void *data_page) { struct path path; int retval = 0; @@ -1928,6 +2411,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, retval = security_sb_mount(dev_name, &path, type_page, flags, data_page); + if (!retval && !may_mount()) + retval = -EPERM; if (retval) goto dput_out; @@ -1951,7 +2436,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; - flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); @@ -1972,72 +2457,109 @@ dput_out: return retval; } -static struct mnt_namespace *alloc_mnt_ns(void) +static void free_mnt_ns(struct mnt_namespace *ns) +{ + proc_free_inum(ns->proc_inum); + put_user_ns(ns->user_ns); + kfree(ns); +} + +/* + * Assign a sequence number so we can detect when we attempt to bind + * mount a reference to an older mount namespace into the current + * mount namespace, preventing reference counting loops. A 64bit + * number incrementing at 10Ghz will take 12,427 years to wrap which + * is effectively never, so we can ignore the possibility. + */ +static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); + +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; + int ret; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); + ret = proc_alloc_inum(&new_ns->proc_inum); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); new_ns->event = 0; + new_ns->user_ns = get_user_ns(user_ns); return new_ns; } -/* - * Allocate a new namespace structure and populate it with contents - * copied from the namespace of the passed in task structure. 
- */ -static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, - struct fs_struct *fs) +struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, + struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; - struct vfsmount *p, *q; + struct mount *p, *q; + struct mount *old; + struct mount *new; + int copy_flags; + + BUG_ON(!ns); - new_ns = alloc_mnt_ns(); + if (likely(!(flags & CLONE_NEWNS))) { + get_mnt_ns(ns); + return ns; + } + + old = ns->root; + + new_ns = alloc_mnt_ns(user_ns); if (IS_ERR(new_ns)) return new_ns; - down_write(&namespace_sem); + namespace_lock(); /* First pass: copy the tree topology */ - new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root, - CL_COPY_ALL | CL_EXPIRE); - if (!new_ns->root) { - up_write(&namespace_sem); - kfree(new_ns); - return ERR_PTR(-ENOMEM); + copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; + if (user_ns != ns->user_ns) + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; + new = copy_tree(old, old->mnt.mnt_root, copy_flags); + if (IS_ERR(new)) { + namespace_unlock(); + free_mnt_ns(new_ns); + return ERR_CAST(new); } - spin_lock(&vfsmount_lock); - list_add_tail(&new_ns->list, &new_ns->root->mnt_list); - spin_unlock(&vfsmount_lock); + new_ns->root = new; + list_add_tail(&new_ns->list, &new->mnt_list); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. */ - p = mnt_ns->root; - q = new_ns->root; + p = old; + q = new; while (p) { q->mnt_ns = new_ns; - if (fs) { - if (p == fs->root.mnt) { - rootmnt = p; - fs->root.mnt = mntget(q); + if (new_fs) { + if (&p->mnt == new_fs->root.mnt) { + new_fs->root.mnt = mntget(&q->mnt); + rootmnt = &p->mnt; } - if (p == fs->pwd.mnt) { - pwdmnt = p; - fs->pwd.mnt = mntget(q); + if (&p->mnt == new_fs->pwd.mnt) { + new_fs->pwd.mnt = mntget(&q->mnt); + pwdmnt = &p->mnt; } } - p = next_mnt(p, mnt_ns->root); - q = next_mnt(q, new_ns->root); + p = next_mnt(p, old); + q = next_mnt(q, new); + if (!q) + break; + while (p->mnt.mnt_root != q->mnt.mnt_root) + p = next_mnt(p, old); } - up_write(&namespace_sem); + namespace_unlock(); if (rootmnt) mntput(rootmnt); @@ -2047,47 +2569,60 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, return new_ns; } -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, - struct fs_struct *new_fs) -{ - struct mnt_namespace *new_ns; - - BUG_ON(!ns); - get_mnt_ns(ns); - - if (!(flags & CLONE_NEWNS)) - return ns; - - new_ns = dup_mnt_ns(ns, new_fs); - - put_mnt_ns(ns); - return new_ns; -} - /** * create_mnt_ns - creates a private namespace and adds a root filesystem * @mnt: pointer to the new root filesystem mountpoint */ -struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) +static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) { - struct mnt_namespace *new_ns; - - new_ns = alloc_mnt_ns(); + struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); if (!IS_ERR(new_ns)) { + struct mount *mnt = real_mount(m); mnt->mnt_ns = new_ns; new_ns->root = mnt; - list_add(&new_ns->list, &new_ns->root->mnt_list); + list_add(&mnt->mnt_list, &new_ns->list); + } else { + mntput(m); } return new_ns; } -EXPORT_SYMBOL(create_mnt_ns); + +struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +{ + struct mnt_namespace *ns; + struct super_block *s; + struct path path; + int err; + + ns = 
create_mnt_ns(mnt); + if (IS_ERR(ns)) + return ERR_CAST(ns); + + err = vfs_path_lookup(mnt->mnt_root, mnt, + name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + + put_mnt_ns(ns); + + if (err) + return ERR_PTR(err); + + /* trade a vfsmount reference for active sb one */ + s = path.mnt->mnt_sb; + atomic_inc(&s->s_active); + mntput(path.mnt); + /* lock the sucker */ + down_write(&s->s_umount); + /* ... and return the root of (sub)tree on it */ + return path.dentry; +} +EXPORT_SYMBOL(mount_subtree); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; - char *kernel_dir; + struct filename *kernel_dir; char *kernel_dev; unsigned long data_page; @@ -2109,7 +2644,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, if (ret < 0) goto out_data; - ret = do_mount(kernel_dev, kernel_dir, kernel_type, flags, + ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags, (void *) data_page); free_page(data_page); @@ -2124,6 +2659,31 @@ out_type: } /* + * Return true if path is reachable from root + * + * namespace_sem or mount_lock is held + */ +bool is_path_reachable(struct mount *mnt, struct dentry *dentry, + const struct path *root) +{ + while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); +} + +int path_is_under(struct path *path1, struct path *path2) +{ + int res; + read_seqlock_excl(&mount_lock); + res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); + read_sequnlock_excl(&mount_lock); + return res; +} +EXPORT_SYMBOL(path_is_under); + +/* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets @@ -2151,103 +2711,94 @@ out_type: SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { - struct vfsmount *tmp; struct path new, old, parent_path, root_parent, root; + struct mount *new_mnt, *root_mnt, *old_mnt; + struct mountpoint *old_mp, *root_mp; int error; - if (!capable(CAP_SYS_ADMIN)) + if (!may_mount()) return -EPERM; error = user_path_dir(new_root, &new); if (error) goto out0; - error = -EINVAL; - if (!check_mnt(new.mnt)) - goto out1; error = user_path_dir(put_old, &old); if (error) goto out1; error = security_sb_pivotroot(&old, &new); - if (error) { - path_put(&old); - goto out1; - } + if (error) + goto out2; + + get_fs_root(current->fs, &root); + old_mp = lock_mount(&old); + error = PTR_ERR(old_mp); + if (IS_ERR(old_mp)) + goto out3; - read_lock(¤t->fs->lock); - root = current->fs->root; - path_get(¤t->fs->root); - read_unlock(¤t->fs->lock); - down_write(&namespace_sem); - mutex_lock(&old.dentry->d_inode->i_mutex); error = -EINVAL; - if (IS_MNT_SHARED(old.mnt) || - IS_MNT_SHARED(new.mnt->mnt_parent) || - IS_MNT_SHARED(root.mnt->mnt_parent)) - goto out2; - if (!check_mnt(root.mnt)) - goto out2; + new_mnt = real_mount(new.mnt); + root_mnt = real_mount(root.mnt); + old_mnt = real_mount(old.mnt); + if (IS_MNT_SHARED(old_mnt) || + IS_MNT_SHARED(new_mnt->mnt_parent) || + IS_MNT_SHARED(root_mnt->mnt_parent)) + goto out4; + if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) + goto out4; + if (new_mnt->mnt.mnt_flags & MNT_LOCKED) + goto out4; error = -ENOENT; - if (IS_DEADDIR(new.dentry->d_inode)) - goto out2; if (d_unlinked(new.dentry)) - goto out2; - if 
(d_unlinked(old.dentry)) - goto out2; + goto out4; error = -EBUSY; - if (new.mnt == root.mnt || - old.mnt == root.mnt) - goto out2; /* loop, on the same file system */ + if (new_mnt == root_mnt || old_mnt == root_mnt) + goto out4; /* loop, on the same file system */ error = -EINVAL; if (root.mnt->mnt_root != root.dentry) - goto out2; /* not a mountpoint */ - if (root.mnt->mnt_parent == root.mnt) - goto out2; /* not attached */ + goto out4; /* not a mountpoint */ + if (!mnt_has_parent(root_mnt)) + goto out4; /* not attached */ + root_mp = root_mnt->mnt_mp; if (new.mnt->mnt_root != new.dentry) - goto out2; /* not a mountpoint */ - if (new.mnt->mnt_parent == new.mnt) - goto out2; /* not attached */ + goto out4; /* not a mountpoint */ + if (!mnt_has_parent(new_mnt)) + goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ - tmp = old.mnt; - spin_lock(&vfsmount_lock); - if (tmp != new.mnt) { - for (;;) { - if (tmp->mnt_parent == tmp) - goto out3; /* already mounted on put_old */ - if (tmp->mnt_parent == new.mnt) - break; - tmp = tmp->mnt_parent; - } - if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) - goto out3; - } else if (!is_subdir(old.dentry, new.dentry)) - goto out3; - detach_mnt(new.mnt, &parent_path); - detach_mnt(root.mnt, &root_parent); + if (!is_path_reachable(old_mnt, old.dentry, &new)) + goto out4; + root_mp->m_count++; /* pin it so it won't go away */ + lock_mount_hash(); + detach_mnt(new_mnt, &parent_path); + detach_mnt(root_mnt, &root_parent); + if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { + new_mnt->mnt.mnt_flags |= MNT_LOCKED; + root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; + } /* mount old root on put_old */ - attach_mnt(root.mnt, &old); + attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ - attach_mnt(new.mnt, &root_parent); + attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); touch_mnt_namespace(current->nsproxy->mnt_ns); - spin_unlock(&vfsmount_lock); + unlock_mount_hash(); chroot_fs_refs(&root, &new); - security_sb_post_pivotroot(&root, &new); + put_mountpoint(root_mp); error = 0; - path_put(&root_parent); - path_put(&parent_path); -out2: - mutex_unlock(&old.dentry->d_inode->i_mutex); - up_write(&namespace_sem); +out4: + unlock_mount(old_mp); + if (!error) { + path_put(&root_parent); + path_put(&parent_path); + } +out3: path_put(&root); +out2: path_put(&old); out1: path_put(&new); out0: return error; -out3: - spin_unlock(&vfsmount_lock); - goto out2; } static void __init init_mount_tree(void) @@ -2255,10 +2806,16 @@ static void __init init_mount_tree(void) struct vfsmount *mnt; struct mnt_namespace *ns; struct path root; + struct file_system_type *type; - mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); + type = get_fs_type("rootfs"); + if (!type) + panic("Can't find rootfs type"); + mnt = vfs_kern_mount(type, 0, "rootfs", NULL); + put_filesystem(type); if (IS_ERR(mnt)) panic("Can't create rootfs"); + ns = create_mnt_ns(mnt); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); @@ -2266,8 +2823,8 @@ static void __init init_mount_tree(void) init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); - root.mnt = ns->root; - root.dentry = ns->root->mnt_root; + root.mnt = mnt; + root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); @@ -2278,20 +2835,29 @@ void __init mnt_init(void) unsigned u; int err; - init_rwsem(&namespace_sem); - - mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN | 
SLAB_PANIC, NULL); - mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); - - if (!mount_hashtable) + mount_hashtable = alloc_large_system_hash("Mount-cache", + sizeof(struct hlist_head), + mhash_entries, 19, + 0, + &m_hash_shift, &m_hash_mask, 0, 0); + mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", + sizeof(struct hlist_head), + mphash_entries, 19, + 0, + &mp_hash_shift, &mp_hash_mask, 0, 0); + + if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - printk("Mount-cache hash table entries: %lu\n", HASH_SIZE); + for (u = 0; u <= m_hash_mask; u++) + INIT_HLIST_HEAD(&mount_hashtable[u]); + for (u = 0; u <= mp_hash_mask; u++) + INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - for (u = 0; u < HASH_SIZE; u++) - INIT_LIST_HEAD(&mount_hashtable[u]); + kernfs_init(); err = sysfs_init(); if (err) @@ -2306,20 +2872,166 @@ void __init mnt_init(void) void put_mnt_ns(struct mnt_namespace *ns) { - struct vfsmount *root; - LIST_HEAD(umount_list); - - if (!atomic_dec_and_lock(&ns->count, &vfsmount_lock)) + if (!atomic_dec_and_test(&ns->count)) return; - root = ns->root; - ns->root = NULL; - spin_unlock(&vfsmount_lock); - down_write(&namespace_sem); - spin_lock(&vfsmount_lock); - umount_tree(root, 0, &umount_list); - spin_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); - kfree(ns); + drop_collected_mounts(&ns->root->mnt); + free_mnt_ns(ns); +} + +struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +{ + struct vfsmount *mnt; + mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); + if (!IS_ERR(mnt)) { + /* + * it is a longterm mount, don't release mnt until + * we unmount before file sys is unregistered + */ + real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; + } + return mnt; +} +EXPORT_SYMBOL_GPL(kern_mount_data); + +void kern_unmount(struct vfsmount *mnt) +{ + /* release long term mount so mount point can be released */ + if (!IS_ERR_OR_NULL(mnt)) { + real_mount(mnt)->mnt_ns = NULL; + synchronize_rcu(); /* yecchhh... */ + mntput(mnt); + } +} +EXPORT_SYMBOL(kern_unmount); + +bool our_mnt(struct vfsmount *mnt) +{ + return check_mnt(real_mount(mnt)); +} + +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; } -EXPORT_SYMBOL(put_mnt_ns); + +bool fs_fully_visible(struct file_system_type *type) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + bool visible = false; + + if (unlikely(!ns)) + return false; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + struct mount *child; + if (mnt->mnt.mnt_sb->s_type != type) + continue; + + /* This mount is not fully visible if there are any child mounts + * that cover anything except for empty directories. 
+ */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + struct inode *inode = child->mnt_mountpoint->d_inode; + if (!S_ISDIR(inode->i_mode)) + goto next; + if (inode->i_nlink > 2) + goto next; + } + visible = true; + goto found; + next: ; + } +found: + up_read(&namespace_sem); + return visible; +} + +static void *mntns_get(struct task_struct *task) +{ + struct mnt_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->mnt_ns; + get_mnt_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void mntns_put(void *ns) +{ + put_mnt_ns(ns); +} + +static int mntns_install(struct nsproxy *nsproxy, void *ns) +{ + struct fs_struct *fs = current->fs; + struct mnt_namespace *mnt_ns = ns; + struct path root; + + if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EPERM; + + if (fs->users != 1) + return -EINVAL; + + get_mnt_ns(mnt_ns); + put_mnt_ns(nsproxy->mnt_ns); + nsproxy->mnt_ns = mnt_ns; + + /* Find the root */ + root.mnt = &mnt_ns->root->mnt; + root.dentry = mnt_ns->root->mnt.mnt_root; + path_get(&root); + while(d_mountpoint(root.dentry) && follow_down_one(&root)) + ; + + /* Update the pwd and root */ + set_fs_pwd(fs, &root); + set_fs_root(fs, &root); + + path_put(&root); + return 0; +} + +static unsigned int mntns_inum(void *ns) +{ + struct mnt_namespace *mnt_ns = ns; + return mnt_ns->proc_inum; +} + +const struct proc_ns_operations mntns_operations = { + .name = "mnt", + .type = CLONE_NEWNS, + .get = mntns_get, + .put = mntns_put, + .install = mntns_install, + .inum = mntns_inum, +}; |
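Editor's note: the mntns_operations table added at the end of this hunk wires mount namespaces into the generic /proc/<pid>/ns machinery, so mntns_install() above is what runs when a process calls setns(2) on a mount-namespace file descriptor. The following is a minimal userspace sketch of that path, not part of the patch; the target PID (1) and the lack of real error recovery are placeholder choices.

/* Hypothetical userspace sketch: entering another task's mount namespace
 * through the ns file that mntns_operations backs.  PID 1 is only an example. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/1/ns/mnt", O_RDONLY);	/* ns file served via mntns_get()/mntns_inum() */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* triggers mntns_install(); the caller needs the capabilities checked there
	 * and must not share its fs_struct (fs->users == 1) */
	if (setns(fd, CLONE_NEWNS) != 0) {
		perror("setns");
		close(fd);
		return 1;
	}
	close(fd);
	/* as in mntns_install(), the caller's root and cwd now point into the target namespace */
	return 0;
}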

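Editor's note: the reworked do_loopback() and do_change_type() paths earlier in this diff are both reached from the mount(2) system call: MS_BIND (optionally with MS_REC) selects the bind-mount path, while passing exactly one propagation flag, as enforced by flags_to_propagation_type(), selects the propagation change. Below is a minimal usage sketch, assuming a glibc <sys/mount.h> that exposes the MS_* propagation flags; the source and target paths are placeholders.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* recursive bind mount -> do_loopback(..., recurse = 1) */
	if (mount("/srv/data", "/mnt/data", NULL, MS_BIND | MS_REC, NULL) != 0)
		perror("bind mount");

	/* exactly one propagation flag remains after masking MS_REC|MS_SILENT,
	 * so flags_to_propagation_type() accepts it -> do_change_type() */
	if (mount(NULL, "/mnt/data", NULL, MS_PRIVATE | MS_REC, NULL) != 0)
		perror("make private");

	return 0;
}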