Diffstat (limited to 'fs/namespace.c')
| -rw-r--r-- | fs/namespace.c | 2883 |
1 file changed, 2033 insertions, 850 deletions
diff --git a/fs/namespace.c b/fs/namespace.c index 94f026ec990..182bc41cd88 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -9,58 +9,215 @@ */ #include <linux/syscalls.h> -#include <linux/slab.h> -#include <linux/sched.h> -#include <linux/smp_lock.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/quotaops.h> -#include <linux/acct.h> +#include <linux/export.h> #include <linux/capability.h> -#include <linux/module.h> -#include <linux/sysfs.h> -#include <linux/seq_file.h> #include <linux/mnt_namespace.h> +#include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/security.h> -#include <linux/mount.h> -#include <linux/ramfs.h> -#include <linux/log2.h> -#include <asm/uaccess.h> -#include <asm/unistd.h> +#include <linux/idr.h> +#include <linux/acct.h> /* acct_auto_close_mnt */ +#include <linux/init.h> /* init_rootfs */ +#include <linux/fs_struct.h> /* get_fs_root et.al. */ +#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ +#include <linux/uaccess.h> +#include <linux/proc_ns.h> +#include <linux/magic.h> +#include <linux/bootmem.h> #include "pnode.h" #include "internal.h" -#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) -#define HASH_SIZE (1UL << HASH_SHIFT) +static unsigned int m_hash_mask __read_mostly; +static unsigned int m_hash_shift __read_mostly; +static unsigned int mp_hash_mask __read_mostly; +static unsigned int mp_hash_shift __read_mostly; -/* spinlock for vfsmount related operations, inplace of dcache_lock */ -__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); +static __initdata unsigned long mhash_entries; +static int __init set_mhash_entries(char *str) +{ + if (!str) + return 0; + mhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mhash_entries=", set_mhash_entries); -static int event; +static __initdata unsigned long mphash_entries; +static int __init set_mphash_entries(char *str) +{ + if (!str) + return 0; + mphash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mphash_entries=", set_mphash_entries); + +static u64 event; +static DEFINE_IDA(mnt_id_ida); +static DEFINE_IDA(mnt_group_ida); +static DEFINE_SPINLOCK(mnt_id_lock); +static int mnt_id_start = 0; +static int mnt_group_start = 1; -static struct list_head *mount_hashtable __read_mostly; +static struct hlist_head *mount_hashtable __read_mostly; +static struct hlist_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; -static struct rw_semaphore namespace_sem; +static DECLARE_RWSEM(namespace_sem); /* /sys/fs */ struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); -static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) +/* + * vfsmount lock may be taken for read to prevent changes to the + * vfsmount hash, ie. during mountpoint lookups or walking back + * up the tree. + * + * It should be taken for write in all cases where the vfsmount + * tree or hash is modified or when a vfsmount structure is modified. 
+ */ +__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); + +static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); - tmp = tmp + (tmp >> HASH_SHIFT); - return tmp & (HASH_SIZE - 1); + tmp = tmp + (tmp >> m_hash_shift); + return &mount_hashtable[tmp & m_hash_mask]; +} + +static inline struct hlist_head *mp_hash(struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> mp_hash_shift); + return &mountpoint_hashtable[tmp & mp_hash_mask]; +} + +/* + * allocation is serialized by namespace_sem, but we need the spinlock to + * serialize with freeing. + */ +static int mnt_alloc_id(struct mount *mnt) +{ + int res; + +retry: + ida_pre_get(&mnt_id_ida, GFP_KERNEL); + spin_lock(&mnt_id_lock); + res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); + if (!res) + mnt_id_start = mnt->mnt_id + 1; + spin_unlock(&mnt_id_lock); + if (res == -EAGAIN) + goto retry; + + return res; +} + +static void mnt_free_id(struct mount *mnt) +{ + int id = mnt->mnt_id; + spin_lock(&mnt_id_lock); + ida_remove(&mnt_id_ida, id); + if (mnt_id_start > id) + mnt_id_start = id; + spin_unlock(&mnt_id_lock); +} + +/* + * Allocate a new peer group ID + * + * mnt_group_ida is protected by namespace_sem + */ +static int mnt_alloc_group_id(struct mount *mnt) +{ + int res; + + if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) + return -ENOMEM; + + res = ida_get_new_above(&mnt_group_ida, + mnt_group_start, + &mnt->mnt_group_id); + if (!res) + mnt_group_start = mnt->mnt_group_id + 1; + + return res; +} + +/* + * Release a peer group ID + */ +void mnt_release_group_id(struct mount *mnt) +{ + int id = mnt->mnt_group_id; + ida_remove(&mnt_group_ida, id); + if (mnt_group_start > id) + mnt_group_start = id; + mnt->mnt_group_id = 0; +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_add_count(struct mount *mnt, int n) +{ +#ifdef CONFIG_SMP + this_cpu_add(mnt->mnt_pcp->mnt_count, n); +#else + preempt_disable(); + mnt->mnt_count += n; + preempt_enable(); +#endif +} + +/* + * vfsmount lock must be held for write + */ +unsigned int mnt_get_count(struct mount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; + } + + return count; +#else + return mnt->mnt_count; +#endif } -struct vfsmount *alloc_vfsmnt(const char *name) +static struct mount *alloc_vfsmnt(const char *name) { - struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); + struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { - atomic_set(&mnt->mnt_count, 1); - INIT_LIST_HEAD(&mnt->mnt_hash); + int err; + + err = mnt_alloc_id(mnt); + if (err) + goto out_free_cache; + + if (name) { + mnt->mnt_devname = kstrdup(name, GFP_KERNEL); + if (!mnt->mnt_devname) + goto out_free_id; + } + +#ifdef CONFIG_SMP + mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); + if (!mnt->mnt_pcp) + goto out_free_devname; + + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); +#else + mnt->mnt_count = 1; + mnt->mnt_writers = 0; +#endif + + INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); @@ -68,77 +225,500 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); - if (name) { - int size = strlen(name) 
+ 1; - char *newname = kmalloc(size, GFP_KERNEL); - if (newname) { - memcpy(newname, name, size); - mnt->mnt_devname = newname; - } - } +#ifdef CONFIG_FSNOTIFY + INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); +#endif } return mnt; + +#ifdef CONFIG_SMP +out_free_devname: + kfree(mnt->mnt_devname); +#endif +out_free_id: + mnt_free_id(mnt); +out_free_cache: + kmem_cache_free(mnt_cache, mnt); + return NULL; +} + +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/* + * __mnt_is_readonly: check whether a mount is read-only + * @mnt: the mount to check for its write status + * + * This shouldn't be used directly ouside of the VFS. + * It does not guarantee that the filesystem will stay + * r/w, just that it is right *now*. This can not and + * should not be used in place of IS_RDONLY(inode). + * mnt_want/drop_write() will _keep_ the filesystem + * r/w. + */ +int __mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_flags & MNT_READONLY) + return 1; + if (mnt->mnt_sb->s_flags & MS_RDONLY) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(__mnt_is_readonly); + +static inline void mnt_inc_writers(struct mount *mnt) +{ +#ifdef CONFIG_SMP + this_cpu_inc(mnt->mnt_pcp->mnt_writers); +#else + mnt->mnt_writers++; +#endif +} + +static inline void mnt_dec_writers(struct mount *mnt) +{ +#ifdef CONFIG_SMP + this_cpu_dec(mnt->mnt_pcp->mnt_writers); +#else + mnt->mnt_writers--; +#endif +} + +static unsigned int mnt_get_writers(struct mount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; + } + + return count; +#else + return mnt->mnt_writers; +#endif } -int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) +static int mnt_is_readonly(struct vfsmount *mnt) { - mnt->mnt_sb = sb; - mnt->mnt_root = dget(sb->s_root); + if (mnt->mnt_sb->s_readonly_remount) + return 1; + /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ + smp_rmb(); + return __mnt_is_readonly(mnt); +} + +/* + * Most r/o & frozen checks on a fs are for operations that take discrete + * amounts of time, like a write() or unlink(). We must keep track of when + * those operations start (for permission checks) and when they end, so that we + * can determine when writes are able to occur to a filesystem. + */ +/** + * __mnt_want_write - get write access to a mount without freeze protection + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mnt it read-write) before + * returning success. This operation does not protect against filesystem being + * frozen. When the write operation is finished, __mnt_drop_write() must be + * called. This is effectively a refcount. + */ +int __mnt_want_write(struct vfsmount *m) +{ + struct mount *mnt = real_mount(m); + int ret = 0; + + preempt_disable(); + mnt_inc_writers(mnt); + /* + * The store to mnt_inc_writers must be visible before we pass + * MNT_WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set MNT_WRITE_HOLD. 
+ */ + smp_mb(); + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) + cpu_relax(); + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until + * MNT_WRITE_HOLD is cleared. + */ + smp_rmb(); + if (mnt_is_readonly(m)) { + mnt_dec_writers(mnt); + ret = -EROFS; + } + preempt_enable(); + + return ret; +} + +/** + * mnt_want_write - get write access to a mount + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mount is read-write, filesystem + * is not frozen) before returning success. When the write operation is + * finished, mnt_drop_write() must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *m) +{ + int ret; + + sb_start_write(m->mnt_sb); + ret = __mnt_want_write(m); + if (ret) + sb_end_write(m->mnt_sb); + return ret; +} +EXPORT_SYMBOL_GPL(mnt_want_write); + +/** + * mnt_clone_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This is effectively like mnt_want_write, except + * it must only be used to take an extra write reference + * on a mountpoint that we already know has a write reference + * on it. This allows some optimisation. + * + * After finished, mnt_drop_write must be called as usual to + * drop the reference. + */ +int mnt_clone_write(struct vfsmount *mnt) +{ + /* superblock may be r/o */ + if (__mnt_is_readonly(mnt)) + return -EROFS; + preempt_disable(); + mnt_inc_writers(real_mount(mnt)); + preempt_enable(); return 0; } +EXPORT_SYMBOL_GPL(mnt_clone_write); + +/** + * __mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like __mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int __mnt_want_write_file(struct file *file) +{ + if (!(file->f_mode & FMODE_WRITER)) + return __mnt_want_write(file->f_path.mnt); + else + return mnt_clone_write(file->f_path.mnt); +} + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ + int ret; + + sb_start_write(file->f_path.mnt->mnt_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file->f_path.mnt->mnt_sb); + return ret; +} +EXPORT_SYMBOL_GPL(mnt_want_write_file); + +/** + * __mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done + * performing writes to it. Must be matched with + * __mnt_want_write() call above. + */ +void __mnt_drop_write(struct vfsmount *mnt) +{ + preempt_disable(); + mnt_dec_writers(real_mount(mnt)); + preempt_enable(); +} -EXPORT_SYMBOL(simple_set_mnt); +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done performing writes to it and + * also allows filesystem to be frozen again. Must be matched with + * mnt_want_write() call above. 
+ */ +void mnt_drop_write(struct vfsmount *mnt) +{ + __mnt_drop_write(mnt); + sb_end_write(mnt->mnt_sb); +} +EXPORT_SYMBOL_GPL(mnt_drop_write); + +void __mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write(file->f_path.mnt); +} + +void mnt_drop_write_file(struct file *file) +{ + mnt_drop_write(file->f_path.mnt); +} +EXPORT_SYMBOL(mnt_drop_write_file); -void free_vfsmnt(struct vfsmount *mnt) +static int mnt_make_readonly(struct mount *mnt) +{ + int ret = 0; + + lock_mount_hash(); + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + /* + * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * should be visible before we do. + */ + smp_mb(); + + /* + * With writers on hold, if this value is zero, then there are + * definitely no active writers (although held writers may subsequently + * increment the count, they'll have to wait, and decrement it after + * seeing MNT_READONLY). + * + * It is OK to have counter incremented on one CPU and decremented on + * another: the sum will add up correctly. The danger would be when we + * sum up each counter, if we read a counter before it is incremented, + * but then read another CPU's count which it has been subsequently + * decremented from -- we would see more decrements than we should. + * MNT_WRITE_HOLD protects against this scenario, because + * mnt_want_write first increments count, then smp_mb, then spins on + * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * we're counting up here. + */ + if (mnt_get_writers(mnt) > 0) + ret = -EBUSY; + else + mnt->mnt.mnt_flags |= MNT_READONLY; + /* + * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * that become unheld will see MNT_READONLY. + */ + smp_wmb(); + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + unlock_mount_hash(); + return ret; +} + +static void __mnt_unmake_readonly(struct mount *mnt) +{ + lock_mount_hash(); + mnt->mnt.mnt_flags &= ~MNT_READONLY; + unlock_mount_hash(); +} + +int sb_prepare_remount_readonly(struct super_block *sb) +{ + struct mount *mnt; + int err = 0; + + /* Racy optimization. 
Recheck the counter under MNT_WRITE_HOLD */ + if (atomic_long_read(&sb->s_remove_count)) + return -EBUSY; + + lock_mount_hash(); + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + smp_mb(); + if (mnt_get_writers(mnt) > 0) { + err = -EBUSY; + break; + } + } + } + if (!err && atomic_long_read(&sb->s_remove_count)) + err = -EBUSY; + + if (!err) { + sb->s_readonly_remount = 1; + smp_wmb(); + } + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + } + unlock_mount_hash(); + + return err; +} + +static void free_vfsmnt(struct mount *mnt) { kfree(mnt->mnt_devname); +#ifdef CONFIG_SMP + free_percpu(mnt->mnt_pcp); +#endif kmem_cache_free(mnt_cache, mnt); } +static void delayed_free_vfsmnt(struct rcu_head *head) +{ + free_vfsmnt(container_of(head, struct mount, mnt_rcu)); +} + +/* call under rcu_read_lock */ +bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +{ + struct mount *mnt; + if (read_seqretry(&mount_lock, seq)) + return false; + if (bastard == NULL) + return true; + mnt = real_mount(bastard); + mnt_add_count(mnt, 1); + if (likely(!read_seqretry(&mount_lock, seq))) + return true; + if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { + mnt_add_count(mnt, -1); + return false; + } + rcu_read_unlock(); + mntput(bastard); + rcu_read_lock(); + return false; +} + /* - * find the first or last mount at @dentry on vfsmount @mnt depending on - * @dir. If @dir is set return the first mount else return the last mount. + * find the first mount at @dentry on vfsmount @mnt. + * call under rcu_read_lock() */ -struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, - int dir) +struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { - struct list_head *head = mount_hashtable + hash(mnt, dentry); - struct list_head *tmp = head; - struct vfsmount *p, *found = NULL; + struct hlist_head *head = m_hash(mnt, dentry); + struct mount *p; - for (;;) { - tmp = dir ? tmp->next : tmp->prev; - p = NULL; - if (tmp == head) - break; - p = list_entry(tmp, struct vfsmount, mnt_hash); - if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) { - found = p; + hlist_for_each_entry_rcu(p, head, mnt_hash) + if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) + return p; + return NULL; +} + +/* + * find the last mount at @dentry on vfsmount @mnt. + * mount_lock must be held. + */ +struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) +{ + struct mount *p, *res; + res = p = __lookup_mnt(mnt, dentry); + if (!p) + goto out; + hlist_for_each_entry_continue(p, mnt_hash) { + if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) break; - } + res = p; } - return found; +out: + return res; } /* - * lookup_mnt increments the ref count before returning - * the vfsmount struct. + * lookup_mnt - Return the first child mount mounted at path + * + * "First" means first mounted chronologically. If you create the + * following mounts: + * + * mount /dev/sda1 /mnt + * mount /dev/sda2 /mnt + * mount /dev/sda3 /mnt + * + * Then lookup_mnt() on the base /mnt dentry in the root mount will + * return successively the root dentry and vfsmount of /dev/sda1, then + * /dev/sda2, then /dev/sda3, then NULL. + * + * lookup_mnt takes a reference to the found vfsmount. 
*/ -struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) +struct vfsmount *lookup_mnt(struct path *path) { - struct vfsmount *child_mnt; - spin_lock(&vfsmount_lock); - if ((child_mnt = __lookup_mnt(mnt, dentry, 1))) - mntget(child_mnt); - spin_unlock(&vfsmount_lock); - return child_mnt; + struct mount *child_mnt; + struct vfsmount *m; + unsigned seq; + + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + child_mnt = __lookup_mnt(path->mnt, path->dentry); + m = child_mnt ? &child_mnt->mnt : NULL; + } while (!legitimize_mnt(m, seq)); + rcu_read_unlock(); + return m; +} + +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct hlist_head *chain = mp_hash(dentry); + struct mountpoint *mp; + int ret; + + hlist_for_each_entry(mp, chain, m_hash) { + if (mp->m_dentry == dentry) { + /* might be worth a WARN_ON() */ + if (d_unlinked(dentry)) + return ERR_PTR(-ENOENT); + mp->m_count++; + return mp; + } + } + + mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!mp) + return ERR_PTR(-ENOMEM); + + ret = d_set_mounted(dentry); + if (ret) { + kfree(mp); + return ERR_PTR(ret); + } + + mp->m_dentry = dentry; + mp->m_count = 1; + hlist_add_head(&mp->m_hash, chain); + return mp; +} + +static void put_mountpoint(struct mountpoint *mp) +{ + if (!--mp->m_count) { + struct dentry *dentry = mp->m_dentry; + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); + hlist_del(&mp->m_hash); + kfree(mp); + } } -static inline int check_mnt(struct vfsmount *mnt) +static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } +/* + * vfsmount lock must be held for write + */ static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { @@ -147,6 +727,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns) } } +/* + * vfsmount lock must be held for write + */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { @@ -155,40 +738,54 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) } } -static void detach_mnt(struct vfsmount *mnt, struct path *old_path) +/* + * vfsmount lock must be held for write + */ +static void detach_mnt(struct mount *mnt, struct path *old_path) { old_path->dentry = mnt->mnt_mountpoint; - old_path->mnt = mnt->mnt_parent; + old_path->mnt = &mnt->mnt_parent->mnt; mnt->mnt_parent = mnt; - mnt->mnt_mountpoint = mnt->mnt_root; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); - list_del_init(&mnt->mnt_hash); - old_path->dentry->d_mounted--; + hlist_del_init_rcu(&mnt->mnt_hash); + put_mountpoint(mnt->mnt_mp); + mnt->mnt_mp = NULL; } -void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, - struct vfsmount *child_mnt) +/* + * vfsmount lock must be held for write + */ +void mnt_set_mountpoint(struct mount *mnt, + struct mountpoint *mp, + struct mount *child_mnt) { - child_mnt->mnt_parent = mntget(mnt); - child_mnt->mnt_mountpoint = dget(dentry); - dentry->d_mounted++; + mp->m_count++; + mnt_add_count(mnt, 1); /* essentially, that's mntget */ + child_mnt->mnt_mountpoint = dget(mp->m_dentry); + child_mnt->mnt_parent = mnt; + child_mnt->mnt_mp = mp; } -static void attach_mnt(struct vfsmount *mnt, struct path *path) +/* + * vfsmount lock must be held for write + */ +static void attach_mnt(struct mount *mnt, + struct mount *parent, + struct mountpoint *mp) { - mnt_set_mountpoint(path->mnt, path->dentry, mnt); - list_add_tail(&mnt->mnt_hash, mount_hashtable + - 
hash(path->mnt, path->dentry)); - list_add_tail(&mnt->mnt_child, &path->mnt->mnt_mounts); + mnt_set_mountpoint(parent, mp, mnt); + hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /* - * the caller must hold vfsmount_lock + * vfsmount lock must be held for write */ -static void commit_tree(struct vfsmount *mnt) +static void commit_tree(struct mount *mnt, struct mount *shadows) { - struct vfsmount *parent = mnt->mnt_parent; - struct vfsmount *m; + struct mount *parent = mnt->mnt_parent; + struct mount *m; LIST_HEAD(head); struct mnt_namespace *n = parent->mnt_ns; @@ -197,15 +794,19 @@ static void commit_tree(struct vfsmount *mnt) list_add_tail(&head, &mnt->mnt_list); list_for_each_entry(m, &head, mnt_list) m->mnt_ns = n; + list_splice(&head, n->list.prev); - list_add_tail(&mnt->mnt_hash, mount_hashtable + - hash(parent, mnt->mnt_mountpoint)); + if (shadows) + hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + else + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); touch_mnt_namespace(n); } -static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) +static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { @@ -218,104 +819,213 @@ static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root) p = p->mnt_parent; } } - return list_entry(next, struct vfsmount, mnt_child); + return list_entry(next, struct mount, mnt_child); } -static struct vfsmount *skip_mnt_tree(struct vfsmount *p) +static struct mount *skip_mnt_tree(struct mount *p) { struct list_head *prev = p->mnt_mounts.prev; while (prev != &p->mnt_mounts) { - p = list_entry(prev, struct vfsmount, mnt_child); + p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev; } return p; } -static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, - int flag) +struct vfsmount * +vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { - struct super_block *sb = old->mnt_sb; - struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); + struct mount *mnt; + struct dentry *root; - if (mnt) { - mnt->mnt_flags = old->mnt_flags; - atomic_inc(&sb->s_active); - mnt->mnt_sb = sb; - mnt->mnt_root = dget(root); - mnt->mnt_mountpoint = mnt->mnt_root; - mnt->mnt_parent = mnt; - - if (flag & CL_SLAVE) { - list_add(&mnt->mnt_slave, &old->mnt_slave_list); - mnt->mnt_master = old; - CLEAR_MNT_SHARED(mnt); - } else if (!(flag & CL_PRIVATE)) { - if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old)) - list_add(&mnt->mnt_share, &old->mnt_share); - if (IS_MNT_SLAVE(old)) - list_add(&mnt->mnt_slave, &old->mnt_slave); - mnt->mnt_master = old->mnt_master; - } - if (flag & CL_MAKE_SHARED) - set_mnt_shared(mnt); - - /* stick the duplicate mount on the same expiry list - * as the original if that was on one */ - if (flag & CL_EXPIRE) { - if (!list_empty(&old->mnt_expire)) - list_add(&mnt->mnt_expire, &old->mnt_expire); - } + if (!type) + return ERR_PTR(-ENODEV); + + mnt = alloc_vfsmnt(name); + if (!mnt) + return ERR_PTR(-ENOMEM); + + if (flags & MS_KERNMOUNT) + mnt->mnt.mnt_flags = MNT_INTERNAL; + + root = mount_fs(type, flags, name, data); + if (IS_ERR(root)) { + mnt_free_id(mnt); + free_vfsmnt(mnt); + return ERR_CAST(root); } - return mnt; + + mnt->mnt.mnt_root = root; + mnt->mnt.mnt_sb = root->d_sb; + mnt->mnt_mountpoint = 
mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; + lock_mount_hash(); + list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); + unlock_mount_hash(); + return &mnt->mnt; } +EXPORT_SYMBOL_GPL(vfs_kern_mount); -static inline void __mntput(struct vfsmount *mnt) +static struct mount *clone_mnt(struct mount *old, struct dentry *root, + int flag) { - struct super_block *sb = mnt->mnt_sb; - dput(mnt->mnt_root); + struct super_block *sb = old->mnt.mnt_sb; + struct mount *mnt; + int err; + + mnt = alloc_vfsmnt(old->mnt_devname); + if (!mnt) + return ERR_PTR(-ENOMEM); + + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) + mnt->mnt_group_id = 0; /* not a peer of original */ + else + mnt->mnt_group_id = old->mnt_group_id; + + if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { + err = mnt_alloc_group_id(mnt); + if (err) + goto out_free; + } + + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); + /* Don't allow unprivileged users to change mount flags */ + if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + + /* Don't allow unprivileged users to reveal what is under a mount */ + if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) + mnt->mnt.mnt_flags |= MNT_LOCKED; + + atomic_inc(&sb->s_active); + mnt->mnt.mnt_sb = sb; + mnt->mnt.mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; + lock_mount_hash(); + list_add_tail(&mnt->mnt_instance, &sb->s_mounts); + unlock_mount_hash(); + + if ((flag & CL_SLAVE) || + ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { + list_add(&mnt->mnt_slave, &old->mnt_slave_list); + mnt->mnt_master = old; + CLEAR_MNT_SHARED(mnt); + } else if (!(flag & CL_PRIVATE)) { + if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) + list_add(&mnt->mnt_share, &old->mnt_share); + if (IS_MNT_SLAVE(old)) + list_add(&mnt->mnt_slave, &old->mnt_slave); + mnt->mnt_master = old->mnt_master; + } + if (flag & CL_MAKE_SHARED) + set_mnt_shared(mnt); + + /* stick the duplicate mount on the same expiry list + * as the original if that was on one */ + if (flag & CL_EXPIRE) { + if (!list_empty(&old->mnt_expire)) + list_add(&mnt->mnt_expire, &old->mnt_expire); + } + + return mnt; + + out_free: + mnt_free_id(mnt); free_vfsmnt(mnt); - deactivate_super(sb); + return ERR_PTR(err); } -void mntput_no_expire(struct vfsmount *mnt) +static void mntput_no_expire(struct mount *mnt) { -repeat: - if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { - if (likely(!mnt->mnt_pinned)) { - spin_unlock(&vfsmount_lock); - __mntput(mnt); - return; - } - atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); +put_again: + rcu_read_lock(); + mnt_add_count(mnt, -1); + if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ + rcu_read_unlock(); + return; + } + lock_mount_hash(); + if (mnt_get_count(mnt)) { + rcu_read_unlock(); + unlock_mount_hash(); + return; + } + if (unlikely(mnt->mnt_pinned)) { + mnt_add_count(mnt, mnt->mnt_pinned + 1); mnt->mnt_pinned = 0; - spin_unlock(&vfsmount_lock); - acct_auto_close_mnt(mnt); - security_sb_umount_close(mnt); - goto repeat; + rcu_read_unlock(); + unlock_mount_hash(); + acct_auto_close_mnt(&mnt->mnt); + goto put_again; + } + if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { + rcu_read_unlock(); + unlock_mount_hash(); + return; + } + mnt->mnt.mnt_flags |= MNT_DOOMED; + rcu_read_unlock(); + + list_del(&mnt->mnt_instance); + unlock_mount_hash(); + + /* + * This probably indicates that somebody messed + * up a mnt_want/drop_write() pair. 
If this + * happens, the filesystem was probably unable + * to make r/w->r/o transitions. + */ + /* + * The locking used to deal with mnt_count decrement provides barriers, + * so mnt_get_writers() below is safe. + */ + WARN_ON(mnt_get_writers(mnt)); + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); +} + +void mntput(struct vfsmount *mnt) +{ + if (mnt) { + struct mount *m = real_mount(mnt); + /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ + if (unlikely(m->mnt_expiry_mark)) + m->mnt_expiry_mark = 0; + mntput_no_expire(m); } } +EXPORT_SYMBOL(mntput); -EXPORT_SYMBOL(mntput_no_expire); +struct vfsmount *mntget(struct vfsmount *mnt) +{ + if (mnt) + mnt_add_count(real_mount(mnt), 1); + return mnt; +} +EXPORT_SYMBOL(mntget); void mnt_pin(struct vfsmount *mnt) { - spin_lock(&vfsmount_lock); - mnt->mnt_pinned++; - spin_unlock(&vfsmount_lock); + lock_mount_hash(); + real_mount(mnt)->mnt_pinned++; + unlock_mount_hash(); } - EXPORT_SYMBOL(mnt_pin); -void mnt_unpin(struct vfsmount *mnt) +void mnt_unpin(struct vfsmount *m) { - spin_lock(&vfsmount_lock); + struct mount *mnt = real_mount(m); + lock_mount_hash(); if (mnt->mnt_pinned) { - atomic_inc(&mnt->mnt_count); + mnt_add_count(mnt, 1); mnt->mnt_pinned--; } - spin_unlock(&vfsmount_lock); + unlock_mount_hash(); } - EXPORT_SYMBOL(mnt_unpin); static inline void mangle(struct seq_file *m, const char *s) @@ -329,14 +1039,18 @@ static inline void mangle(struct seq_file *m, const char *s) * * See also save_mount_options(). */ -int generic_show_options(struct seq_file *m, struct vfsmount *mnt) +int generic_show_options(struct seq_file *m, struct dentry *root) { - const char *options = mnt->mnt_sb->s_options; + const char *options; + + rcu_read_lock(); + options = rcu_dereference(root->d_sb->s_options); if (options != NULL && options[0]) { seq_putc(m, ','); mangle(m, options); } + rcu_read_unlock(); return 0; } @@ -357,25 +1071,52 @@ EXPORT_SYMBOL(generic_show_options); */ void save_mount_options(struct super_block *sb, char *options) { - kfree(sb->s_options); - sb->s_options = kstrdup(options, GFP_KERNEL); + BUG_ON(sb->s_options); + rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL)); } EXPORT_SYMBOL(save_mount_options); -/* iterator */ +void replace_mount_options(struct super_block *sb, char *options) +{ + char *old = sb->s_options; + rcu_assign_pointer(sb->s_options, options); + if (old) { + synchronize_rcu(); + kfree(old); + } +} +EXPORT_SYMBOL(replace_mount_options); + +#ifdef CONFIG_PROC_FS +/* iterator; we want it to have access to namespace_sem, thus here... 
*/ static void *m_start(struct seq_file *m, loff_t *pos) { - struct mnt_namespace *n = m->private; + struct proc_mounts *p = proc_mounts(m); down_read(&namespace_sem); - return seq_list_start(&n->list, *pos); + if (p->cached_event == p->ns->event) { + void *v = p->cached_mount; + if (*pos == p->cached_index) + return v; + if (*pos == p->cached_index + 1) { + v = seq_list_next(v, &p->ns->list, &p->cached_index); + return p->cached_mount = v; + } + } + + p->cached_event = p->ns->event; + p->cached_mount = seq_list_start(&p->ns->list, *pos); + p->cached_index = *pos; + return p->cached_mount; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct mnt_namespace *n = m->private; + struct proc_mounts *p = proc_mounts(m); - return seq_list_next(v, &n->list, pos); + p->cached_mount = seq_list_next(v, &p->ns->list, pos); + p->cached_index = *pos; + return p->cached_mount; } static void m_stop(struct seq_file *m, void *v) @@ -383,100 +1124,20 @@ static void m_stop(struct seq_file *m, void *v) up_read(&namespace_sem); } -static int show_vfsmnt(struct seq_file *m, void *v) +static int m_show(struct seq_file *m, void *v) { - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; - static struct proc_fs_info { - int flag; - char *str; - } fs_info[] = { - { MS_SYNCHRONOUS, ",sync" }, - { MS_DIRSYNC, ",dirsync" }, - { MS_MANDLOCK, ",mand" }, - { 0, NULL } - }; - static struct proc_fs_info mnt_info[] = { - { MNT_NOSUID, ",nosuid" }, - { MNT_NODEV, ",nodev" }, - { MNT_NOEXEC, ",noexec" }, - { MNT_NOATIME, ",noatime" }, - { MNT_NODIRATIME, ",nodiratime" }, - { MNT_RELATIME, ",relatime" }, - { 0, NULL } - }; - struct proc_fs_info *fs_infop; - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - - mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); - seq_putc(m, ' '); - seq_path(m, &mnt_path, " \t\n\\"); - seq_putc(m, ' '); - mangle(m, mnt->mnt_sb->s_type->name); - if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) { - seq_putc(m, '.'); - mangle(m, mnt->mnt_sb->s_subtype); - } - seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? 
" ro" : " rw"); - for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { - if (mnt->mnt_sb->s_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } - for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { - if (mnt->mnt_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } - if (mnt->mnt_sb->s_op->show_options) - err = mnt->mnt_sb->s_op->show_options(m, mnt); - seq_puts(m, " 0 0\n"); - return err; + struct proc_mounts *p = proc_mounts(m); + struct mount *r = list_entry(v, struct mount, mnt_list); + return p->show(m, &r->mnt); } -struct seq_operations mounts_op = { +const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_vfsmnt -}; - -static int show_vfsstat(struct seq_file *m, void *v) -{ - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - int err = 0; - - /* device */ - if (mnt->mnt_devname) { - seq_puts(m, "device "); - mangle(m, mnt->mnt_devname); - } else - seq_puts(m, "no device"); - - /* mount point */ - seq_puts(m, " mounted on "); - seq_path(m, &mnt_path, " \t\n\\"); - seq_putc(m, ' '); - - /* file system type */ - seq_puts(m, "with fstype "); - mangle(m, mnt->mnt_sb->s_type->name); - - /* optional statistics */ - if (mnt->mnt_sb->s_op->show_stats) { - seq_putc(m, ' '); - err = mnt->mnt_sb->s_op->show_stats(m, mnt); - } - - seq_putc(m, '\n'); - return err; -} - -struct seq_operations mountstats_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_vfsstat, + .show = m_show, }; +#endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy @@ -486,18 +1147,21 @@ struct seq_operations mountstats_op = { * open files, pwds, chroots or sub mounts that are * busy. 
*/ -int may_umount_tree(struct vfsmount *mnt) +int may_umount_tree(struct vfsmount *m) { + struct mount *mnt = real_mount(m); int actual_refs = 0; int minimum_refs = 0; - struct vfsmount *p; + struct mount *p; + BUG_ON(!m); - spin_lock(&vfsmount_lock); + /* write lock needed for mnt_get_count */ + lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { - actual_refs += atomic_read(&p->mnt_count); + actual_refs += mnt_get_count(p); minimum_refs += 2; } - spin_unlock(&vfsmount_lock); + unlock_mount_hash(); if (actual_refs > minimum_refs) return 0; @@ -523,71 +1187,106 @@ EXPORT_SYMBOL(may_umount_tree); int may_umount(struct vfsmount *mnt) { int ret = 1; - spin_lock(&vfsmount_lock); - if (propagate_mount_busy(mnt, 2)) + down_read(&namespace_sem); + lock_mount_hash(); + if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; - spin_unlock(&vfsmount_lock); + unlock_mount_hash(); + up_read(&namespace_sem); return ret; } EXPORT_SYMBOL(may_umount); -void release_mounts(struct list_head *head) +static HLIST_HEAD(unmounted); /* protected by namespace_sem */ + +static void namespace_unlock(void) { - struct vfsmount *mnt; - while (!list_empty(head)) { - mnt = list_first_entry(head, struct vfsmount, mnt_hash); - list_del_init(&mnt->mnt_hash); - if (mnt->mnt_parent != mnt) { - struct dentry *dentry; - struct vfsmount *m; - spin_lock(&vfsmount_lock); - dentry = mnt->mnt_mountpoint; - m = mnt->mnt_parent; - mnt->mnt_mountpoint = mnt->mnt_root; - mnt->mnt_parent = mnt; - m->mnt_ghosts--; - spin_unlock(&vfsmount_lock); - dput(dentry); - mntput(m); - } - mntput(mnt); + struct mount *mnt; + struct hlist_head head = unmounted; + + if (likely(hlist_empty(&head))) { + up_write(&namespace_sem); + return; + } + + head.first->pprev = &head.first; + INIT_HLIST_HEAD(&unmounted); + + up_write(&namespace_sem); + + synchronize_rcu(); + + while (!hlist_empty(&head)) { + mnt = hlist_entry(head.first, struct mount, mnt_hash); + hlist_del_init(&mnt->mnt_hash); + if (mnt->mnt_ex_mountpoint.mnt) + path_put(&mnt->mnt_ex_mountpoint); + mntput(&mnt->mnt); } } -void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) +static inline void namespace_lock(void) +{ + down_write(&namespace_sem); +} + +/* + * mount_lock must be held + * namespace_sem must be held for write + * how = 0 => just this tree, don't propagate + * how = 1 => propagate; we know that nobody else has reference to any victims + * how = 2 => lazy umount + */ +void umount_tree(struct mount *mnt, int how) { - struct vfsmount *p; + HLIST_HEAD(tmp_list); + struct mount *p; + struct mount *last = NULL; - for (p = mnt; p; p = next_mnt(p, mnt)) - list_move(&p->mnt_hash, kill); + for (p = mnt; p; p = next_mnt(p, mnt)) { + hlist_del_init_rcu(&p->mnt_hash); + hlist_add_head(&p->mnt_hash, &tmp_list); + } - if (propagate) - propagate_umount(kill); + if (how) + propagate_umount(&tmp_list); - list_for_each_entry(p, kill, mnt_hash) { + hlist_for_each_entry(p, &tmp_list, mnt_hash) { list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; + if (how < 2) + p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; list_del_init(&p->mnt_child); - if (p->mnt_parent != p) { - p->mnt_parent->mnt_ghosts++; - p->mnt_mountpoint->d_mounted--; + if (mnt_has_parent(p)) { + put_mountpoint(p->mnt_mp); + /* move the reference to mountpoint into ->mnt_ex_mountpoint */ + p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; + p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; + p->mnt_mountpoint = p->mnt.mnt_root; + p->mnt_parent = p; + p->mnt_mp = 
NULL; } change_mnt_propagation(p, MS_PRIVATE); + last = p; + } + if (last) { + last->mnt_hash.next = unmounted.first; + unmounted.first = tmp_list.first; + unmounted.first->pprev = &unmounted.first; } } -static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts); +static void shrink_submounts(struct mount *mnt); -static int do_umount(struct vfsmount *mnt, int flags) +static int do_umount(struct mount *mnt, int flags) { - struct super_block *sb = mnt->mnt_sb; + struct super_block *sb = mnt->mnt.mnt_sb; int retval; - LIST_HEAD(umount_list); - retval = security_sb_umount(mnt, flags); + retval = security_sb_umount(&mnt->mnt, flags); if (retval) return retval; @@ -598,12 +1297,20 @@ static int do_umount(struct vfsmount *mnt, int flags) * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] */ if (flags & MNT_EXPIRE) { - if (mnt == current->fs->root.mnt || + if (&mnt->mnt == current->fs->root.mnt || flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; - if (atomic_read(&mnt->mnt_count) != 2) + /* + * probably don't strictly need the lock here if we examined + * all race cases, but it's a slowpath. + */ + lock_mount_hash(); + if (mnt_get_count(mnt) != 2) { + unlock_mount_hash(); return -EBUSY; + } + unlock_mount_hash(); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; @@ -619,10 +1326,9 @@ static int do_umount(struct vfsmount *mnt, int flags) * about for the moment. */ - lock_kernel(); - if (sb->s_op->umount_begin) - sb->s_op->umount_begin(mnt, flags); - unlock_kernel(); + if (flags & MNT_FORCE && sb->s_op->umount_begin) { + sb->s_op->umount_begin(sb); + } /* * No sense to grab the lock for this test, but test itself looks @@ -633,43 +1339,48 @@ static int do_umount(struct vfsmount *mnt, int flags) * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ - if (mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { + if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. */ down_write(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY)) { - lock_kernel(); - DQUOT_OFF(sb); + if (!(sb->s_flags & MS_RDONLY)) retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); - unlock_kernel(); - } up_write(&sb->s_umount); return retval; } - down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + namespace_lock(); + lock_mount_hash(); event++; - if (!(flags & MNT_DETACH)) - shrink_submounts(mnt, &umount_list); - - retval = -EBUSY; - if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) { + if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 1, &umount_list); + umount_tree(mnt, 2); retval = 0; + } else { + shrink_submounts(mnt); + retval = -EBUSY; + if (!propagate_mount_busy(mnt, 2)) { + if (!list_empty(&mnt->mnt_list)) + umount_tree(mnt, 1); + retval = 0; + } } - spin_unlock(&vfsmount_lock); - if (retval) - security_sb_umount_busy(mnt); - up_write(&namespace_sem); - release_mounts(&umount_list); + unlock_mount_hash(); + namespace_unlock(); return retval; } +/* + * Is the caller allowed to modify his namespace? + */ +static inline bool may_mount(void) +{ + return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); +} + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. @@ -678,29 +1389,39 @@ static int do_umount(struct vfsmount *mnt, int flags) * unixes. 
Our API is identical to OSF/1 to avoid making a mess of AMD */ -asmlinkage long sys_umount(char __user * name, int flags) +SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { - struct nameidata nd; + struct path path; + struct mount *mnt; int retval; + int lookup_flags = 0; + + if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) + return -EINVAL; + + if (!may_mount()) + return -EPERM; - retval = __user_walk(name, LOOKUP_FOLLOW, &nd); + if (!(flags & UMOUNT_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; + mnt = real_mount(path.mnt); retval = -EINVAL; - if (nd.path.dentry != nd.path.mnt->mnt_root) + if (path.dentry != path.mnt->mnt_root) goto dput_and_out; - if (!check_mnt(nd.path.mnt)) + if (!check_mnt(mnt)) goto dput_and_out; - - retval = -EPERM; - if (!capable(CAP_SYS_ADMIN)) + if (mnt->mnt.mnt_flags & MNT_LOCKED) goto dput_and_out; - retval = do_umount(nd.path.mnt, flags); + retval = do_umount(mnt, flags); dput_and_out: /* we mustn't call path_put() as that would clear mnt_expiry_mark */ - dput(nd.path.dentry); - mntput_no_expire(nd.path.mnt); + dput(path.dentry); + mntput_no_expire(mnt); out: return retval; } @@ -710,63 +1431,74 @@ out: /* * The 2.0 compatible umount. No flags. */ -asmlinkage long sys_oldumount(char __user * name) +SYSCALL_DEFINE1(oldumount, char __user *, name) { return sys_umount(name, 0); } #endif -static int mount_is_safe(struct nameidata *nd) +static bool is_mnt_ns_file(struct dentry *dentry) { - if (capable(CAP_SYS_ADMIN)) - return 0; - return -EPERM; -#ifdef notyet - if (S_ISLNK(nd->path.dentry->d_inode->i_mode)) - return -EPERM; - if (nd->path.dentry->d_inode->i_mode & S_ISVTX) { - if (current->uid != nd->path.dentry->d_inode->i_uid) - return -EPERM; - } - if (vfs_permission(nd, MAY_WRITE)) - return -EPERM; - return 0; -#endif + /* Is this a proxy for a mount namespace? */ + struct inode *inode = dentry->d_inode; + struct proc_ns *ei; + + if (!proc_ns_inode(inode)) + return false; + + ei = get_proc_ns(inode); + if (ei->ns_ops != &mntns_operations) + return false; + + return true; } -static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry) +static bool mnt_ns_loop(struct dentry *dentry) { - while (1) { - if (d == dentry) - return 1; - if (d == NULL || d == d->d_parent) - return 0; - d = d->d_parent; - } + /* Could bind mounting the mount namespace inode cause a + * mount namespace loop? 
+ */ + struct mnt_namespace *mnt_ns; + if (!is_mnt_ns_file(dentry)) + return false; + + mnt_ns = get_proc_ns(dentry->d_inode)->ns; + return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } -struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, +struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { - struct vfsmount *res, *p, *q, *r, *s; - struct path path; + struct mount *res, *p, *q, *r, *parent; + + if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) + return ERR_PTR(-EINVAL); - if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) - return NULL; + if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) + return ERR_PTR(-EINVAL); res = q = clone_mnt(mnt, dentry, flag); - if (!q) - goto Enomem; + if (IS_ERR(q)) + return q; + + q->mnt.mnt_flags &= ~MNT_LOCKED; q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { - if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry)) + struct mount *s; + if (!is_subdir(r->mnt_mountpoint, dentry)) continue; for (s = r; s; s = next_mnt(s, r)) { - if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(s)) { + if (!(flag & CL_COPY_UNBINDABLE) && + IS_MNT_UNBINDABLE(s)) { + s = skip_mnt_tree(s); + continue; + } + if (!(flag & CL_COPY_MNT_NS_FILE) && + is_mnt_ns_file(s->mnt.mnt_root)) { s = skip_mnt_tree(s); continue; } @@ -775,47 +1507,89 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, q = q->mnt_parent; } p = s; - path.mnt = q; - path.dentry = p->mnt_mountpoint; - q = clone_mnt(p, p->mnt_root, flag); - if (!q) - goto Enomem; - spin_lock(&vfsmount_lock); + parent = q; + q = clone_mnt(p, p->mnt.mnt_root, flag); + if (IS_ERR(q)) + goto out; + lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, &path); - spin_unlock(&vfsmount_lock); + attach_mnt(q, parent, p->mnt_mp); + unlock_mount_hash(); } } return res; -Enomem: +out: if (res) { - LIST_HEAD(umount_list); - spin_lock(&vfsmount_lock); - umount_tree(res, 0, &umount_list); - spin_unlock(&vfsmount_lock); - release_mounts(&umount_list); + lock_mount_hash(); + umount_tree(res, 0); + unlock_mount_hash(); } - return NULL; + return q; } -struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) +/* Caller should check returned pointer for errors */ + +struct vfsmount *collect_mounts(struct path *path) { - struct vfsmount *tree; - down_read(&namespace_sem); - tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); - up_read(&namespace_sem); - return tree; + struct mount *tree; + namespace_lock(); + tree = copy_tree(real_mount(path->mnt), path->dentry, + CL_COPY_ALL | CL_PRIVATE); + namespace_unlock(); + if (IS_ERR(tree)) + return ERR_CAST(tree); + return &tree->mnt; } void drop_collected_mounts(struct vfsmount *mnt) { - LIST_HEAD(umount_list); - down_read(&namespace_sem); - spin_lock(&vfsmount_lock); - umount_tree(mnt, 0, &umount_list); - spin_unlock(&vfsmount_lock); - up_read(&namespace_sem); - release_mounts(&umount_list); + namespace_lock(); + lock_mount_hash(); + umount_tree(real_mount(mnt), 0); + unlock_mount_hash(); + namespace_unlock(); +} + +int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + struct vfsmount *root) +{ + struct mount *mnt; + int res = f(root, arg); + if (res) + return res; + list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { + res = f(&mnt->mnt, arg); + if (res) + return res; + } + return 0; +} + +static void cleanup_group_ids(struct mount *mnt, struct mount *end) +{ + struct mount *p; + + for (p 
= mnt; p != end; p = next_mnt(p, mnt)) { + if (p->mnt_group_id && !IS_MNT_SHARED(p)) + mnt_release_group_id(p); + } +} + +static int invent_group_ids(struct mount *mnt, bool recurse) +{ + struct mount *p; + + for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { + if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + int err = mnt_alloc_group_id(p); + if (err) { + cleanup_group_ids(mnt, p); + return err; + } + } + } + + return 0; } /* @@ -881,178 +1655,300 @@ void drop_collected_mounts(struct vfsmount *mnt) * Must be called without spinlocks held, since this function can sleep * in allocations. */ -static int attach_recursive_mnt(struct vfsmount *source_mnt, - struct path *path, struct path *parent_path) -{ - LIST_HEAD(tree_list); - struct vfsmount *dest_mnt = path->mnt; - struct dentry *dest_dentry = path->dentry; - struct vfsmount *child, *p; - - if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list)) - return -EINVAL; +static int attach_recursive_mnt(struct mount *source_mnt, + struct mount *dest_mnt, + struct mountpoint *dest_mp, + struct path *parent_path) +{ + HLIST_HEAD(tree_list); + struct mount *child, *p; + struct hlist_node *n; + int err; if (IS_MNT_SHARED(dest_mnt)) { + err = invent_group_ids(source_mnt, true); + if (err) + goto out; + err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + lock_mount_hash(); + if (err) + goto out_cleanup_ids; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); + } else { + lock_mount_hash(); } - - spin_lock(&vfsmount_lock); if (parent_path) { detach_mnt(source_mnt, parent_path); - attach_mnt(source_mnt, path); - touch_mnt_namespace(current->nsproxy->mnt_ns); + attach_mnt(source_mnt, dest_mnt, dest_mp); + touch_mnt_namespace(source_mnt->mnt_ns); } else { - mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); - commit_tree(source_mnt); + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + commit_tree(source_mnt, NULL); } - list_for_each_entry_safe(child, p, &tree_list, mnt_hash) { - list_del_init(&child->mnt_hash); - commit_tree(child); + hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { + struct mount *q; + hlist_del_init(&child->mnt_hash); + q = __lookup_mnt_last(&child->mnt_parent->mnt, + child->mnt_mountpoint); + commit_tree(child, q); } - spin_unlock(&vfsmount_lock); + unlock_mount_hash(); + return 0; + + out_cleanup_ids: + while (!hlist_empty(&tree_list)) { + child = hlist_entry(tree_list.first, struct mount, mnt_hash); + umount_tree(child, 0); + } + unlock_mount_hash(); + cleanup_group_ids(source_mnt, NULL); + out: + return err; } -static int graft_tree(struct vfsmount *mnt, struct nameidata *nd) +static struct mountpoint *lock_mount(struct path *path) { - int err; - if (mnt->mnt_sb->s_flags & MS_NOUSER) + struct vfsmount *mnt; + struct dentry *dentry = path->dentry; +retry: + mutex_lock(&dentry->d_inode->i_mutex); + if (unlikely(cant_mount(dentry))) { + mutex_unlock(&dentry->d_inode->i_mutex); + return ERR_PTR(-ENOENT); + } + namespace_lock(); + mnt = lookup_mnt(path); + if (likely(!mnt)) { + struct mountpoint *mp = new_mountpoint(dentry); + if (IS_ERR(mp)) { + namespace_unlock(); + mutex_unlock(&dentry->d_inode->i_mutex); + return mp; + } + return mp; + } + namespace_unlock(); + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); + path->mnt = mnt; + dentry = path->dentry = dget(mnt->mnt_root); + goto retry; +} + +static void unlock_mount(struct mountpoint *where) +{ + struct dentry *dentry = where->m_dentry; + put_mountpoint(where); + namespace_unlock(); + 
mutex_unlock(&dentry->d_inode->i_mutex); +} + +static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) +{ + if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) return -EINVAL; - if (S_ISDIR(nd->path.dentry->d_inode->i_mode) != - S_ISDIR(mnt->mnt_root->d_inode->i_mode)) + if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != + S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) return -ENOTDIR; - err = -ENOENT; - mutex_lock(&nd->path.dentry->d_inode->i_mutex); - if (IS_DEADDIR(nd->path.dentry->d_inode)) - goto out_unlock; + return attach_recursive_mnt(mnt, p, mp, NULL); +} - err = security_sb_check_sb(mnt, nd); - if (err) - goto out_unlock; +/* + * Sanity check the flags to change_mnt_propagation. + */ - err = -ENOENT; - if (IS_ROOT(nd->path.dentry) || !d_unhashed(nd->path.dentry)) - err = attach_recursive_mnt(mnt, &nd->path, NULL); -out_unlock: - mutex_unlock(&nd->path.dentry->d_inode->i_mutex); - if (!err) - security_sb_post_addmount(mnt, nd); - return err; +static int flags_to_propagation_type(int flags) +{ + int type = flags & ~(MS_REC | MS_SILENT); + + /* Fail if any non-propagation flags are set */ + if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return 0; + /* Only one propagation flag should be set */ + if (!is_power_of_2(type)) + return 0; + return type; } /* * recursively change the type of the mountpoint. - * noinline this do_mount helper to save do_mount stack space. */ -static noinline int do_change_type(struct nameidata *nd, int flag) +static int do_change_type(struct path *path, int flag) { - struct vfsmount *m, *mnt = nd->path.mnt; + struct mount *m; + struct mount *mnt = real_mount(path->mnt); int recurse = flag & MS_REC; - int type = flag & ~MS_REC; + int type; + int err = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; - if (nd->path.dentry != nd->path.mnt->mnt_root) + type = flags_to_propagation_type(flag); + if (!type) return -EINVAL; - down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + namespace_lock(); + if (type == MS_SHARED) { + err = invent_group_ids(mnt, recurse); + if (err) + goto out_unlock; + } + + lock_mount_hash(); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - spin_unlock(&vfsmount_lock); - up_write(&namespace_sem); - return 0; + unlock_mount_hash(); + + out_unlock: + namespace_unlock(); + return err; +} + +static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + struct mount *child; + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, dentry)) + continue; + + if (child->mnt.mnt_flags & MNT_LOCKED) + return true; + } + return false; } /* * do loopback mount. - * noinline this do_mount helper to save do_mount stack space. 
*/ -static noinline int do_loopback(struct nameidata *nd, char *old_name, +static int do_loopback(struct path *path, const char *old_name, int recurse) { - struct nameidata old_nd; - struct vfsmount *mnt = NULL; - int err = mount_is_safe(nd); - if (err) - return err; + struct path old_path; + struct mount *mnt = NULL, *old, *parent; + struct mountpoint *mp; + int err; if (!old_name || !*old_name) return -EINVAL; - err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); if (err) return err; - down_write(&namespace_sem); err = -EINVAL; - if (IS_MNT_UNBINDABLE(old_nd.path.mnt)) - goto out; + if (mnt_ns_loop(old_path.dentry)) + goto out; - if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt)) + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) goto out; - err = -ENOMEM; + old = real_mount(old_path.mnt); + parent = real_mount(path->mnt); + + err = -EINVAL; + if (IS_MNT_UNBINDABLE(old)) + goto out2; + + if (!check_mnt(parent) || !check_mnt(old)) + goto out2; + + if (!recurse && has_locked_children(old, old_path.dentry)) + goto out2; + if (recurse) - mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0); + mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); else - mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0); + mnt = clone_mnt(old, old_path.dentry, 0); - if (!mnt) - goto out; + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out2; + } - err = graft_tree(mnt, nd); + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + + err = graft_tree(mnt, parent, mp); if (err) { - LIST_HEAD(umount_list); - spin_lock(&vfsmount_lock); - umount_tree(mnt, 0, &umount_list); - spin_unlock(&vfsmount_lock); - release_mounts(&umount_list); + lock_mount_hash(); + umount_tree(mnt, 0); + unlock_mount_hash(); } - +out2: + unlock_mount(mp); out: - up_write(&namespace_sem); - path_put(&old_nd.path); + path_put(&old_path); return err; } +static int change_mount_flags(struct vfsmount *mnt, int ms_flags) +{ + int error = 0; + int readonly_request = 0; + + if (ms_flags & MS_RDONLY) + readonly_request = 1; + if (readonly_request == __mnt_is_readonly(mnt)) + return 0; + + if (mnt->mnt_flags & MNT_LOCK_READONLY) + return -EPERM; + + if (readonly_request) + error = mnt_make_readonly(real_mount(mnt)); + else + __mnt_unmake_readonly(real_mount(mnt)); + return error; +} + /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. - * noinline this do_mount helper to save do_mount stack space. 
*/ -static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, +static int do_remount(struct path *path, int flags, int mnt_flags, void *data) { int err; - struct super_block *sb = nd->path.mnt->mnt_sb; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + struct super_block *sb = path->mnt->mnt_sb; + struct mount *mnt = real_mount(path->mnt); - if (!check_mnt(nd->path.mnt)) + if (!check_mnt(mnt)) return -EINVAL; - if (nd->path.dentry != nd->path.mnt->mnt_root) + if (path->dentry != path->mnt->mnt_root) return -EINVAL; + err = security_sb_remount(sb, data); + if (err) + return err; + down_write(&sb->s_umount); - err = do_remount_sb(sb, flags, data, 0); - if (!err) - nd->path.mnt->mnt_flags = mnt_flags; + if (flags & MS_BIND) + err = change_mount_flags(path->mnt, flags); + else if (!capable(CAP_SYS_ADMIN)) + err = -EPERM; + else + err = do_remount_sb(sb, flags, data, 0); + if (!err) { + lock_mount_hash(); + mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; + mnt->mnt.mnt_flags = mnt_flags; + touch_mnt_namespace(mnt->mnt_ns); + unlock_mount_hash(); + } up_write(&sb->s_umount); - if (!err) - security_sb_post_remount(nd->path.mnt, flags, data); return err; } -static inline int tree_contains_unbindable(struct vfsmount *mnt) +static inline int tree_contains_unbindable(struct mount *mnt) { - struct vfsmount *p; + struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { if (IS_MNT_UNBINDABLE(p)) return 1; @@ -1060,153 +1956,235 @@ static inline int tree_contains_unbindable(struct vfsmount *mnt) return 0; } -/* - * noinline this do_mount helper to save do_mount stack space. - */ -static noinline int do_move_mount(struct nameidata *nd, char *old_name) +static int do_move_mount(struct path *path, const char *old_name) { - struct nameidata old_nd; - struct path parent_path; - struct vfsmount *p; - int err = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + struct path old_path, parent_path; + struct mount *p; + struct mount *old; + struct mountpoint *mp; + int err; if (!old_name || !*old_name) return -EINVAL; - err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd); + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; - down_write(&namespace_sem); - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) - ; - err = -EINVAL; - if (!check_mnt(nd->path.mnt) || !check_mnt(old_nd.path.mnt)) + mp = lock_mount(path); + err = PTR_ERR(mp); + if (IS_ERR(mp)) goto out; - err = -ENOENT; - mutex_lock(&nd->path.dentry->d_inode->i_mutex); - if (IS_DEADDIR(nd->path.dentry->d_inode)) + old = real_mount(old_path.mnt); + p = real_mount(path->mnt); + + err = -EINVAL; + if (!check_mnt(p) || !check_mnt(old)) goto out1; - if (!IS_ROOT(nd->path.dentry) && d_unhashed(nd->path.dentry)) + if (old->mnt.mnt_flags & MNT_LOCKED) goto out1; err = -EINVAL; - if (old_nd.path.dentry != old_nd.path.mnt->mnt_root) + if (old_path.dentry != old_path.mnt->mnt_root) goto out1; - if (old_nd.path.mnt == old_nd.path.mnt->mnt_parent) + if (!mnt_has_parent(old)) goto out1; - if (S_ISDIR(nd->path.dentry->d_inode->i_mode) != - S_ISDIR(old_nd.path.dentry->d_inode->i_mode)) + if (S_ISDIR(path->dentry->d_inode->i_mode) != + S_ISDIR(old_path.dentry->d_inode->i_mode)) goto out1; /* * Don't move a mount residing in a shared parent. */ - if (old_nd.path.mnt->mnt_parent && - IS_MNT_SHARED(old_nd.path.mnt->mnt_parent)) + if (IS_MNT_SHARED(old->mnt_parent)) goto out1; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. 
*/ - if (IS_MNT_SHARED(nd->path.mnt) && - tree_contains_unbindable(old_nd.path.mnt)) + if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) goto out1; err = -ELOOP; - for (p = nd->path.mnt; p->mnt_parent != p; p = p->mnt_parent) - if (p == old_nd.path.mnt) + for (; mnt_has_parent(p); p = p->mnt_parent) + if (p == old) goto out1; - err = attach_recursive_mnt(old_nd.path.mnt, &nd->path, &parent_path); + err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path); if (err) goto out1; /* if the mount is moved, it should no longer be expire * automatically */ - list_del_init(&old_nd.path.mnt->mnt_expire); + list_del_init(&old->mnt_expire); out1: - mutex_unlock(&nd->path.dentry->d_inode->i_mutex); + unlock_mount(mp); out: - up_write(&namespace_sem); if (!err) path_put(&parent_path); - path_put(&old_nd.path); + path_put(&old_path); return err; } -/* - * create a new mount for userspace and request it to be added into the - * namespace's tree - * noinline this do_mount helper to save do_mount stack space. - */ -static noinline int do_new_mount(struct nameidata *nd, char *type, int flags, - int mnt_flags, char *name, void *data) +static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) { - struct vfsmount *mnt; - - if (!type || !memchr(type, 0, PAGE_SIZE)) - return -EINVAL; - - /* we need capabilities... */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + int err; + const char *subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + err = -EINVAL; + if (!subtype[0]) + goto err; + } else + subtype = ""; - mnt = do_kern_mount(type, flags, name, data); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); + mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); + err = -ENOMEM; + if (!mnt->mnt_sb->s_subtype) + goto err; + return mnt; - return do_add_mount(mnt, nd, mnt_flags, NULL); + err: + mntput(mnt); + return ERR_PTR(err); } /* * add a mount into a namespace's mount tree - * - provide the option of adding the new mount to an expiration list */ -int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, - int mnt_flags, struct list_head *fslist) +static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) { + struct mountpoint *mp; + struct mount *parent; int err; - down_write(&namespace_sem); - /* Something was mounted here while we slept */ - while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) - ; + mnt_flags &= ~MNT_INTERNAL_FLAGS; + + mp = lock_mount(path); + if (IS_ERR(mp)) + return PTR_ERR(mp); + + parent = real_mount(path->mnt); err = -EINVAL; - if (!check_mnt(nd->path.mnt)) - goto unlock; + if (unlikely(!check_mnt(parent))) { + /* that's acceptable only for automounts done in private ns */ + if (!(mnt_flags & MNT_SHRINKABLE)) + goto unlock; + /* ... 
and for those we'd better have mountpoint still alive */ + if (!parent->mnt_ns) + goto unlock; + } /* Refuse the same filesystem on the same mount point */ err = -EBUSY; - if (nd->path.mnt->mnt_sb == newmnt->mnt_sb && - nd->path.mnt->mnt_root == nd->path.dentry) + if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && + path->mnt->mnt_root == path->dentry) goto unlock; err = -EINVAL; - if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode)) + if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) goto unlock; - newmnt->mnt_flags = mnt_flags; - if ((err = graft_tree(newmnt, nd))) - goto unlock; + newmnt->mnt.mnt_flags = mnt_flags; + err = graft_tree(newmnt, parent, mp); - if (fslist) /* add to the specified expiration list */ - list_add_tail(&newmnt->mnt_expire, fslist); +unlock: + unlock_mount(mp); + return err; +} - up_write(&namespace_sem); - return 0; +/* + * create a new mount for userspace and request it to be added into the + * namespace's tree + */ +static int do_new_mount(struct path *path, const char *fstype, int flags, + int mnt_flags, const char *name, void *data) +{ + struct file_system_type *type; + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct vfsmount *mnt; + int err; -unlock: - up_write(&namespace_sem); - mntput(newmnt); + if (!fstype) + return -EINVAL; + + type = get_fs_type(fstype); + if (!type) + return -ENODEV; + + if (user_ns != &init_user_ns) { + if (!(type->fs_flags & FS_USERNS_MOUNT)) { + put_filesystem(type); + return -EPERM; + } + /* Only in special cases allow devices from mounts + * created outside the initial user namespace. + */ + if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { + flags |= MS_NODEV; + mnt_flags |= MNT_NODEV; + } + } + + mnt = vfs_kern_mount(type, flags, name, data); + if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && + !mnt->mnt_sb->s_subtype) + mnt = fs_set_subtype(mnt, fstype); + + put_filesystem(type); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + err = do_add_mount(real_mount(mnt), path, mnt_flags); + if (err) + mntput(mnt); return err; } -EXPORT_SYMBOL_GPL(do_add_mount); +int finish_automount(struct vfsmount *m, struct path *path) +{ + struct mount *mnt = real_mount(m); + int err; + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(mnt) < 2); + + if (m->mnt_sb == path->mnt->mnt_sb && + m->mnt_root == path->dentry) { + err = -ELOOP; + goto fail; + } + + err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (!err) + return 0; +fail: + /* remove m from any expiration list it may be on */ + if (!list_empty(&mnt->mnt_expire)) { + namespace_lock(); + list_del_init(&mnt->mnt_expire); + namespace_unlock(); + } + mntput(m); + mntput(m); + return err; +} + +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. 
+ */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + namespace_lock(); + + list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); + + namespace_unlock(); +} +EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any @@ -1215,15 +2193,14 @@ EXPORT_SYMBOL_GPL(do_add_mount); */ void mark_mounts_for_expiry(struct list_head *mounts) { - struct vfsmount *mnt, *next; + struct mount *mnt, *next; LIST_HEAD(graveyard); - LIST_HEAD(umounts); if (list_empty(mounts)) return; - down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + namespace_lock(); + lock_mount_hash(); /* extract from the expiration list every vfsmount that matches the * following criteria: @@ -1238,14 +2215,12 @@ void mark_mounts_for_expiry(struct list_head *mounts) list_move(&mnt->mnt_expire, &graveyard); } while (!list_empty(&graveyard)) { - mnt = list_first_entry(&graveyard, struct vfsmount, mnt_expire); + mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); - umount_tree(mnt, 1, &umounts); + umount_tree(mnt, 1); } - spin_unlock(&vfsmount_lock); - up_write(&namespace_sem); - - release_mounts(&umounts); + unlock_mount_hash(); + namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); @@ -1256,9 +2231,9 @@ EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ -static int select_submounts(struct vfsmount *parent, struct list_head *graveyard) +static int select_submounts(struct mount *parent, struct list_head *graveyard) { - struct vfsmount *this_parent = parent; + struct mount *this_parent = parent; struct list_head *next; int found = 0; @@ -1267,10 +2242,10 @@ repeat: resume: while (next != &this_parent->mnt_mounts) { struct list_head *tmp = next; - struct vfsmount *mnt = list_entry(tmp, struct vfsmount, mnt_child); + struct mount *mnt = list_entry(tmp, struct mount, mnt_child); next = tmp->next; - if (!(mnt->mnt_flags & MNT_SHRINKABLE)) + if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) continue; /* * Descend a level if the d_mounts list is non-empty. @@ -1299,19 +2274,21 @@ resume: /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint + * + * mount_lock must be held for write */ -static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) +static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); - struct vfsmount *m; + struct mount *m; /* extract submounts of 'mountpoint' from the expiration list */ while (select_submounts(mnt, &graveyard)) { while (!list_empty(&graveyard)) { - m = list_first_entry(&graveyard, struct vfsmount, + m = list_first_entry(&graveyard, struct mount, mnt_expire); - touch_mnt_namespace(mnt->mnt_ns); - umount_tree(mnt, 1, umounts); + touch_mnt_namespace(m->mnt_ns); + umount_tree(m, 1); } } } @@ -1377,6 +2354,23 @@ int copy_mount_options(const void __user * data, unsigned long *where) return 0; } +int copy_mount_string(const void __user *data, char **where) +{ + char *tmp; + + if (!data) { + *where = NULL; + return 0; + } + + tmp = strndup_user(data, PAGE_SIZE); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + *where = tmp; + return 0; +} + /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). 
@@ -1391,10 +2385,10 @@ int copy_mount_options(const void __user * data, unsigned long *where) * Therefore, if this magic number is present, it carries no information * and must be discarded. */ -long do_mount(char *dev_name, char *dir_name, char *type_page, - unsigned long flags, void *data_page) +long do_mount(const char *dev_name, const char *dir_name, + const char *type_page, unsigned long flags, void *data_page) { - struct nameidata nd; + struct path path; int retval = 0; int mnt_flags = 0; @@ -1406,12 +2400,26 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) return -EINVAL; - if (dev_name && !memchr(dev_name, 0, PAGE_SIZE)) - return -EINVAL; if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; + /* ... and get the mountpoint */ + retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); + if (retval) + return retval; + + retval = security_sb_mount(dev_name, &path, + type_page, flags, data_page); + if (!retval && !may_mount()) + retval = -EPERM; + if (retval) + goto dput_out; + + /* Default to relatime unless overriden */ + if (!(flags & MS_NOATIME)) + mnt_flags |= MNT_RELATIME; + /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; @@ -1423,227 +2431,257 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; - if (flags & MS_RELATIME) - mnt_flags |= MNT_RELATIME; - - flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); + if (flags & MS_STRICTATIME) + mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; - /* ... and get the mountpoint */ - retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); - if (retval) - return retval; - - retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page); - if (retval) - goto dput_out; + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); if (flags & MS_REMOUNT) - retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, + retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&path, dev_name, flags & MS_REC); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) - retval = do_change_type(&nd, flags); + retval = do_change_type(&path, flags); else if (flags & MS_MOVE) - retval = do_move_mount(&nd, dev_name); + retval = do_move_mount(&path, dev_name); else - retval = do_new_mount(&nd, type_page, flags, mnt_flags, + retval = do_new_mount(&path, type_page, flags, mnt_flags, dev_name, data_page); dput_out: - path_put(&nd.path); + path_put(&path); return retval; } +static void free_mnt_ns(struct mnt_namespace *ns) +{ + proc_free_inum(ns->proc_inum); + put_user_ns(ns->user_ns); + kfree(ns); +} + /* - * Allocate a new namespace structure and populate it with contents - * copied from the namespace of the passed in task structure. + * Assign a sequence number so we can detect when we attempt to bind + * mount a reference to an older mount namespace into the current + * mount namespace, preventing reference counting loops. A 64bit + * number incrementing at 10Ghz will take 12,427 years to wrap which + * is effectively never, so we can ignore the possibility. 
*/ -static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, - struct fs_struct *fs) +static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); + +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; - struct vfsmount *p, *q; + int ret; new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); - + ret = proc_alloc_inum(&new_ns->proc_inum); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); + new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); new_ns->event = 0; + new_ns->user_ns = get_user_ns(user_ns); + return new_ns; +} - down_write(&namespace_sem); +struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, + struct user_namespace *user_ns, struct fs_struct *new_fs) +{ + struct mnt_namespace *new_ns; + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; + struct mount *p, *q; + struct mount *old; + struct mount *new; + int copy_flags; + + BUG_ON(!ns); + + if (likely(!(flags & CLONE_NEWNS))) { + get_mnt_ns(ns); + return ns; + } + + old = ns->root; + + new_ns = alloc_mnt_ns(user_ns); + if (IS_ERR(new_ns)) + return new_ns; + + namespace_lock(); /* First pass: copy the tree topology */ - new_ns->root = copy_tree(mnt_ns->root, mnt_ns->root->mnt_root, - CL_COPY_ALL | CL_EXPIRE); - if (!new_ns->root) { - up_write(&namespace_sem); - kfree(new_ns); - return ERR_PTR(-ENOMEM);; + copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; + if (user_ns != ns->user_ns) + copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; + new = copy_tree(old, old->mnt.mnt_root, copy_flags); + if (IS_ERR(new)) { + namespace_unlock(); + free_mnt_ns(new_ns); + return ERR_CAST(new); } - spin_lock(&vfsmount_lock); - list_add_tail(&new_ns->list, &new_ns->root->mnt_list); - spin_unlock(&vfsmount_lock); + new_ns->root = new; + list_add_tail(&new_ns->list, &new->mnt_list); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. 
*/ - p = mnt_ns->root; - q = new_ns->root; + p = old; + q = new; while (p) { q->mnt_ns = new_ns; - if (fs) { - if (p == fs->root.mnt) { - rootmnt = p; - fs->root.mnt = mntget(q); + if (new_fs) { + if (&p->mnt == new_fs->root.mnt) { + new_fs->root.mnt = mntget(&q->mnt); + rootmnt = &p->mnt; } - if (p == fs->pwd.mnt) { - pwdmnt = p; - fs->pwd.mnt = mntget(q); - } - if (p == fs->altroot.mnt) { - altrootmnt = p; - fs->altroot.mnt = mntget(q); + if (&p->mnt == new_fs->pwd.mnt) { + new_fs->pwd.mnt = mntget(&q->mnt); + pwdmnt = &p->mnt; } } - p = next_mnt(p, mnt_ns->root); - q = next_mnt(q, new_ns->root); + p = next_mnt(p, old); + q = next_mnt(q, new); + if (!q) + break; + while (p->mnt.mnt_root != q->mnt.mnt_root) + p = next_mnt(p, old); } - up_write(&namespace_sem); + namespace_unlock(); if (rootmnt) mntput(rootmnt); if (pwdmnt) mntput(pwdmnt); - if (altrootmnt) - mntput(altrootmnt); return new_ns; } -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, - struct fs_struct *new_fs) -{ - struct mnt_namespace *new_ns; +/** + * create_mnt_ns - creates a private namespace and adds a root filesystem + * @mnt: pointer to the new root filesystem mountpoint + */ +static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) +{ + struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); + if (!IS_ERR(new_ns)) { + struct mount *mnt = real_mount(m); + mnt->mnt_ns = new_ns; + new_ns->root = mnt; + list_add(&mnt->mnt_list, &new_ns->list); + } else { + mntput(m); + } + return new_ns; +} - BUG_ON(!ns); - get_mnt_ns(ns); +struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +{ + struct mnt_namespace *ns; + struct super_block *s; + struct path path; + int err; - if (!(flags & CLONE_NEWNS)) - return ns; + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) + return ERR_CAST(ns); - new_ns = dup_mnt_ns(ns, new_fs); + err = vfs_path_lookup(mnt->mnt_root, mnt, + name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); - return new_ns; -} -asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name, - char __user * type, unsigned long flags, - void __user * data) -{ - int retval; + if (err) + return ERR_PTR(err); + + /* trade a vfsmount reference for active sb one */ + s = path.mnt->mnt_sb; + atomic_inc(&s->s_active); + mntput(path.mnt); + /* lock the sucker */ + down_write(&s->s_umount); + /* ... 
and return the root of (sub)tree on it */ + return path.dentry; +} +EXPORT_SYMBOL(mount_subtree); + +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) +{ + int ret; + char *kernel_type; + struct filename *kernel_dir; + char *kernel_dev; unsigned long data_page; - unsigned long type_page; - unsigned long dev_page; - char *dir_page; - - retval = copy_mount_options(type, &type_page); - if (retval < 0) - return retval; - dir_page = getname(dir_name); - retval = PTR_ERR(dir_page); - if (IS_ERR(dir_page)) - goto out1; - - retval = copy_mount_options(dev_name, &dev_page); - if (retval < 0) - goto out2; + ret = copy_mount_string(type, &kernel_type); + if (ret < 0) + goto out_type; - retval = copy_mount_options(data, &data_page); - if (retval < 0) - goto out3; + kernel_dir = getname(dir_name); + if (IS_ERR(kernel_dir)) { + ret = PTR_ERR(kernel_dir); + goto out_dir; + } - lock_kernel(); - retval = do_mount((char *)dev_page, dir_page, (char *)type_page, - flags, (void *)data_page); - unlock_kernel(); - free_page(data_page); + ret = copy_mount_string(dev_name, &kernel_dev); + if (ret < 0) + goto out_dev; -out3: - free_page(dev_page); -out2: - putname(dir_page); -out1: - free_page(type_page); - return retval; -} + ret = copy_mount_options(data, &data_page); + if (ret < 0) + goto out_data; -/* - * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_root(struct fs_struct *fs, struct path *path) -{ - struct path old_root; + ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags, + (void *) data_page); - write_lock(&fs->lock); - old_root = fs->root; - fs->root = *path; - path_get(path); - write_unlock(&fs->lock); - if (old_root.dentry) - path_put(&old_root); + free_page(data_page); +out_data: + kfree(kernel_dev); +out_dev: + putname(kernel_dir); +out_dir: + kfree(kernel_type); +out_type: + return ret; } /* - * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. 
+ * Return true if path is reachable from root + * + * namespace_sem or mount_lock is held */ -void set_fs_pwd(struct fs_struct *fs, struct path *path) +bool is_path_reachable(struct mount *mnt, struct dentry *dentry, + const struct path *root) { - struct path old_pwd; - - write_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - write_unlock(&fs->lock); - - if (old_pwd.dentry) - path_put(&old_pwd); + while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } -static void chroot_fs_refs(struct path *old_root, struct path *new_root) +int path_is_under(struct path *path1, struct path *path2) { - struct task_struct *g, *p; - struct fs_struct *fs; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - fs = p->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(p); - if (fs->root.dentry == old_root->dentry - && fs->root.mnt == old_root->mnt) - set_fs_root(fs, new_root); - if (fs->pwd.dentry == old_root->dentry - && fs->pwd.mnt == old_root->mnt) - set_fs_pwd(fs, new_root); - put_fs_struct(fs); - } else - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + int res; + read_seqlock_excl(&mount_lock); + res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); + read_sequnlock_excl(&mount_lock); + return res; } +EXPORT_SYMBOL(path_is_under); /* * pivot_root Semantics: @@ -1670,111 +2708,97 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root) * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root * first. */ -asmlinkage long sys_pivot_root(const char __user * new_root, - const char __user * put_old) +SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, + const char __user *, put_old) { - struct vfsmount *tmp; - struct nameidata new_nd, old_nd, user_nd; - struct path parent_path, root_parent; + struct path new, old, parent_path, root_parent, root; + struct mount *new_mnt, *root_mnt, *old_mnt; + struct mountpoint *old_mp, *root_mp; int error; - if (!capable(CAP_SYS_ADMIN)) + if (!may_mount()) return -EPERM; - lock_kernel(); - - error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, - &new_nd); + error = user_path_dir(new_root, &new); if (error) goto out0; - error = -EINVAL; - if (!check_mnt(new_nd.path.mnt)) - goto out1; - error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd); + error = user_path_dir(put_old, &old); if (error) goto out1; - error = security_sb_pivotroot(&old_nd, &new_nd); - if (error) { - path_put(&old_nd.path); - goto out1; - } + error = security_sb_pivotroot(&old, &new); + if (error) + goto out2; + + get_fs_root(current->fs, &root); + old_mp = lock_mount(&old); + error = PTR_ERR(old_mp); + if (IS_ERR(old_mp)) + goto out3; - read_lock(¤t->fs->lock); - user_nd.path = current->fs->root; - path_get(¤t->fs->root); - read_unlock(¤t->fs->lock); - down_write(&namespace_sem); - mutex_lock(&old_nd.path.dentry->d_inode->i_mutex); error = -EINVAL; - if (IS_MNT_SHARED(old_nd.path.mnt) || - IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) || - IS_MNT_SHARED(user_nd.path.mnt->mnt_parent)) - goto out2; - if (!check_mnt(user_nd.path.mnt)) - goto out2; + new_mnt = real_mount(new.mnt); + root_mnt = real_mount(root.mnt); + old_mnt = real_mount(old.mnt); + if (IS_MNT_SHARED(old_mnt) || + IS_MNT_SHARED(new_mnt->mnt_parent) || + IS_MNT_SHARED(root_mnt->mnt_parent)) + goto out4; + if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) + 
goto out4; + if (new_mnt->mnt.mnt_flags & MNT_LOCKED) + goto out4; error = -ENOENT; - if (IS_DEADDIR(new_nd.path.dentry->d_inode)) - goto out2; - if (d_unhashed(new_nd.path.dentry) && !IS_ROOT(new_nd.path.dentry)) - goto out2; - if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry)) - goto out2; + if (d_unlinked(new.dentry)) + goto out4; error = -EBUSY; - if (new_nd.path.mnt == user_nd.path.mnt || - old_nd.path.mnt == user_nd.path.mnt) - goto out2; /* loop, on the same file system */ + if (new_mnt == root_mnt || old_mnt == root_mnt) + goto out4; /* loop, on the same file system */ error = -EINVAL; - if (user_nd.path.mnt->mnt_root != user_nd.path.dentry) - goto out2; /* not a mountpoint */ - if (user_nd.path.mnt->mnt_parent == user_nd.path.mnt) - goto out2; /* not attached */ - if (new_nd.path.mnt->mnt_root != new_nd.path.dentry) - goto out2; /* not a mountpoint */ - if (new_nd.path.mnt->mnt_parent == new_nd.path.mnt) - goto out2; /* not attached */ + if (root.mnt->mnt_root != root.dentry) + goto out4; /* not a mountpoint */ + if (!mnt_has_parent(root_mnt)) + goto out4; /* not attached */ + root_mp = root_mnt->mnt_mp; + if (new.mnt->mnt_root != new.dentry) + goto out4; /* not a mountpoint */ + if (!mnt_has_parent(new_mnt)) + goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ - tmp = old_nd.path.mnt; - spin_lock(&vfsmount_lock); - if (tmp != new_nd.path.mnt) { - for (;;) { - if (tmp->mnt_parent == tmp) - goto out3; /* already mounted on put_old */ - if (tmp->mnt_parent == new_nd.path.mnt) - break; - tmp = tmp->mnt_parent; - } - if (!is_subdir(tmp->mnt_mountpoint, new_nd.path.dentry)) - goto out3; - } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry)) - goto out3; - detach_mnt(new_nd.path.mnt, &parent_path); - detach_mnt(user_nd.path.mnt, &root_parent); + if (!is_path_reachable(old_mnt, old.dentry, &new)) + goto out4; + root_mp->m_count++; /* pin it so it won't go away */ + lock_mount_hash(); + detach_mnt(new_mnt, &parent_path); + detach_mnt(root_mnt, &root_parent); + if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { + new_mnt->mnt.mnt_flags |= MNT_LOCKED; + root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; + } /* mount old root on put_old */ - attach_mnt(user_nd.path.mnt, &old_nd.path); + attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ - attach_mnt(new_nd.path.mnt, &root_parent); + attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); touch_mnt_namespace(current->nsproxy->mnt_ns); - spin_unlock(&vfsmount_lock); - chroot_fs_refs(&user_nd.path, &new_nd.path); - security_sb_post_pivotroot(&user_nd, &new_nd); + unlock_mount_hash(); + chroot_fs_refs(&root, &new); + put_mountpoint(root_mp); error = 0; - path_put(&root_parent); - path_put(&parent_path); +out4: + unlock_mount(old_mp); + if (!error) { + path_put(&root_parent); + path_put(&parent_path); + } +out3: + path_put(&root); out2: - mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex); - up_write(&namespace_sem); - path_put(&user_nd.path); - path_put(&old_nd.path); + path_put(&old); out1: - path_put(&new_nd.path); + path_put(&new); out0: - unlock_kernel(); return error; -out3: - spin_unlock(&vfsmount_lock); - goto out2; } static void __init init_mount_tree(void) @@ -1782,26 +2806,25 @@ static void __init init_mount_tree(void) struct vfsmount *mnt; struct mnt_namespace *ns; struct path root; + struct file_system_type *type; - mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); + type = get_fs_type("rootfs"); + if (!type) + panic("Can't find rootfs type"); + mnt = vfs_kern_mount(type, 0, 
"rootfs", NULL); + put_filesystem(type); if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = kmalloc(sizeof(*ns), GFP_KERNEL); - if (!ns) + + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) panic("Can't allocate initial namespace"); - atomic_set(&ns->count, 1); - INIT_LIST_HEAD(&ns->list); - init_waitqueue_head(&ns->poll); - ns->event = 0; - list_add(&mnt->mnt_list, &ns->list); - ns->root = mnt; - mnt->mnt_ns = ns; init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); - root.mnt = ns->root; - root.dentry = ns->root->mnt_root; + root.mnt = mnt; + root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); @@ -1812,43 +2835,203 @@ void __init mnt_init(void) unsigned u; int err; - init_rwsem(&namespace_sem); - - mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); - mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); - - if (!mount_hashtable) + mount_hashtable = alloc_large_system_hash("Mount-cache", + sizeof(struct hlist_head), + mhash_entries, 19, + 0, + &m_hash_shift, &m_hash_mask, 0, 0); + mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", + sizeof(struct hlist_head), + mphash_entries, 19, + 0, + &mp_hash_shift, &mp_hash_mask, 0, 0); + + if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - printk("Mount-cache hash table entries: %lu\n", HASH_SIZE); + for (u = 0; u <= m_hash_mask; u++) + INIT_HLIST_HEAD(&mount_hashtable[u]); + for (u = 0; u <= mp_hash_mask; u++) + INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - for (u = 0; u < HASH_SIZE; u++) - INIT_LIST_HEAD(&mount_hashtable[u]); + kernfs_init(); err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", - __FUNCTION__, err); + __func__, err); fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) - printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__); + printk(KERN_WARNING "%s: kobj create error\n", __func__); init_rootfs(); init_mount_tree(); } -void __put_mnt_ns(struct mnt_namespace *ns) +void put_mnt_ns(struct mnt_namespace *ns) { - struct vfsmount *root = ns->root; - LIST_HEAD(umount_list); - ns->root = NULL; - spin_unlock(&vfsmount_lock); - down_write(&namespace_sem); - spin_lock(&vfsmount_lock); - umount_tree(root, 0, &umount_list); - spin_unlock(&vfsmount_lock); - up_write(&namespace_sem); - release_mounts(&umount_list); - kfree(ns); + if (!atomic_dec_and_test(&ns->count)) + return; + drop_collected_mounts(&ns->root->mnt); + free_mnt_ns(ns); +} + +struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +{ + struct vfsmount *mnt; + mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); + if (!IS_ERR(mnt)) { + /* + * it is a longterm mount, don't release mnt until + * we unmount before file sys is unregistered + */ + real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; + } + return mnt; +} +EXPORT_SYMBOL_GPL(kern_mount_data); + +void kern_unmount(struct vfsmount *mnt) +{ + /* release long term mount so mount point can be released */ + if (!IS_ERR_OR_NULL(mnt)) { + real_mount(mnt)->mnt_ns = NULL; + synchronize_rcu(); /* yecchhh... 
*/ + mntput(mnt); + } +} +EXPORT_SYMBOL(kern_unmount); + +bool our_mnt(struct vfsmount *mnt) +{ + return check_mnt(real_mount(mnt)); } + +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + +bool fs_fully_visible(struct file_system_type *type) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + bool visible = false; + + if (unlikely(!ns)) + return false; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + struct mount *child; + if (mnt->mnt.mnt_sb->s_type != type) + continue; + + /* This mount is not fully visible if there are any child mounts + * that cover anything except for empty directories. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + struct inode *inode = child->mnt_mountpoint->d_inode; + if (!S_ISDIR(inode->i_mode)) + goto next; + if (inode->i_nlink > 2) + goto next; + } + visible = true; + goto found; + next: ; + } +found: + up_read(&namespace_sem); + return visible; +} + +static void *mntns_get(struct task_struct *task) +{ + struct mnt_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->mnt_ns; + get_mnt_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void mntns_put(void *ns) +{ + put_mnt_ns(ns); +} + +static int mntns_install(struct nsproxy *nsproxy, void *ns) +{ + struct fs_struct *fs = current->fs; + struct mnt_namespace *mnt_ns = ns; + struct path root; + + if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EPERM; + + if (fs->users != 1) + return -EINVAL; + + get_mnt_ns(mnt_ns); + put_mnt_ns(nsproxy->mnt_ns); + nsproxy->mnt_ns = mnt_ns; + + /* Find the root */ + root.mnt = &mnt_ns->root->mnt; + root.dentry = mnt_ns->root->mnt.mnt_root; + path_get(&root); + while(d_mountpoint(root.dentry) && follow_down_one(&root)) + ; + + /* Update the pwd and root */ + set_fs_pwd(fs, &root); + set_fs_root(fs, &root); + + path_put(&root); + return 0; +} + +static unsigned int mntns_inum(void *ns) +{ + struct mnt_namespace *mnt_ns = ns; + return mnt_ns->proc_inum; +} + +const struct proc_ns_operations mntns_operations = { + .name = "mnt", + .type = CLONE_NEWNS, + .get = mntns_get, + .put = mntns_put, + .install = mntns_install, + .inum = mntns_inum, +}; |

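For the propagation and bind-mount paths changed above (do_change_type() and do_loopback()), the user-space entry point is mount(2) with exactly one of MS_SHARED/MS_PRIVATE/MS_SLAVE/MS_UNBINDABLE, optionally combined with MS_REC, or with MS_BIND for a (recursive) bind mount. A small sketch under assumed example paths, requiring CAP_SYS_ADMIN at runtime:

/* Change propagation on an existing mount, then recursively bind a subtree. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Recursively mark the tree under /mnt private (do_change_type). */
        if (mount(NULL, "/mnt", NULL, MS_PRIVATE | MS_REC, NULL) < 0)
                perror("mount MS_PRIVATE|MS_REC");

        /* Recursive bind mount of one subtree onto another (do_loopback). */
        if (mount("/srv/data", "/mnt/data", NULL, MS_BIND | MS_REC, NULL) < 0)
                perror("mount MS_BIND|MS_REC");

        return 0;
}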