diff options
Diffstat (limited to 'ipc/shm.c')
| -rw-r--r-- | ipc/shm.c | 919 |
1 files changed, 558 insertions, 361 deletions
diff --git a/ipc/shm.c b/ipc/shm.c index cc63fae02f0..89fc354156c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -19,6 +19,9 @@ * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov <xemul@openvz.org> + * + * Better ipc lock (kern_ipc_perm.lock) handling + * Davidlohr Bueso <davidlohr.bueso@hp.com>, June 2013. */ #include <linux/slab.h> @@ -40,7 +43,7 @@ #include <linux/mount.h> #include <linux/ipc_namespace.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "util.h" @@ -54,18 +57,17 @@ struct shm_file_data { #define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) static const struct file_operations shm_file_operations; -static struct vm_operations_struct shm_vm_ops; +static const struct vm_operations_struct shm_vm_ops; #define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) #define shm_unlock(shp) \ ipc_unlock(&(shp)->shm_perm) -#define shm_buildid(id, seq) ipc_buildid(id, seq) static int newseg(struct ipc_namespace *, struct ipc_params *); static void shm_open(struct vm_area_struct *vma); static void shm_close(struct vm_area_struct *vma); -static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp); +static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp); #ifdef CONFIG_PROC_FS static int sysvipc_shm_proc_show(struct seq_file *s, void *it); #endif @@ -75,20 +77,21 @@ void shm_init_ns(struct ipc_namespace *ns) ns->shm_ctlmax = SHMMAX; ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; + ns->shm_rmid_forced = 0; ns->shm_tot = 0; - ipc_init_ids(&ns->ids[IPC_SHM_IDS]); + ipc_init_ids(&shm_ids(ns)); } /* - * Called with shm_ids.rw_mutex (writer) and the shp structure locked. - * Only shm_ids.rw_mutex remains locked on exit. + * Called with shm_ids.rwsem (writer) and the shp structure locked. + * Only shm_ids.rwsem remains locked on exit. */ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); - if (shp->shm_nattch){ + if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; /* Do not find it any more */ shp->shm_perm.key = IPC_PRIVATE; @@ -101,47 +104,52 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) void shm_exit_ns(struct ipc_namespace *ns) { free_ipcs(ns, &shm_ids(ns), do_shm_rmid); + idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); } #endif -void __init shm_init (void) +static int __init ipc_ns_init(void) { shm_init_ns(&init_ipc_ns); + return 0; +} + +pure_initcall(ipc_ns_init); + +void __init shm_init(void) +{ ipc_init_proc_interface("sysvipc/shm", - " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime\n", +#if BITS_PER_LONG <= 32 + " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", +#else + " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n", +#endif IPC_SHM_IDS, sysvipc_shm_proc_show); } -/* - * shm_lock_(check_)down routines are called in the paths where the rw_mutex - * is held to protect access to the idr tree. - */ -static inline struct shmid_kernel *shm_lock_down(struct ipc_namespace *ns, - int id) +static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id) { - struct kern_ipc_perm *ipcp = ipc_lock_down(&shm_ids(ns), id); + struct kern_ipc_perm *ipcp = ipc_obtain_object(&shm_ids(ns), id); if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; + return ERR_CAST(ipcp); return container_of(ipcp, struct shmid_kernel, shm_perm); } -static inline struct shmid_kernel *shm_lock_check_down( - struct ipc_namespace *ns, - int id) +static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id) { - struct kern_ipc_perm *ipcp = ipc_lock_check_down(&shm_ids(ns), id); + struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id); if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; + return ERR_CAST(ipcp); return container_of(ipcp, struct shmid_kernel, shm_perm); } /* - * shm_lock_(check_) routines are called in the paths where the rw_mutex - * is not held. + * shm_lock_(check_) routines are called in the paths where the rwsem + * is not necessarily held. */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) { @@ -153,15 +161,19 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) return container_of(ipcp, struct shmid_kernel, shm_perm); } -static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, - int id) +static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) { - struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); + rcu_read_lock(); + ipc_lock_object(&ipcp->shm_perm); +} - if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; +static void shm_rcu_free(struct rcu_head *head) +{ + struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); + struct shmid_kernel *shp = ipc_rcu_to_struct(p); - return container_of(ipcp, struct shmid_kernel, shm_perm); + security_shm_free(shp); + ipc_rcu_free(head); } static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) @@ -169,12 +181,6 @@ static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) ipc_rmid(&shm_ids(ns), &s->shm_perm); } -static inline int shm_addid(struct ipc_namespace *ns, struct shmid_kernel *shp) -{ - return ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); -} - - /* This is called by fork, once for every shm attach. */ static void shm_open(struct vm_area_struct *vma) @@ -197,22 +203,41 @@ static void shm_open(struct vm_area_struct *vma) * @ns: namespace * @shp: struct to free * - * It has to be called with shp and shm_ids.rw_mutex (writer) locked, + * It has to be called with shp and shm_ids.rwsem (writer) locked, * but returns with shp unlocked and freed. */ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) { + struct file *shm_file; + + shm_file = shp->shm_file; + shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; shm_rmid(ns, shp); shm_unlock(shp); - if (!is_file_hugepages(shp->shm_file)) - shmem_lock(shp->shm_file, 0, shp->mlock_user); - else - user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size, - shp->mlock_user); - fput (shp->shm_file); - security_shm_free(shp); - ipc_rcu_putref(shp); + if (!is_file_hugepages(shm_file)) + shmem_lock(shm_file, 0, shp->mlock_user); + else if (shp->mlock_user) + user_shm_unlock(file_inode(shm_file)->i_size, shp->mlock_user); + fput(shm_file); + ipc_rcu_putref(shp, shm_rcu_free); +} + +/* + * shm_may_destroy - identifies whether shm segment should be destroyed now + * + * Returns true if and only if there are no active users of the segment and + * one of the following is true: + * + * 1) shmctl(id, IPC_RMID, NULL) was called for this shp + * + * 2) sysctl kernel.shm_rmid_forced is set to 1. + */ +static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) +{ + return (shp->shm_nattch == 0) && + (ns->shm_rmid_forced || + (shp->shm_perm.mode & SHM_DEST)); } /* @@ -223,24 +248,100 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) */ static void shm_close(struct vm_area_struct *vma) { - struct file * file = vma->vm_file; + struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); /* remove from the list of attaches of the shm segment */ - shp = shm_lock_down(ns, sfd->id); + shp = shm_lock(ns, sfd->id); BUG_ON(IS_ERR(shp)); shp->shm_lprid = task_tgid_vnr(current); shp->shm_dtim = get_seconds(); shp->shm_nattch--; - if(shp->shm_nattch == 0 && - shp->shm_perm.mode & SHM_DEST) + if (shm_may_destroy(ns, shp)) shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); +} + +/* Called with ns->shm_ids(ns).rwsem locked */ +static int shm_try_destroy_current(int id, void *p, void *data) +{ + struct ipc_namespace *ns = data; + struct kern_ipc_perm *ipcp = p; + struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); + + if (shp->shm_creator != current) + return 0; + + /* + * Mark it as orphaned to destroy the segment when + * kernel.shm_rmid_forced is changed. + * It is noop if the following shm_may_destroy() returns true. + */ + shp->shm_creator = NULL; + + /* + * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID + * is not set, it shouldn't be deleted here. + */ + if (!ns->shm_rmid_forced) + return 0; + + if (shm_may_destroy(ns, shp)) { + shm_lock_by_ptr(shp); + shm_destroy(ns, shp); + } + return 0; +} + +/* Called with ns->shm_ids(ns).rwsem locked */ +static int shm_try_destroy_orphaned(int id, void *p, void *data) +{ + struct ipc_namespace *ns = data; + struct kern_ipc_perm *ipcp = p; + struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); + + /* + * We want to destroy segments without users and with already + * exit'ed originating process. + * + * As shp->* are changed under rwsem, it's safe to skip shp locking. + */ + if (shp->shm_creator != NULL) + return 0; + + if (shm_may_destroy(ns, shp)) { + shm_lock_by_ptr(shp); + shm_destroy(ns, shp); + } + return 0; +} + +void shm_destroy_orphaned(struct ipc_namespace *ns) +{ + down_write(&shm_ids(ns).rwsem); + if (shm_ids(ns).in_use) + idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); + up_write(&shm_ids(ns).rwsem); +} + + +void exit_shm(struct task_struct *task) +{ + struct ipc_namespace *ns = task->nsproxy->ipc_ns; + + if (shm_ids(ns).in_use == 0) + return; + + /* Destroy all already created segments, but not mapped yet */ + down_write(&shm_ids(ns).rwsem); + if (shm_ids(ns).in_use) + idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); + up_write(&shm_ids(ns).rwsem); } static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -271,16 +372,14 @@ static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, if (sfd->vm_ops->get_policy) pol = sfd->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy) { + else if (vma->vm_policy) pol = vma->vm_policy; - mpol_get(pol); /* get_vma_policy() expects this */ - } else - pol = current->mempolicy; + return pol; } #endif -static int shm_mmap(struct file * file, struct vm_area_struct * vma) +static int shm_mmap(struct file *file, struct vm_area_struct *vma) { struct shm_file_data *sfd = shm_file_data(file); int ret; @@ -308,46 +407,60 @@ static int shm_release(struct inode *ino, struct file *file) return 0; } -static int shm_fsync(struct file *file, struct dentry *dentry, int datasync) +static int shm_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - int (*fsync) (struct file *, struct dentry *, int datasync); struct shm_file_data *sfd = shm_file_data(file); - int ret = -EINVAL; - fsync = sfd->file->f_op->fsync; - if (fsync) - ret = fsync(sfd->file, sfd->file->f_path.dentry, datasync); - return ret; + if (!sfd->file->f_op->fsync) + return -EINVAL; + return sfd->file->f_op->fsync(sfd->file, start, end, datasync); } -static unsigned long shm_get_unmapped_area(struct file *file, - unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) +static long shm_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) { struct shm_file_data *sfd = shm_file_data(file); - return get_unmapped_area(sfd->file, addr, len, pgoff, flags); + + if (!sfd->file->f_op->fallocate) + return -EOPNOTSUPP; + return sfd->file->f_op->fallocate(file, mode, offset, len); } -int is_file_shm_hugepages(struct file *file) +static unsigned long shm_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) { - int ret = 0; - - if (file->f_op == &shm_file_operations) { - struct shm_file_data *sfd; - sfd = shm_file_data(file); - ret = is_file_hugepages(sfd->file); - } - return ret; + struct shm_file_data *sfd = shm_file_data(file); + return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, + pgoff, flags); } static const struct file_operations shm_file_operations = { .mmap = shm_mmap, .fsync = shm_fsync, .release = shm_release, +#ifndef CONFIG_MMU .get_unmapped_area = shm_get_unmapped_area, +#endif + .llseek = noop_llseek, + .fallocate = shm_fallocate, }; -static struct vm_operations_struct shm_vm_ops = { +static const struct file_operations shm_file_operations_huge = { + .mmap = shm_mmap, + .fsync = shm_fsync, + .release = shm_release, + .get_unmapped_area = shm_get_unmapped_area, + .llseek = noop_llseek, + .fallocate = shm_fallocate, +}; + +int is_file_shm_hugepages(struct file *file) +{ + return file->f_op == &shm_file_operations_huge; +} + +static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ .fault = shm_fault, @@ -362,9 +475,8 @@ static struct vm_operations_struct shm_vm_ops = { * @ns: namespace * @params: ptr to the structure that contains key, size and shmflg * - * Called with shm_ids.rw_mutex held as a writer. + * Called with shm_ids.rwsem held as a writer. */ - static int newseg(struct ipc_namespace *ns, struct ipc_params *params) { key_t key = params->key; @@ -372,15 +484,20 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) size_t size = params->u.size; int error; struct shmid_kernel *shp; - int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; - struct file * file; + size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + struct file *file; char name[13]; int id; + vm_flags_t acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; - if (ns->shm_tot + numpages > ns->shm_ctlall) + if (numpages << PAGE_SHIFT < size) + return -ENOSPC; + + if (ns->shm_tot + numpages < ns->shm_tot || + ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; shp = ipc_rcu_alloc(sizeof(*shp)); @@ -394,31 +511,43 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_perm.security = NULL; error = security_shm_alloc(shp); if (error) { - ipc_rcu_putref(shp); + ipc_rcu_putref(shp, ipc_rcu_free); return error; } - sprintf (name, "SYSV%08x", key); + sprintf(name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { - /* hugetlb_file_setup takes care of mlock user accounting */ - file = hugetlb_file_setup(name, size); - shp->mlock_user = current->user; + struct hstate *hs; + size_t hugesize; + + hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); + if (!hs) { + error = -EINVAL; + goto no_file; + } + hugesize = ALIGN(size, huge_page_size(hs)); + + /* hugetlb_file_setup applies strict accounting */ + if (shmflg & SHM_NORESERVE) + acctflag = VM_NORESERVE; + file = hugetlb_file_setup(name, hugesize, acctflag, + &shp->mlock_user, HUGETLB_SHMFS_INODE, + (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { - int acctflag = VM_ACCOUNT; /* * Do not allow no accounting for OVERCOMMIT_NEVER, even - * if it's asked for. + * if it's asked for. */ if ((shmflg & SHM_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) - acctflag = 0; + acctflag = VM_NORESERVE; file = shmem_file_setup(name, size, acctflag); } error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; - id = shm_addid(ns, shp); + id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (id < 0) { error = id; goto no_id; @@ -430,29 +559,33 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_ctim = get_seconds(); shp->shm_segsz = size; shp->shm_nattch = 0; - shp->shm_perm.id = shm_buildid(id, shp->shm_perm.seq); shp->shm_file = file; + shp->shm_creator = current; + /* * shmid gets reported as "inode#" in /proc/pid/maps. * proc-ps tools use this. Changing this will break them. */ - file->f_dentry->d_inode->i_ino = shp->shm_perm.id; + file_inode(file)->i_ino = shp->shm_perm.id; ns->shm_tot += numpages; error = shp->shm_perm.id; - shm_unlock(shp); + + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); return error; no_id: + if (is_file_hugepages(file) && shp->mlock_user) + user_shm_unlock(size, shp->mlock_user); fput(file); no_file: - security_shm_free(shp); - ipc_rcu_putref(shp); + ipc_rcu_putref(shp, shm_rcu_free); return error; } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) { @@ -463,7 +596,7 @@ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -477,18 +610,18 @@ static inline int shm_more_checks(struct kern_ipc_perm *ipcp, return 0; } -asmlinkage long sys_shmget (key_t key, size_t size, int shmflg) +SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg) { struct ipc_namespace *ns; - struct ipc_ops shm_ops; + static const struct ipc_ops shm_ops = { + .getnew = newseg, + .associate = shm_security, + .more_checks = shm_more_checks, + }; struct ipc_params shm_params; ns = current->nsproxy->ipc_ns; - shm_ops.getnew = newseg; - shm_ops.associate = shm_security; - shm_ops.more_checks = shm_more_checks; - shm_params.key = key; shm_params.flg = shmflg; shm_params.u.size = size; @@ -498,13 +631,14 @@ asmlinkage long sys_shmget (key_t key, size_t size, int shmflg) static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ds *in, int version) { - switch(version) { + switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct shmid_ds out; + memset(&out, 0, sizeof(out)); ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm); out.shm_segsz = in->shm_segsz; out.shm_atime = in->shm_atime; @@ -521,28 +655,14 @@ static inline unsigned long copy_shmid_to_user(void __user *buf, struct shmid64_ } } -struct shm_setbuf { - uid_t uid; - gid_t gid; - mode_t mode; -}; - -static inline unsigned long copy_shmid_from_user(struct shm_setbuf *out, void __user *buf, int version) +static inline unsigned long +copy_shmid_from_user(struct shmid64_ds *out, void __user *buf, int version) { - switch(version) { + switch (version) { case IPC_64: - { - struct shmid64_ds tbuf; - - if (copy_from_user(&tbuf, buf, sizeof(tbuf))) + if (copy_from_user(out, buf, sizeof(*out))) return -EFAULT; - - out->uid = tbuf.shm_perm.uid; - out->gid = tbuf.shm_perm.gid; - out->mode = tbuf.shm_perm.mode; - return 0; - } case IPC_OLD: { struct shmid_ds tbuf_old; @@ -550,9 +670,9 @@ static inline unsigned long copy_shmid_from_user(struct shm_setbuf *out, void __ if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old))) return -EFAULT; - out->uid = tbuf_old.shm_perm.uid; - out->gid = tbuf_old.shm_perm.gid; - out->mode = tbuf_old.shm_perm.mode; + out->shm_perm.uid = tbuf_old.shm_perm.uid; + out->shm_perm.gid = tbuf_old.shm_perm.gid; + out->shm_perm.mode = tbuf_old.shm_perm.mode; return 0; } @@ -563,14 +683,14 @@ static inline unsigned long copy_shmid_from_user(struct shm_setbuf *out, void __ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminfo64 *in, int version) { - switch(version) { + switch (version) { case IPC_64: return copy_to_user(buf, in, sizeof(*in)); case IPC_OLD: { struct shminfo out; - if(in->shmmax > INT_MAX) + if (in->shmmax > INT_MAX) out.shmmax = INT_MAX; else out.shmmax = (int)in->shmmax; @@ -578,7 +698,7 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf out.shmmin = in->shmmin; out.shmmni = in->shmmni; out.shmseg = in->shmseg; - out.shmall = in->shmall; + out.shmall = in->shmall; return copy_to_user(buf, &out, sizeof(out)); } @@ -588,7 +708,35 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf } /* - * Called with shm_ids.rw_mutex held as a reader + * Calculate and add used RSS and swap pages of a shm. + * Called with shm_ids.rwsem held as a reader + */ +static void shm_add_rss_swap(struct shmid_kernel *shp, + unsigned long *rss_add, unsigned long *swp_add) +{ + struct inode *inode; + + inode = file_inode(shp->shm_file); + + if (is_file_hugepages(shp->shm_file)) { + struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_file(shp->shm_file); + *rss_add += pages_per_huge_page(h) * mapping->nrpages; + } else { +#ifdef CONFIG_SHMEM + struct shmem_inode_info *info = SHMEM_I(inode); + spin_lock(&info->lock); + *rss_add += inode->i_mapping->nrpages; + *swp_add += info->swapped; + spin_unlock(&info->lock); +#else + *rss_add += inode->i_mapping->nrpages; +#endif + } +} + +/* + * Called with shm_ids.rwsem held as a reader */ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) @@ -602,68 +750,113 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, in_use = shm_ids(ns).in_use; for (total = 0, next_id = 0; total < in_use; next_id++) { + struct kern_ipc_perm *ipc; struct shmid_kernel *shp; - struct inode *inode; - shp = idr_find(&shm_ids(ns).ipcs_idr, next_id); - if (shp == NULL) + ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id); + if (ipc == NULL) continue; + shp = container_of(ipc, struct shmid_kernel, shm_perm); - inode = shp->shm_file->f_path.dentry->d_inode; - - if (is_file_hugepages(shp->shm_file)) { - struct address_space *mapping = inode->i_mapping; - *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; - } else { - struct shmem_inode_info *info = SHMEM_I(inode); - spin_lock(&info->lock); - *rss += inode->i_mapping->nrpages; - *swp += info->swapped; - spin_unlock(&info->lock); - } + shm_add_rss_swap(shp, rss, swp); total++; } } -asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) +/* + * This function handles some shmctl commands which require the rwsem + * to be held in write mode. + * NOTE: no locks must be held, the rwsem is taken inside this function. + */ +static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, + struct shmid_ds __user *buf, int version) { - struct shm_setbuf setbuf; + struct kern_ipc_perm *ipcp; + struct shmid64_ds shmid64; struct shmid_kernel *shp; - int err, version; - struct ipc_namespace *ns; + int err; - if (cmd < 0 || shmid < 0) { + if (cmd == IPC_SET) { + if (copy_shmid_from_user(&shmid64, buf, version)) + return -EFAULT; + } + + down_write(&shm_ids(ns).rwsem); + rcu_read_lock(); + + ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd, + &shmid64.shm_perm, 0); + if (IS_ERR(ipcp)) { + err = PTR_ERR(ipcp); + goto out_unlock1; + } + + shp = container_of(ipcp, struct shmid_kernel, shm_perm); + + err = security_shm_shmctl(shp, cmd); + if (err) + goto out_unlock1; + + switch (cmd) { + case IPC_RMID: + ipc_lock_object(&shp->shm_perm); + /* do_shm_rmid unlocks the ipc object and rcu */ + do_shm_rmid(ns, ipcp); + goto out_up; + case IPC_SET: + ipc_lock_object(&shp->shm_perm); + err = ipc_update_perm(&shmid64.shm_perm, ipcp); + if (err) + goto out_unlock0; + shp->shm_ctim = get_seconds(); + break; + default: err = -EINVAL; - goto out; + goto out_unlock1; } - version = ipc_parse_version(&cmd); - ns = current->nsproxy->ipc_ns; +out_unlock0: + ipc_unlock_object(&shp->shm_perm); +out_unlock1: + rcu_read_unlock(); +out_up: + up_write(&shm_ids(ns).rwsem); + return err; +} - switch (cmd) { /* replace with proc interface ? */ - case IPC_INFO: - { - struct shminfo64 shminfo; +static int shmctl_nolock(struct ipc_namespace *ns, int shmid, + int cmd, int version, void __user *buf) +{ + int err; + struct shmid_kernel *shp; + /* preliminary security checks for *_INFO */ + if (cmd == IPC_INFO || cmd == SHM_INFO) { err = security_shm_shmctl(NULL, cmd); if (err) return err; + } - memset(&shminfo,0,sizeof(shminfo)); + switch (cmd) { + case IPC_INFO: + { + struct shminfo64 shminfo; + + memset(&shminfo, 0, sizeof(shminfo)); shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; shminfo.shmmax = ns->shm_ctlmax; shminfo.shmall = ns->shm_ctlall; shminfo.shmmin = SHMMIN; - if(copy_shminfo_to_user (buf, &shminfo, version)) + if (copy_shminfo_to_user(buf, &shminfo, version)) return -EFAULT; - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); - if(err<0) + if (err < 0) err = 0; goto out; } @@ -671,20 +864,16 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) { struct shm_info shm_info; - err = security_shm_shmctl(NULL, cmd); - if (err) - return err; - - memset(&shm_info,0,sizeof(shm_info)); - down_read(&shm_ids(ns).rw_mutex); + memset(&shm_info, 0, sizeof(shm_info)); + down_read(&shm_ids(ns).rwsem); shm_info.used_ids = shm_ids(ns).in_use; - shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); + shm_get_stat(ns, &shm_info.shm_rss, &shm_info.shm_swp); shm_info.shm_tot = ns->shm_tot; shm_info.swap_attempts = 0; shm_info.swap_successes = 0; err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); - if(copy_to_user (buf, &shm_info, sizeof(shm_info))) { + up_read(&shm_ids(ns).rwsem); + if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { err = -EFAULT; goto out; } @@ -698,32 +887,31 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) struct shmid64_ds tbuf; int result; - if (!buf) { - err = -EFAULT; - goto out; - } - + rcu_read_lock(); if (cmd == SHM_STAT) { - shp = shm_lock(ns, shmid); + shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = shp->shm_perm.id; } else { - shp = shm_lock_check(ns, shmid); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = 0; } - err=-EACCES; - if (ipcperms (&shp->shm_perm, S_IRUGO)) + + err = -EACCES; + if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) goto out_unlock; + err = security_shm_shmctl(shp, cmd); if (err) goto out_unlock; + memset(&tbuf, 0, sizeof(tbuf)); kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); tbuf.shm_segsz = shp->shm_segsz; @@ -733,152 +921,119 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) tbuf.shm_cpid = shp->shm_cprid; tbuf.shm_lpid = shp->shm_lprid; tbuf.shm_nattch = shp->shm_nattch; - shm_unlock(shp); - if(copy_shmid_to_user (buf, &tbuf, version)) + rcu_read_unlock(); + + if (copy_shmid_to_user(buf, &tbuf, version)) err = -EFAULT; else err = result; goto out; } - case SHM_LOCK: - case SHM_UNLOCK: - { - shp = shm_lock_check(ns, shmid); - if (IS_ERR(shp)) { - err = PTR_ERR(shp); - goto out; - } + default: + return -EINVAL; + } - err = audit_ipc_obj(&(shp->shm_perm)); - if (err) - goto out_unlock; +out_unlock: + rcu_read_unlock(); +out: + return err; +} - if (!capable(CAP_IPC_LOCK)) { - err = -EPERM; - if (current->euid != shp->shm_perm.uid && - current->euid != shp->shm_perm.cuid) - goto out_unlock; - if (cmd == SHM_LOCK && - !current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) - goto out_unlock; - } +SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +{ + struct shmid_kernel *shp; + int err, version; + struct ipc_namespace *ns; - err = security_shm_shmctl(shp, cmd); - if (err) - goto out_unlock; - - if(cmd==SHM_LOCK) { - struct user_struct * user = current->user; - if (!is_file_hugepages(shp->shm_file)) { - err = shmem_lock(shp->shm_file, 1, user); - if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){ - shp->shm_perm.mode |= SHM_LOCKED; - shp->mlock_user = user; - } - } - } else if (!is_file_hugepages(shp->shm_file)) { - shmem_lock(shp->shm_file, 0, shp->mlock_user); - shp->shm_perm.mode &= ~SHM_LOCKED; - shp->mlock_user = NULL; - } - shm_unlock(shp); - goto out; - } + if (cmd < 0 || shmid < 0) + return -EINVAL; + + version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; + + switch (cmd) { + case IPC_INFO: + case SHM_INFO: + case SHM_STAT: + case IPC_STAT: + return shmctl_nolock(ns, shmid, cmd, version, buf); case IPC_RMID: + case IPC_SET: + return shmctl_down(ns, shmid, cmd, buf, version); + case SHM_LOCK: + case SHM_UNLOCK: { - /* - * We cannot simply remove the file. The SVID states - * that the block remains until the last person - * detaches from it, then is deleted. A shmat() on - * an RMID segment is legal in older Linux and if - * we change it apps break... - * - * Instead we set a destroyed flag, and then blow - * the name away when the usage hits zero. - */ - down_write(&shm_ids(ns).rw_mutex); - shp = shm_lock_check_down(ns, shmid); + struct file *shm_file; + + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out_up; - } - - err = audit_ipc_obj(&(shp->shm_perm)); - if (err) - goto out_unlock_up; - - if (current->euid != shp->shm_perm.uid && - current->euid != shp->shm_perm.cuid && - !capable(CAP_SYS_ADMIN)) { - err=-EPERM; - goto out_unlock_up; + goto out_unlock1; } + audit_ipc_obj(&(shp->shm_perm)); err = security_shm_shmctl(shp, cmd); if (err) - goto out_unlock_up; + goto out_unlock1; - do_shm_rmid(ns, &shp->shm_perm); - up_write(&shm_ids(ns).rw_mutex); - goto out; - } + ipc_lock_object(&shp->shm_perm); - case IPC_SET: - { - if (!buf) { - err = -EFAULT; - goto out; + /* check if shm_destroy() is tearing down shp */ + if (!ipc_valid_object(&shp->shm_perm)) { + err = -EIDRM; + goto out_unlock0; } - if (copy_shmid_from_user (&setbuf, buf, version)) { - err = -EFAULT; - goto out; - } - down_write(&shm_ids(ns).rw_mutex); - shp = shm_lock_check_down(ns, shmid); - if (IS_ERR(shp)) { - err = PTR_ERR(shp); - goto out_up; + if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { + kuid_t euid = current_euid(); + if (!uid_eq(euid, shp->shm_perm.uid) && + !uid_eq(euid, shp->shm_perm.cuid)) { + err = -EPERM; + goto out_unlock0; + } + if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) { + err = -EPERM; + goto out_unlock0; + } } - err = audit_ipc_obj(&(shp->shm_perm)); - if (err) - goto out_unlock_up; - err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode); - if (err) - goto out_unlock_up; - err=-EPERM; - if (current->euid != shp->shm_perm.uid && - current->euid != shp->shm_perm.cuid && - !capable(CAP_SYS_ADMIN)) { - goto out_unlock_up; + + shm_file = shp->shm_file; + if (is_file_hugepages(shm_file)) + goto out_unlock0; + + if (cmd == SHM_LOCK) { + struct user_struct *user = current_user(); + err = shmem_lock(shm_file, 1, user); + if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { + shp->shm_perm.mode |= SHM_LOCKED; + shp->mlock_user = user; + } + goto out_unlock0; } - err = security_shm_shmctl(shp, cmd); - if (err) - goto out_unlock_up; - - shp->shm_perm.uid = setbuf.uid; - shp->shm_perm.gid = setbuf.gid; - shp->shm_perm.mode = (shp->shm_perm.mode & ~S_IRWXUGO) - | (setbuf.mode & S_IRWXUGO); - shp->shm_ctim = get_seconds(); - break; + /* SHM_UNLOCK */ + if (!(shp->shm_perm.mode & SHM_LOCKED)) + goto out_unlock0; + shmem_lock(shm_file, 0, shp->mlock_user); + shp->shm_perm.mode &= ~SHM_LOCKED; + shp->mlock_user = NULL; + get_file(shm_file); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); + shmem_unlock_mapping(shm_file->f_mapping); + + fput(shm_file); + return err; } - default: - err = -EINVAL; - goto out; + return -EINVAL; } - err = 0; -out_unlock_up: - shm_unlock(shp); -out_up: - up_write(&shm_ids(ns).rw_mutex); - goto out; -out_unlock: - shm_unlock(shp); -out: +out_unlock0: + ipc_unlock_object(&shp->shm_perm); +out_unlock1: + rcu_read_unlock(); return err; } @@ -889,29 +1044,30 @@ out: * "raddr" thing points to kernel space, and there has to be a wrapper around * this. */ -long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) +long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, + unsigned long shmlba) { struct shmid_kernel *shp; unsigned long addr; unsigned long size; - struct file * file; + struct file *file; int err; unsigned long flags; unsigned long prot; int acc_mode; - unsigned long user_addr; struct ipc_namespace *ns; struct shm_file_data *sfd; struct path path; - mode_t f_mode; + fmode_t f_mode; + unsigned long populate = 0; err = -EINVAL; if (shmid < 0) goto out; else if ((addr = (ulong)shmaddr)) { - if (addr & (SHMLBA-1)) { + if (addr & (shmlba - 1)) { if (shmflg & SHM_RND) - addr &= ~(SHMLBA-1); /* round down */ + addr &= ~(shmlba - 1); /* round down */ else #ifndef __ARCH_FORCE_SHMLBA if (addr & ~PAGE_MASK) @@ -945,36 +1101,54 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) * additional creator id... */ ns = current->nsproxy->ipc_ns; - shp = shm_lock_check(ns, shmid); + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } err = -EACCES; - if (ipcperms(&shp->shm_perm, acc_mode)) + if (ipcperms(ns, &shp->shm_perm, acc_mode)) goto out_unlock; err = security_shm_shmat(shp, shmaddr, shmflg); if (err) goto out_unlock; - path.dentry = dget(shp->shm_file->f_path.dentry); - path.mnt = shp->shm_file->f_path.mnt; + ipc_lock_object(&shp->shm_perm); + + /* check if shm_destroy() is tearing down shp */ + if (!ipc_valid_object(&shp->shm_perm)) { + ipc_unlock_object(&shp->shm_perm); + err = -EIDRM; + goto out_unlock; + } + + path = shp->shm_file->f_path; + path_get(&path); shp->shm_nattch++; size = i_size_read(path.dentry->d_inode); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); - if (!sfd) - goto out_put_dentry; - - err = -ENOMEM; + if (!sfd) { + path_put(&path); + goto out_nattch; + } - file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations); - if (!file) - goto out_free; + file = alloc_file(&path, f_mode, + is_file_hugepages(shp->shm_file) ? + &shm_file_operations_huge : + &shm_file_operations); + err = PTR_ERR(file); + if (IS_ERR(file)) { + kfree(sfd); + path_put(&path); + goto out_nattch; + } file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; @@ -983,9 +1157,16 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) sfd->file = shp->shm_file; sfd->vm_ops = NULL; + err = security_mmap_file(file, prot, flags); + if (err) + goto out_fput; + down_write(¤t->mm->mmap_sem); if (addr && !(shmflg & SHM_REMAP)) { err = -EINVAL; + if (addr + size < addr) + goto invalid; + if (find_vma_intersection(current->mm, addr, addr + size)) goto invalid; /* @@ -996,49 +1177,44 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) addr > current->mm->start_stack - size - PAGE_SIZE * 5) goto invalid; } - - user_addr = do_mmap (file, addr, size, prot, flags, 0); - *raddr = user_addr; + + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); + *raddr = addr; err = 0; - if (IS_ERR_VALUE(user_addr)) - err = (long)user_addr; + if (IS_ERR_VALUE(addr)) + err = (long)addr; invalid: up_write(¤t->mm->mmap_sem); + if (populate) + mm_populate(addr, populate); +out_fput: fput(file); out_nattch: - down_write(&shm_ids(ns).rw_mutex); - shp = shm_lock_down(ns, shmid); + down_write(&shm_ids(ns).rwsem); + shp = shm_lock(ns, shmid); BUG_ON(IS_ERR(shp)); shp->shm_nattch--; - if(shp->shm_nattch == 0 && - shp->shm_perm.mode & SHM_DEST) + if (shm_may_destroy(ns, shp)) shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); - -out: + up_write(&shm_ids(ns).rwsem); return err; out_unlock: - shm_unlock(shp); - goto out; - -out_free: - kfree(sfd); -out_put_dentry: - dput(path.dentry); - goto out_nattch; + rcu_read_unlock(); +out: + return err; } -asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg) +SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) { unsigned long ret; long err; - err = do_shmat(shmid, shmaddr, shmflg, &ret); + err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA); if (err) return err; force_successful_syscall_return(); @@ -1049,13 +1225,16 @@ asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg) * detach and kill segment if marked destroyed. * The work is done in shm_close. */ -asmlinkage long sys_shmdt(char __user *shmaddr) +SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *next; + struct vm_area_struct *vma; unsigned long addr = (unsigned long)shmaddr; - loff_t size = 0; int retval = -EINVAL; +#ifdef CONFIG_MMU + loff_t size = 0; + struct vm_area_struct *next; +#endif if (addr & ~PAGE_MASK) return retval; @@ -1084,6 +1263,7 @@ asmlinkage long sys_shmdt(char __user *shmaddr) */ vma = find_vma(mm, addr); +#ifdef CONFIG_MMU while (vma) { next = vma->vm_next; @@ -1096,7 +1276,7 @@ asmlinkage long sys_shmdt(char __user *shmaddr) (vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) { - size = vma->vm_file->f_path.dentry->d_inode->i_size; + size = file_inode(vma->vm_file)->i_size; do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); /* * We discovered the size of the shm segment, so @@ -1114,7 +1294,7 @@ asmlinkage long sys_shmdt(char __user *shmaddr) /* * We need look no further than the maximum address a fragment * could possibly have landed at. Also cast things to loff_t to - * prevent overflows and make comparisions vs. equal-width types. + * prevent overflows and make comparisons vs. equal-width types. */ size = PAGE_ALIGN(size); while (vma && (loff_t)(vma->vm_end - addr) <= size) { @@ -1128,6 +1308,16 @@ asmlinkage long sys_shmdt(char __user *shmaddr) vma = next; } +#else /* CONFIG_MMU */ + /* under NOMMU conditions, the exact address to be destroyed must be + * given */ + if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { + do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); + retval = 0; + } + +#endif + up_write(&mm->mmap_sem); return retval; } @@ -1135,17 +1325,22 @@ asmlinkage long sys_shmdt(char __user *shmaddr) #ifdef CONFIG_PROC_FS static int sysvipc_shm_proc_show(struct seq_file *s, void *it) { + struct user_namespace *user_ns = seq_user_ns(s); struct shmid_kernel *shp = it; - char *format; + unsigned long rss = 0, swp = 0; -#define SMALL_STRING "%10d %10d %4o %10u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu\n" -#define BIG_STRING "%10d %10d %4o %21u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu\n" + shm_add_rss_swap(shp, &rss, &swp); - if (sizeof(size_t) <= sizeof(int)) - format = SMALL_STRING; - else - format = BIG_STRING; - return seq_printf(s, format, +#if BITS_PER_LONG <= 32 +#define SIZE_SPEC "%10lu" +#else +#define SIZE_SPEC "%21lu" +#endif + + return seq_printf(s, + "%10d %10d %4o " SIZE_SPEC " %5u %5u " + "%5lu %5u %5u %5u %5u %10lu %10lu %10lu " + SIZE_SPEC " " SIZE_SPEC "\n", shp->shm_perm.key, shp->shm_perm.id, shp->shm_perm.mode, @@ -1153,12 +1348,14 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it) shp->shm_cprid, shp->shm_lprid, shp->shm_nattch, - shp->shm_perm.uid, - shp->shm_perm.gid, - shp->shm_perm.cuid, - shp->shm_perm.cgid, + from_kuid_munged(user_ns, shp->shm_perm.uid), + from_kgid_munged(user_ns, shp->shm_perm.gid), + from_kuid_munged(user_ns, shp->shm_perm.cuid), + from_kgid_munged(user_ns, shp->shm_perm.cgid), shp->shm_atim, shp->shm_dtim, - shp->shm_ctim); + shp->shm_ctim, + rss * PAGE_SIZE, + swp * PAGE_SIZE); } #endif |
