Diffstat (limited to 'ipc/msg.c')
-rw-r--r--	ipc/msg.c	696
1 file changed, 415 insertions(+), 281 deletions(-)
diff --git a/ipc/msg.c b/ipc/msg.c
index 7385de25788..c5d8e374998 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -39,12 +39,10 @@
 #include <linux/ipc_namespace.h>
 
 #include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include "util.h"
 
-/*
- * one msg_receiver structure for each sleeping receiver:
- */
+/* one msg_receiver structure for each sleeping receiver */
 struct msg_receiver {
	struct list_head	r_list;
	struct task_struct	*r_tsk;
@@ -53,6 +51,12 @@ struct msg_receiver {
	long			r_msgtype;
	long			r_maxsize;
 
+	/*
+	 * Mark r_msg volatile so that the compiler
+	 * does not try to get smart and optimize
+	 * it. We rely on this for the lockless
+	 * receive algorithm.
+	 */
	struct msg_msg		*volatile r_msg;
 };
 
@@ -66,101 +70,27 @@ struct msg_sender {
 #define SEARCH_EQUAL		2
 #define SEARCH_NOTEQUAL		3
 #define SEARCH_LESSEQUAL	4
+#define SEARCH_NUMBER		5
 
 #define msg_ids(ns)	((ns)->ids[IPC_MSG_IDS])
 
-#define msg_unlock(msq)	ipc_unlock(&(msq)->q_perm)
-
-static void freeque(struct ipc_namespace *, struct kern_ipc_perm *);
-static int newque(struct ipc_namespace *, struct ipc_params *);
-#ifdef CONFIG_PROC_FS
-static int sysvipc_msg_proc_show(struct seq_file *s, void *it);
-#endif
-
-/*
- * Scale msgmni with the available lowmem size: the memory dedicated to msg
- * queues should occupy at most 1/MSG_MEM_SCALE of lowmem.
- * Also take into account the number of nsproxies created so far.
- * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range.
- */
-void recompute_msgmni(struct ipc_namespace *ns)
-{
-	struct sysinfo i;
-	unsigned long allowed;
-	int nb_ns;
-
-	si_meminfo(&i);
-	allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit)
-		/ MSGMNB;
-	nb_ns = atomic_read(&nr_ipc_ns);
-	allowed /= nb_ns;
-
-	if (allowed < MSGMNI) {
-		ns->msg_ctlmni = MSGMNI;
-		return;
-	}
-
-	if (allowed > IPCMNI / nb_ns) {
-		ns->msg_ctlmni = IPCMNI / nb_ns;
-		return;
-	}
-
-	ns->msg_ctlmni = allowed;
-}
-
-void msg_init_ns(struct ipc_namespace *ns)
-{
-	ns->msg_ctlmax = MSGMAX;
-	ns->msg_ctlmnb = MSGMNB;
-
-	recompute_msgmni(ns);
-
-	atomic_set(&ns->msg_bytes, 0);
-	atomic_set(&ns->msg_hdrs, 0);
-	ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
-}
-
-#ifdef CONFIG_IPC_NS
-void msg_exit_ns(struct ipc_namespace *ns)
+static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id)
 {
-	free_ipcs(ns, &msg_ids(ns), freeque);
-	idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
-}
-#endif
-
-void __init msg_init(void)
-{
-	msg_init_ns(&init_ipc_ns);
-
-	printk(KERN_INFO "msgmni has been set to %d\n",
-		init_ipc_ns.msg_ctlmni);
-
-	ipc_init_proc_interface("sysvipc/msg",
-				"       key      msqid perms      cbytes       qnum lspid lrpid   uid   gid  cuid  cgid      stime      rtime      ctime\n",
-				IPC_MSG_IDS, sysvipc_msg_proc_show);
-}
-
-/*
- * msg_lock_(check_) routines are called in the paths where the rw_mutex
- * is not held.
- */
-static inline struct msg_queue *msg_lock(struct ipc_namespace *ns, int id)
-{
-	struct kern_ipc_perm *ipcp = ipc_lock(&msg_ids(ns), id);
+	struct kern_ipc_perm *ipcp = ipc_obtain_object(&msg_ids(ns), id);
 
	if (IS_ERR(ipcp))
-		return (struct msg_queue *)ipcp;
+		return ERR_CAST(ipcp);
 
	return container_of(ipcp, struct msg_queue, q_perm);
 }
 
-static inline struct msg_queue *msg_lock_check(struct ipc_namespace *ns,
-						int id)
+static inline struct msg_queue *msq_obtain_object_check(struct ipc_namespace *ns,
+						int id)
 {
-	struct kern_ipc_perm *ipcp = ipc_lock_check(&msg_ids(ns), id);
+	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&msg_ids(ns), id);
 
	if (IS_ERR(ipcp))
-		return (struct msg_queue *)ipcp;
+		return ERR_CAST(ipcp);
 
	return container_of(ipcp, struct msg_queue, q_perm);
 }
@@ -170,12 +100,21 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s)
	ipc_rmid(&msg_ids(ns), &s->q_perm);
 }
 
+static void msg_rcu_free(struct rcu_head *head)
+{
+	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu);
+	struct msg_queue *msq = ipc_rcu_to_struct(p);
+
+	security_msg_queue_free(msq);
+	ipc_rcu_free(head);
+}
+
 /**
  * newque - Create a new msg queue
  * @ns: namespace
  * @params: ptr to the structure that contains the key and msgflg
  *
- * Called with msg_ids.rw_mutex held (writer)
+ * Called with msg_ids.rwsem held (writer)
  */
 static int newque(struct ipc_namespace *ns, struct ipc_params *params)
 {
@@ -194,17 +133,14 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
	msq->q_perm.security = NULL;
	retval = security_msg_queue_alloc(msq);
	if (retval) {
-		ipc_rcu_putref(msq);
+		ipc_rcu_putref(msq, ipc_rcu_free);
		return retval;
	}
 
-	/*
-	 * ipc_addid() locks msq
-	 */
+	/* ipc_addid() locks msq upon success. */
	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);
	if (id < 0) {
-		security_msg_queue_free(msq);
-		ipc_rcu_putref(msq);
+		ipc_rcu_putref(msq, msg_rcu_free);
		return id;
	}
@@ -217,7 +153,8 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)
	INIT_LIST_HEAD(&msq->q_receivers);
	INIT_LIST_HEAD(&msq->q_senders);
 
-	msg_unlock(msq);
+	ipc_unlock_object(&msq->q_perm);
+	rcu_read_unlock();
 
	return msq->q_perm.id;
 }
@@ -225,7 +162,7 @@
 static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss)
 {
	mss->tsk = current;
-	current->state = TASK_INTERRUPTIBLE;
+	__set_current_state(TASK_INTERRUPTIBLE);
	list_add_tail(&mss->list, &msq->q_senders);
 }
@@ -237,14 +174,9 @@ static inline void ss_del(struct msg_sender *mss)
 
 static void ss_wakeup(struct list_head *h, int kill)
 {
-	struct list_head *tmp;
-
-	tmp = h->next;
-	while (tmp != h) {
-		struct msg_sender *mss;
+	struct msg_sender *mss, *t;
 
-		mss = list_entry(tmp, struct msg_sender, list);
-		tmp = tmp->next;
+	list_for_each_entry_safe(mss, t, h, list) {
		if (kill)
			mss->list.next = NULL;
		wake_up_process(mss->tsk);
@@ -253,16 +185,17 @@
 
 static void expunge_all(struct msg_queue *msq, int res)
 {
-	struct list_head *tmp;
+	struct msg_receiver *msr, *t;
 
-	tmp = msq->q_receivers.next;
-	while (tmp != &msq->q_receivers) {
-		struct msg_receiver *msr;
-
-		msr = list_entry(tmp, struct msg_receiver, r_list);
-		tmp = tmp->next;
-		msr->r_msg = NULL;
+	list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
+		msr->r_msg = NULL; /* initialize expunge ordering */
		wake_up_process(msr->r_tsk);
+		/*
+		 * Ensure that the wakeup is visible before setting r_msg as
+		 * the receiving end depends on it: either spinning on a nil,
+		 * or dealing with -EAGAIN cases. See lockless receive part 1
+		 * and 2 in do_msgrcv().
+		 */
		smp_mb();
		msr->r_msg = ERR_PTR(res);
	}
@@ -273,34 +206,30 @@
  * removes the message queue from message queue ID IDR, and cleans up all the
  * messages associated with this queue.
  *
- * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held
- * before freeque() is called. msg_ids.rw_mutex remains locked on exit.
+ * msg_ids.rwsem (writer) and the spinlock for this message queue are held
+ * before freeque() is called. msg_ids.rwsem remains locked on exit.
  */
 static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 {
-	struct list_head *tmp;
+	struct msg_msg *msg, *t;
	struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
 
	expunge_all(msq, -EIDRM);
	ss_wakeup(&msq->q_senders, 1);
	msg_rmid(ns, msq);
-	msg_unlock(msq);
-
-	tmp = msq->q_messages.next;
-	while (tmp != &msq->q_messages) {
-		struct msg_msg *msg = list_entry(tmp, struct msg_msg, m_list);
+	ipc_unlock_object(&msq->q_perm);
+	rcu_read_unlock();
 
-		tmp = tmp->next;
+	list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
		atomic_dec(&ns->msg_hdrs);
		free_msg(msg);
	}
	atomic_sub(msq->q_cbytes, &ns->msg_bytes);
-	security_msg_queue_free(msq);
-	ipc_rcu_putref(msq);
+	ipc_rcu_putref(msq, msg_rcu_free);
 }
 
 /*
- * Called with msg_ids.rw_mutex and ipcp locked.
+ * Called with msg_ids.rwsem and ipcp locked.
  */
 static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg)
 {
@@ -312,15 +241,14 @@
 SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
 {
	struct ipc_namespace *ns;
-	struct ipc_ops msg_ops;
+	static const struct ipc_ops msg_ops = {
+		.getnew = newque,
+		.associate = msg_security,
+	};
	struct ipc_params msg_params;
 
	ns = current->nsproxy->ipc_ns;
 
-	msg_ops.getnew = newque;
-	msg_ops.associate = msg_security;
-	msg_ops.more_checks = NULL;
-
	msg_params.key = key;
	msg_params.flg = msgflg;
@@ -330,7 +258,7 @@ SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)
 static inline unsigned long
 copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
 {
-	switch(version) {
+	switch (version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
@@ -375,7 +303,7 @@ copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
 static inline unsigned long
 copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
 {
-	switch(version) {
+	switch (version) {
	case IPC_64:
		if (copy_from_user(out, buf, sizeof(*out)))
			return -EFAULT;
@@ -387,9 +315,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
			return -EFAULT;
 
-		out->msg_perm.uid      	= tbuf_old.msg_perm.uid;
-		out->msg_perm.gid      	= tbuf_old.msg_perm.gid;
-		out->msg_perm.mode     	= tbuf_old.msg_perm.mode;
+		out->msg_perm.uid	= tbuf_old.msg_perm.uid;
+		out->msg_perm.gid	= tbuf_old.msg_perm.gid;
+		out->msg_perm.mode	= tbuf_old.msg_perm.mode;
 
		if (tbuf_old.msg_qbytes == 0)
			out->msg_qbytes	= tbuf_old.msg_lqbytes;
@@ -404,9 +332,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)
 }
 
 /*
- * This function handles some msgctl commands which require the rw_mutex
+ * This function handles some msgctl commands which require the rwsem
  * to be held in write mode.
- * NOTE: no locks must be held, the rw_mutex is taken inside this function.
+ * NOTE: no locks must be held, the rwsem is taken inside this function.
  */
 static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
		       struct msqid_ds __user *buf, int version)
@@ -421,31 +349,42 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
			return -EFAULT;
	}
 
-	ipcp = ipcctl_pre_down(ns, &msg_ids(ns), msqid, cmd,
-			       &msqid64.msg_perm, msqid64.msg_qbytes);
-	if (IS_ERR(ipcp))
-		return PTR_ERR(ipcp);
+	down_write(&msg_ids(ns).rwsem);
+	rcu_read_lock();
+
+	ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd,
+				      &msqid64.msg_perm, msqid64.msg_qbytes);
+	if (IS_ERR(ipcp)) {
+		err = PTR_ERR(ipcp);
+		goto out_unlock1;
+	}
 
	msq = container_of(ipcp, struct msg_queue, q_perm);
 
	err = security_msg_queue_msgctl(msq, cmd);
	if (err)
-		goto out_unlock;
+		goto out_unlock1;
 
	switch (cmd) {
	case IPC_RMID:
+		ipc_lock_object(&msq->q_perm);
+		/* freeque unlocks the ipc object and rcu */
		freeque(ns, ipcp);
		goto out_up;
	case IPC_SET:
		if (msqid64.msg_qbytes > ns->msg_ctlmnb &&
		    !capable(CAP_SYS_RESOURCE)) {
			err = -EPERM;
-			goto out_unlock;
+			goto out_unlock1;
		}
 
+		ipc_lock_object(&msq->q_perm);
+		err = ipc_update_perm(&msqid64.msg_perm, ipcp);
+		if (err)
+			goto out_unlock0;
+
		msq->q_qbytes = msqid64.msg_qbytes;
 
-		ipc_update_perm(&msqid64.msg_perm, ipcp);
		msq->q_ctime = get_seconds();
		/* sleeping receivers might be excluded by
		 * stricter permissions.
@@ -458,25 +397,23 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,
		break;
	default:
		err = -EINVAL;
+		goto out_unlock1;
	}
-out_unlock:
-	msg_unlock(msq);
+
+out_unlock0:
+	ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+	rcu_read_unlock();
 out_up:
-	up_write(&msg_ids(ns).rw_mutex);
+	up_write(&msg_ids(ns).rwsem);
	return err;
 }
 
-SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
+static int msgctl_nolock(struct ipc_namespace *ns, int msqid,
+			 int cmd, int version, void __user *buf)
 {
+	int err;
	struct msg_queue *msq;
-	int err, version;
-	struct ipc_namespace *ns;
-
-	if (msqid < 0 || cmd < 0)
-		return -EINVAL;
-
-	version = ipc_parse_version(&cmd);
-	ns = current->nsproxy->ipc_ns;
 
	switch (cmd) {
	case IPC_INFO:
@@ -487,6 +424,7 @@
 
		if (!buf)
			return -EFAULT;
+
		/*
		 * We must not return kernel stack data.
		 * due to padding, it's not enough
@@ -502,7 +440,7 @@
		msginfo.msgmnb = ns->msg_ctlmnb;
		msginfo.msgssz = MSGSSZ;
		msginfo.msgseg = MSGSEG;
-		down_read(&msg_ids(ns).rw_mutex);
+		down_read(&msg_ids(ns).rwsem);
		if (cmd == MSG_INFO) {
			msginfo.msgpool = msg_ids(ns).in_use;
			msginfo.msgmap = atomic_read(&ns->msg_hdrs);
@@ -513,12 +451,13 @@
			msginfo.msgtql = MSGTQL;
		}
		max_id = ipc_get_maxid(&msg_ids(ns));
-		up_read(&msg_ids(ns).rw_mutex);
+		up_read(&msg_ids(ns).rwsem);
		if (copy_to_user(buf, &msginfo, sizeof(struct msginfo)))
			return -EFAULT;
		return (max_id < 0) ? 0 : max_id;
	}
-	case MSG_STAT:	/* msqid is an index rather than a msg queue id */
+
+	case MSG_STAT:
	case IPC_STAT:
	{
		struct msqid64_ds tbuf;
@@ -527,17 +466,25 @@
		if (!buf)
			return -EFAULT;
 
+		memset(&tbuf, 0, sizeof(tbuf));
+
+		rcu_read_lock();
		if (cmd == MSG_STAT) {
-			msq = msg_lock(ns, msqid);
-			if (IS_ERR(msq))
-				return PTR_ERR(msq);
+			msq = msq_obtain_object(ns, msqid);
+			if (IS_ERR(msq)) {
+				err = PTR_ERR(msq);
+				goto out_unlock;
+			}
			success_return = msq->q_perm.id;
		} else {
-			msq = msg_lock_check(ns, msqid);
-			if (IS_ERR(msq))
-				return PTR_ERR(msq);
+			msq = msq_obtain_object_check(ns, msqid);
+			if (IS_ERR(msq)) {
+				err = PTR_ERR(msq);
+				goto out_unlock;
+			}
			success_return = 0;
		}
+
		err = -EACCES;
		if (ipcperms(ns, &msq->q_perm, S_IRUGO))
			goto out_unlock;
@@ -546,8 +493,6 @@
		if (err)
			goto out_unlock;
 
-		memset(&tbuf, 0, sizeof(tbuf));
-
		kernel_to_ipc64_perm(&msq->q_perm, &tbuf.msg_perm);
		tbuf.msg_stime  = msq->q_stime;
		tbuf.msg_rtime  = msq->q_rtime;
@@ -557,71 +502,97 @@
		tbuf.msg_qbytes = msq->q_qbytes;
		tbuf.msg_lspid  = msq->q_lspid;
		tbuf.msg_lrpid  = msq->q_lrpid;
-		msg_unlock(msq);
+		rcu_read_unlock();
+
		if (copy_msqid_to_user(buf, &tbuf, version))
			return -EFAULT;
		return success_return;
	}
-	case IPC_SET:
-	case IPC_RMID:
-		err = msgctl_down(ns, msqid, cmd, buf, version);
-		return err;
+
	default:
-		return  -EINVAL;
+		return -EINVAL;
	}
 
+	return err;
 out_unlock:
-	msg_unlock(msq);
+	rcu_read_unlock();
	return err;
 }
 
+SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)
+{
+	int version;
+	struct ipc_namespace *ns;
+
+	if (msqid < 0 || cmd < 0)
+		return -EINVAL;
+
+	version = ipc_parse_version(&cmd);
+	ns = current->nsproxy->ipc_ns;
+
+	switch (cmd) {
+	case IPC_INFO:
+	case MSG_INFO:
+	case MSG_STAT:	/* msqid is an index rather than a msg queue id */
+	case IPC_STAT:
+		return msgctl_nolock(ns, msqid, cmd, version, buf);
+	case IPC_SET:
+	case IPC_RMID:
+		return msgctl_down(ns, msqid, cmd, buf, version);
+	default:
+		return -EINVAL;
+	}
+}
+
 static int testmsg(struct msg_msg *msg, long type, int mode)
 {
-	switch(mode)
-	{
-		case SEARCH_ANY:
+	switch (mode) {
+	case SEARCH_ANY:
+	case SEARCH_NUMBER:
+		return 1;
+	case SEARCH_LESSEQUAL:
+		if (msg->m_type <= type)
			return 1;
-		case SEARCH_LESSEQUAL:
-			if (msg->m_type <=type)
-				return 1;
-			break;
-		case SEARCH_EQUAL:
-			if (msg->m_type == type)
-				return 1;
-			break;
-		case SEARCH_NOTEQUAL:
-			if (msg->m_type != type)
-				return 1;
-			break;
+		break;
+	case SEARCH_EQUAL:
+		if (msg->m_type == type)
+			return 1;
+		break;
+	case SEARCH_NOTEQUAL:
+		if (msg->m_type != type)
+			return 1;
+		break;
	}
	return 0;
 }
 
 static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
 {
-	struct list_head *tmp;
-
-	tmp = msq->q_receivers.next;
-	while (tmp != &msq->q_receivers) {
-		struct msg_receiver *msr;
+	struct msg_receiver *msr, *t;
 
-		msr = list_entry(tmp, struct msg_receiver, r_list);
-		tmp = tmp->next;
+	list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {
		if (testmsg(msg, msr->r_msgtype, msr->r_mode) &&
		    !security_msg_queue_msgrcv(msq, msg, msr->r_tsk,
					       msr->r_msgtype, msr->r_mode)) {
 
			list_del(&msr->r_list);
			if (msr->r_maxsize < msg->m_ts) {
+				/* initialize pipelined send ordering */
				msr->r_msg = NULL;
				wake_up_process(msr->r_tsk);
-				smp_mb();
+				smp_mb(); /* see barrier comment below */
				msr->r_msg = ERR_PTR(-E2BIG);
			} else {
				msr->r_msg = NULL;
				msq->q_lrpid = task_pid_vnr(msr->r_tsk);
				msq->q_rtime = get_seconds();
				wake_up_process(msr->r_tsk);
+				/*
+				 * Ensure that the wakeup is visible before
+				 * setting r_msg, as the receiving end depends
+				 * on it. See lockless receive part 1 and 2 in
+				 * do_msgrcv().
+				 */
				smp_mb();
				msr->r_msg = msg;
@@ -629,6 +600,7 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)
			}
		}
	}
+
	return 0;
 }
@@ -654,22 +626,31 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
	msg->m_type = mtype;
	msg->m_ts = msgsz;
 
-	msq = msg_lock_check(ns, msqid);
+	rcu_read_lock();
+	msq = msq_obtain_object_check(ns, msqid);
	if (IS_ERR(msq)) {
		err = PTR_ERR(msq);
-		goto out_free;
+		goto out_unlock1;
	}
 
+	ipc_lock_object(&msq->q_perm);
+
	for (;;) {
		struct msg_sender s;
 
		err = -EACCES;
		if (ipcperms(ns, &msq->q_perm, S_IWUGO))
-			goto out_unlock_free;
+			goto out_unlock0;
+
+		/* raced with RMID? */
+		if (!ipc_valid_object(&msq->q_perm)) {
+			err = -EIDRM;
+			goto out_unlock0;
+		}
 
		err = security_msg_queue_msgsnd(msq, msg, msgflg);
		if (err)
-			goto out_unlock_free;
+			goto out_unlock0;
 
		if (msgsz + msq->q_cbytes <= msq->q_qbytes &&
				1 + msq->q_qnum <= msq->q_qbytes) {
@@ -679,27 +660,39 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
		/* queue full, wait: */
		if (msgflg & IPC_NOWAIT) {
			err = -EAGAIN;
-			goto out_unlock_free;
+			goto out_unlock0;
		}
+
+		/* enqueue the sender and prepare to block */
		ss_add(msq, &s);
-		ipc_rcu_getref(msq);
-		msg_unlock(msq);
+
+		if (!ipc_rcu_getref(msq)) {
+			err = -EIDRM;
+			goto out_unlock0;
+		}
+
+		ipc_unlock_object(&msq->q_perm);
+		rcu_read_unlock();
		schedule();
 
-		ipc_lock_by_ptr(&msq->q_perm);
-		ipc_rcu_putref(msq);
-		if (msq->q_perm.deleted) {
+		rcu_read_lock();
+		ipc_lock_object(&msq->q_perm);
+
+		ipc_rcu_putref(msq, ipc_rcu_free);
+		/* raced with RMID? */
+		if (!ipc_valid_object(&msq->q_perm)) {
			err = -EIDRM;
-			goto out_unlock_free;
+			goto out_unlock0;
		}
+
		ss_del(&s);
 
		if (signal_pending(current)) {
			err = -ERESTARTNOHAND;
-			goto out_unlock_free;
+			goto out_unlock0;
		}
-	}
+	}
 
	msq->q_lspid = task_tgid_vnr(current);
	msq->q_stime = get_seconds();
@@ -715,9 +708,10 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,
	err = 0;
	msg = NULL;
 
-out_unlock_free:
-	msg_unlock(msq);
-out_free:
+out_unlock0:
+	ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+	rcu_read_unlock();
	if (msg != NULL)
		free_msg(msg);
	return err;
@@ -735,6 +729,8 @@ SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
 
 static inline int convert_mode(long *msgtyp, int msgflg)
 {
+	if (msgflg & MSG_COPY)
+		return SEARCH_NUMBER;
	/*
	 *  find message of correct type.
	 *  msgtyp = 0 => get first.
@@ -752,62 +748,142 @@ static inline int convert_mode(long *msgtyp, int msgflg)
	return SEARCH_EQUAL;
 }
 
-long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
-		size_t msgsz, long msgtyp, int msgflg)
+static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz)
+{
+	struct msgbuf __user *msgp = dest;
+	size_t msgsz;
+
+	if (put_user(msg->m_type, &msgp->mtype))
+		return -EFAULT;
+
+	msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz;
+	if (store_msg(msgp->mtext, msg, msgsz))
+		return -EFAULT;
+	return msgsz;
+}
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * This function creates new kernel message structure, large enough to store
+ * bufsz message bytes.
+ */
+static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz)
+{
+	struct msg_msg *copy;
+
+	/*
+	 * Create dummy message to copy real message to.
+	 */
+	copy = load_msg(buf, bufsz);
+	if (!IS_ERR(copy))
+		copy->m_ts = bufsz;
+	return copy;
+}
+
+static inline void free_copy(struct msg_msg *copy)
+{
+	if (copy)
+		free_msg(copy);
+}
+#else
+static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
+static inline void free_copy(struct msg_msg *copy)
+{
+}
+#endif
+
+static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode)
+{
+	struct msg_msg *msg, *found = NULL;
+	long count = 0;
+
+	list_for_each_entry(msg, &msq->q_messages, m_list) {
+		if (testmsg(msg, *msgtyp, mode) &&
+		    !security_msg_queue_msgrcv(msq, msg, current,
+					       *msgtyp, mode)) {
+			if (mode == SEARCH_LESSEQUAL && msg->m_type != 1) {
+				*msgtyp = msg->m_type - 1;
+				found = msg;
+			} else if (mode == SEARCH_NUMBER) {
+				if (*msgtyp == count)
+					return msg;
+			} else
+				return msg;
+			count++;
+		}
+	}
+
+	return found ?: ERR_PTR(-EAGAIN);
+}
+
+long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg,
+	       long (*msg_handler)(void __user *, struct msg_msg *, size_t))
 {
-	struct msg_queue *msq;
-	struct msg_msg *msg;
	int mode;
+	struct msg_queue *msq;
	struct ipc_namespace *ns;
+	struct msg_msg *msg, *copy = NULL;
+
+	ns = current->nsproxy->ipc_ns;
 
-	if (msqid < 0 || (long) msgsz < 0)
+	if (msqid < 0 || (long) bufsz < 0)
		return -EINVAL;
+
+	if (msgflg & MSG_COPY) {
+		if ((msgflg & MSG_EXCEPT) || !(msgflg & IPC_NOWAIT))
+			return -EINVAL;
+		copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax));
+		if (IS_ERR(copy))
+			return PTR_ERR(copy);
+	}
	mode = convert_mode(&msgtyp, msgflg);
-	ns = current->nsproxy->ipc_ns;
 
-	msq = msg_lock_check(ns, msqid);
-	if (IS_ERR(msq))
+	rcu_read_lock();
+	msq = msq_obtain_object_check(ns, msqid);
+	if (IS_ERR(msq)) {
+		rcu_read_unlock();
+		free_copy(copy);
		return PTR_ERR(msq);
+	}
 
	for (;;) {
		struct msg_receiver msr_d;
-		struct list_head *tmp;
 
		msg = ERR_PTR(-EACCES);
		if (ipcperms(ns, &msq->q_perm, S_IRUGO))
-			goto out_unlock;
+			goto out_unlock1;
 
-		msg = ERR_PTR(-EAGAIN);
-		tmp = msq->q_messages.next;
-		while (tmp != &msq->q_messages) {
-			struct msg_msg *walk_msg;
-
-			walk_msg = list_entry(tmp, struct msg_msg, m_list);
-			if (testmsg(walk_msg, msgtyp, mode) &&
-			    !security_msg_queue_msgrcv(msq, walk_msg, current,
-						       msgtyp, mode)) {
-
-				msg = walk_msg;
-				if (mode == SEARCH_LESSEQUAL &&
-						walk_msg->m_type != 1) {
-					msg = walk_msg;
-					msgtyp = walk_msg->m_type - 1;
-				} else {
-					msg = walk_msg;
-					break;
-				}
-			}
-			tmp = tmp->next;
+		ipc_lock_object(&msq->q_perm);
+
+		/* raced with RMID? */
+		if (!ipc_valid_object(&msq->q_perm)) {
+			msg = ERR_PTR(-EIDRM);
+			goto out_unlock0;
		}
+
+		msg = find_msg(msq, &msgtyp, mode);
		if (!IS_ERR(msg)) {
			/*
			 * Found a suitable message.
			 * Unlink it from the queue.
			 */
-			if ((msgsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) {
+			if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) {
				msg = ERR_PTR(-E2BIG);
-				goto out_unlock;
+				goto out_unlock0;
			}
+			/*
			 * If we are copying, then do not unlink message and do
			 * not update queue parameters.
			 */
+			if (msgflg & MSG_COPY) {
+				msg = copy_msg(msg, copy);
+				goto out_unlock0;
+			}
+
			list_del(&msg->m_list);
			msq->q_qnum--;
			msq->q_rtime = get_seconds();
@@ -816,14 +892,16 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
			atomic_sub(msg->m_ts, &ns->msg_bytes);
			atomic_dec(&ns->msg_hdrs);
			ss_wakeup(&msq->q_senders, 0);
-			msg_unlock(msq);
-			break;
+
+			goto out_unlock0;
		}
+
		/* No message waiting. Wait for a message */
		if (msgflg & IPC_NOWAIT) {
			msg = ERR_PTR(-ENOMSG);
-			goto out_unlock;
+			goto out_unlock0;
		}
+
		list_add_tail(&msr_d.r_list, &msq->q_receivers);
		msr_d.r_tsk = current;
		msr_d.r_msgtype = msgtyp;
@@ -831,11 +909,12 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
		if (msgflg & MSG_NOERROR)
			msr_d.r_maxsize = INT_MAX;
		else
-			msr_d.r_maxsize = msgsz;
+			msr_d.r_maxsize = bufsz;
		msr_d.r_msg = ERR_PTR(-EAGAIN);
-		current->state = TASK_INTERRUPTIBLE;
-		msg_unlock(msq);
+		__set_current_state(TASK_INTERRUPTIBLE);
 
+		ipc_unlock_object(&msq->q_perm);
+		rcu_read_unlock();
		schedule();
 
		/* Lockless receive, part 1:
@@ -846,7 +925,7 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
		 * Prior to destruction, expunge_all(-EIRDM) changes r_msg.
		 * Thus if r_msg is -EAGAIN, then the queue not yet destroyed.
		 * rcu_read_lock() prevents preemption between reading r_msg
-		 * and the spin_lock() inside ipc_lock_by_ptr().
+		 * and acquiring the q_perm.lock in ipc_lock_object().
		 */
		rcu_read_lock();
@@ -855,7 +934,7 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
		 * wake_up_process(). There is a race with exit(), see
		 * ipc/mqueue.c for the details.
		 */
-		msg = (struct msg_msg*)msr_d.r_msg;
+		msg = (struct msg_msg *)msr_d.r_msg;
		while (msg == NULL) {
			cpu_relax();
			msg = (struct msg_msg *)msr_d.r_msg;
@@ -865,63 +944,106 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext,
		 * If there is a message or an error then accept it without
		 * locking.
		 */
-		if (msg != ERR_PTR(-EAGAIN)) {
-			rcu_read_unlock();
-			break;
-		}
+		if (msg != ERR_PTR(-EAGAIN))
+			goto out_unlock1;
 
		/* Lockless receive, part 3:
		 * Acquire the queue spinlock.
		 */
-		ipc_lock_by_ptr(&msq->q_perm);
-		rcu_read_unlock();
+		ipc_lock_object(&msq->q_perm);
 
		/* Lockless receive, part 4:
		 * Repeat test after acquiring the spinlock.
		 */
-		msg = (struct msg_msg*)msr_d.r_msg;
+		msg = (struct msg_msg *)msr_d.r_msg;
		if (msg != ERR_PTR(-EAGAIN))
-			goto out_unlock;
+			goto out_unlock0;
 
		list_del(&msr_d.r_list);
		if (signal_pending(current)) {
			msg = ERR_PTR(-ERESTARTNOHAND);
-out_unlock:
-			msg_unlock(msq);
-			break;
+			goto out_unlock0;
		}
+
+		ipc_unlock_object(&msq->q_perm);
	}
-	if (IS_ERR(msg))
-		return PTR_ERR(msg);
 
-	msgsz = (msgsz > msg->m_ts) ? msg->m_ts : msgsz;
-	*pmtype = msg->m_type;
-	if (store_msg(mtext, msg, msgsz))
-		msgsz = -EFAULT;
+out_unlock0:
+	ipc_unlock_object(&msq->q_perm);
+out_unlock1:
+	rcu_read_unlock();
+	if (IS_ERR(msg)) {
+		free_copy(copy);
+		return PTR_ERR(msg);
+	}
 
+	bufsz = msg_handler(buf, msg, bufsz);
	free_msg(msg);
 
-	return msgsz;
+	return bufsz;
 }
 
 SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
		long, msgtyp, int, msgflg)
 {
-	long err, mtype;
+	return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill);
+}
 
-	err = do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg);
-	if (err < 0)
-		goto out;
+/*
+ * Scale msgmni with the available lowmem size: the memory dedicated to msg
+ * queues should occupy at most 1/MSG_MEM_SCALE of lowmem.
+ * Also take into account the number of nsproxies created so far.
+ * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range.
+ */
+void recompute_msgmni(struct ipc_namespace *ns)
+{
+	struct sysinfo i;
+	unsigned long allowed;
+	int nb_ns;
 
-	if (put_user(mtype, &msgp->mtype))
-		err = -EFAULT;
-out:
-	return err;
+	si_meminfo(&i);
+	allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit)
+		/ MSGMNB;
+	nb_ns = atomic_read(&nr_ipc_ns);
+	allowed /= nb_ns;
+
+	if (allowed < MSGMNI) {
+		ns->msg_ctlmni = MSGMNI;
+		return;
+	}
+
+	if (allowed > IPCMNI / nb_ns) {
+		ns->msg_ctlmni = IPCMNI / nb_ns;
+		return;
+	}
+
+	ns->msg_ctlmni = allowed;
+}
+
+void msg_init_ns(struct ipc_namespace *ns)
+{
+	ns->msg_ctlmax = MSGMAX;
+	ns->msg_ctlmnb = MSGMNB;
+
+	recompute_msgmni(ns);
+
+	atomic_set(&ns->msg_bytes, 0);
+	atomic_set(&ns->msg_hdrs, 0);
+	ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+}
+
+#ifdef CONFIG_IPC_NS
+void msg_exit_ns(struct ipc_namespace *ns)
+{
+	free_ipcs(ns, &msg_ids(ns), freeque);
+	idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
 }
+#endif
 
 #ifdef CONFIG_PROC_FS
 static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
 {
+	struct user_namespace *user_ns = seq_user_ns(s);
	struct msg_queue *msq = it;
 
	return seq_printf(s,
@@ -933,12 +1055,24 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
			msq->q_qnum,
			msq->q_lspid,
			msq->q_lrpid,
-			msq->q_perm.uid,
-			msq->q_perm.gid,
-			msq->q_perm.cuid,
-			msq->q_perm.cgid,
+			from_kuid_munged(user_ns, msq->q_perm.uid),
+			from_kgid_munged(user_ns, msq->q_perm.gid),
+			from_kuid_munged(user_ns, msq->q_perm.cuid),
+			from_kgid_munged(user_ns, msq->q_perm.cgid),
			msq->q_stime,
			msq->q_rtime,
			msq->q_ctime);
 }
 #endif
+
+void __init msg_init(void)
+{
+	msg_init_ns(&init_ipc_ns);
+
+	printk(KERN_INFO "msgmni has been set to %d\n",
+		init_ipc_ns.msg_ctlmni);
+
+	ipc_init_proc_interface("sysvipc/msg",
+				"       key      msqid perms      cbytes       qnum lspid lrpid   uid   gid  cuid  cgid      stime      rtime      ctime\n",
+				IPC_MSG_IDS, sysvipc_msg_proc_show);
+}
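
The patch splits the receive path into do_msgrcv() plus a fill callback (do_msg_fill) and replaces the coarse msg_lock() with RCU lookup plus per-queue ipc_lock_object(), but the user-visible SysV semantics are unchanged. A minimal userspace sketch (illustration only, not part of the patch) that exercises the reworked paths -- msgsnd() through the locked fast path, and a negative msgtyp driving find_msg()'s SEARCH_LESSEQUAL mode:

/*
 * Send one message, then receive it with msgtyp = -2, i.e. "first
 * message with m_type <= 2" (the SEARCH_LESSEQUAL case above).
 * Build: cc -o msgdemo msgdemo.c
 */
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/msg.h>

struct mbuf {
	long mtype;
	char mtext[64];
};

int main(void)
{
	struct mbuf m = { .mtype = 2 };
	int id = msgget(IPC_PRIVATE, IPC_CREAT | 0600);

	if (id < 0) {
		perror("msgget");
		return 1;
	}
	strcpy(m.mtext, "hello");
	if (msgsnd(id, &m, sizeof(m.mtext), 0) < 0)
		perror("msgsnd");

	if (msgrcv(id, &m, sizeof(m.mtext), -2, 0) < 0)
		perror("msgrcv");
	else
		printf("got type %ld: %s\n", m.mtype, m.mtext);

	/* IPC_RMID ends up in msgctl_down() -> freeque() above */
	msgctl(id, IPC_RMID, NULL);
	return 0;
}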
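The new SEARCH_NUMBER mode backs the MSG_COPY flag used by checkpoint/restore: convert_mode() returns SEARCH_NUMBER, msgtyp is reinterpreted as a zero-based position in the queue, and do_msgrcv() copies the message out via copy_msg() instead of unlinking it. A hedged sketch of peeking at a queue this way -- it assumes a kernel built with CONFIG_CHECKPOINT_RESTORE, and the fallback #define mirrors the value in include/uapi/linux/msg.h for libcs whose headers lack MSG_COPY:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/types.h>

#ifndef MSG_COPY
#define MSG_COPY 040000	/* from include/uapi/linux/msg.h */
#endif

struct mbuf {
	long mtype;
	char mtext[64];
};

/* Copy out the message at position 'n' without dequeueing it. */
static ssize_t peek_msg(int id, struct mbuf *m, long n)
{
	/* IPC_NOWAIT is mandatory: do_msgrcv() above returns -EINVAL
	 * for MSG_COPY without it, and -ENOSYS comes back when the
	 * kernel lacks CONFIG_CHECKPOINT_RESTORE (prepare_copy stub). */
	return msgrcv(id, m, sizeof(m->mtext), n, MSG_COPY | IPC_NOWAIT);
}

int main(void)
{
	struct mbuf m;
	int id = msgget(IPC_PRIVATE, IPC_CREAT | 0600);

	if (peek_msg(id, &m, 0) < 0)
		perror("peek_msg");	/* ENOMSG on an empty queue */
	msgctl(id, IPC_RMID, NULL);
	return 0;
}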
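The volatile qualifier on r_msg and the smp_mb() comments added above implement a publish-after-wakeup handshake: the waker stores NULL into r_msg, calls wake_up_process(), issues a full barrier, and only then publishes the final pointer, while the woken receiver spins as long as r_msg is still NULL ("lockless receive part 2"). A rough userspace analogue of that ordering -- an illustration only, substituting C11 atomics and a seq_cst fence for the kernel's volatile plus smp_mb():

/* Build: cc -pthread -o handshake handshake.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* 0 plays the role of the NULL "wakeup in flight" marker */
static _Atomic long r_msg;

static void *receiver(void *arg)
{
	long msg;

	(void)arg;
	/* spin until the waker has published a final value,
	 * like the r_msg busy-wait in do_msgrcv() */
	while ((msg = atomic_load(&r_msg)) == 0)
		;
	printf("received %ld\n", msg);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, receiver, NULL);
	/* waker side: wake_up_process() would happen here; the fence
	 * stands in for smp_mb() before publishing the result */
	atomic_thread_fence(memory_order_seq_cst);
	atomic_store(&r_msg, 42);
	pthread_join(t, NULL);
	return 0;
}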
