diff options
Diffstat (limited to 'ipc/msg.c')
| -rw-r--r-- | ipc/msg.c | 706 | 
1 files changed, 420 insertions, 286 deletions
diff --git a/ipc/msg.c b/ipc/msg.c index 747b65507a9..c5d8e374998 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -39,12 +39,10 @@  #include <linux/ipc_namespace.h>  #include <asm/current.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h>  #include "util.h" -/* - * one msg_receiver structure for each sleeping receiver: - */ +/* one msg_receiver structure for each sleeping receiver */  struct msg_receiver {  	struct list_head	r_list;  	struct task_struct	*r_tsk; @@ -53,6 +51,12 @@ struct msg_receiver {  	long			r_msgtype;  	long			r_maxsize; +	/* +	 * Mark r_msg volatile so that the compiler +	 * does not try to get smart and optimize +	 * it. We rely on this for the lockless +	 * receive algorithm. +	 */  	struct msg_msg		*volatile r_msg;  }; @@ -66,101 +70,27 @@ struct msg_sender {  #define SEARCH_EQUAL		2  #define SEARCH_NOTEQUAL		3  #define SEARCH_LESSEQUAL	4 +#define SEARCH_NUMBER		5  #define msg_ids(ns)	((ns)->ids[IPC_MSG_IDS]) -#define msg_unlock(msq)		ipc_unlock(&(msq)->q_perm) - -static void freeque(struct ipc_namespace *, struct kern_ipc_perm *); -static int newque(struct ipc_namespace *, struct ipc_params *); -#ifdef CONFIG_PROC_FS -static int sysvipc_msg_proc_show(struct seq_file *s, void *it); -#endif - -/* - * Scale msgmni with the available lowmem size: the memory dedicated to msg - * queues should occupy at most 1/MSG_MEM_SCALE of lowmem. - * Also take into account the number of nsproxies created so far. - * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range. - */ -void recompute_msgmni(struct ipc_namespace *ns) -{ -	struct sysinfo i; -	unsigned long allowed; -	int nb_ns; - -	si_meminfo(&i); -	allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit) -		/ MSGMNB; -	nb_ns = atomic_read(&nr_ipc_ns); -	allowed /= nb_ns; - -	if (allowed < MSGMNI) { -		ns->msg_ctlmni = MSGMNI; -		return; -	} - -	if (allowed > IPCMNI / nb_ns) { -		ns->msg_ctlmni = IPCMNI / nb_ns; -		return; -	} - -	ns->msg_ctlmni = allowed; -} - -void msg_init_ns(struct ipc_namespace *ns) -{ -	ns->msg_ctlmax = MSGMAX; -	ns->msg_ctlmnb = MSGMNB; - -	recompute_msgmni(ns); - -	atomic_set(&ns->msg_bytes, 0); -	atomic_set(&ns->msg_hdrs, 0); -	ipc_init_ids(&ns->ids[IPC_MSG_IDS]); -} - -#ifdef CONFIG_IPC_NS -void msg_exit_ns(struct ipc_namespace *ns) +static inline struct msg_queue *msq_obtain_object(struct ipc_namespace *ns, int id)  { -	free_ipcs(ns, &msg_ids(ns), freeque); -	idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); -} -#endif - -void __init msg_init(void) -{ -	msg_init_ns(&init_ipc_ns); - -	printk(KERN_INFO "msgmni has been set to %d\n", -		init_ipc_ns.msg_ctlmni); - -	ipc_init_proc_interface("sysvipc/msg", -				"       key      msqid perms      cbytes       qnum lspid lrpid   uid   gid  cuid  cgid      stime      rtime      ctime\n", -				IPC_MSG_IDS, sysvipc_msg_proc_show); -} - -/* - * msg_lock_(check_) routines are called in the paths where the rw_mutex - * is not held. - */ -static inline struct msg_queue *msg_lock(struct ipc_namespace *ns, int id) -{ -	struct kern_ipc_perm *ipcp = ipc_lock(&msg_ids(ns), id); +	struct kern_ipc_perm *ipcp = ipc_obtain_object(&msg_ids(ns), id);  	if (IS_ERR(ipcp)) -		return (struct msg_queue *)ipcp; +		return ERR_CAST(ipcp);  	return container_of(ipcp, struct msg_queue, q_perm);  } -static inline struct msg_queue *msg_lock_check(struct ipc_namespace *ns, -						int id) +static inline struct msg_queue *msq_obtain_object_check(struct ipc_namespace *ns, +							int id)  { -	struct kern_ipc_perm *ipcp = ipc_lock_check(&msg_ids(ns), id); +	struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&msg_ids(ns), id);  	if (IS_ERR(ipcp)) -		return (struct msg_queue *)ipcp; +		return ERR_CAST(ipcp);  	return container_of(ipcp, struct msg_queue, q_perm);  } @@ -170,12 +100,21 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s)  	ipc_rmid(&msg_ids(ns), &s->q_perm);  } +static void msg_rcu_free(struct rcu_head *head) +{ +	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); +	struct msg_queue *msq = ipc_rcu_to_struct(p); + +	security_msg_queue_free(msq); +	ipc_rcu_free(head); +} +  /**   * newque - Create a new msg queue   * @ns: namespace   * @params: ptr to the structure that contains the key and msgflg   * - * Called with msg_ids.rw_mutex held (writer) + * Called with msg_ids.rwsem held (writer)   */  static int newque(struct ipc_namespace *ns, struct ipc_params *params)  { @@ -194,17 +133,14 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)  	msq->q_perm.security = NULL;  	retval = security_msg_queue_alloc(msq);  	if (retval) { -		ipc_rcu_putref(msq); +		ipc_rcu_putref(msq, ipc_rcu_free);  		return retval;  	} -	/* -	 * ipc_addid() locks msq -	 */ +	/* ipc_addid() locks msq upon success. */  	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);  	if (id < 0) { -		security_msg_queue_free(msq); -		ipc_rcu_putref(msq); +		ipc_rcu_putref(msq, msg_rcu_free);  		return id;  	} @@ -217,7 +153,8 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)  	INIT_LIST_HEAD(&msq->q_receivers);  	INIT_LIST_HEAD(&msq->q_senders); -	msg_unlock(msq); +	ipc_unlock_object(&msq->q_perm); +	rcu_read_unlock();  	return msq->q_perm.id;  } @@ -225,7 +162,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)  static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss)  {  	mss->tsk = current; -	current->state = TASK_INTERRUPTIBLE; +	__set_current_state(TASK_INTERRUPTIBLE);  	list_add_tail(&mss->list, &msq->q_senders);  } @@ -237,14 +174,9 @@ static inline void ss_del(struct msg_sender *mss)  static void ss_wakeup(struct list_head *h, int kill)  { -	struct list_head *tmp; - -	tmp = h->next; -	while (tmp != h) { -		struct msg_sender *mss; +	struct msg_sender *mss, *t; -		mss = list_entry(tmp, struct msg_sender, list); -		tmp = tmp->next; +	list_for_each_entry_safe(mss, t, h, list) {  		if (kill)  			mss->list.next = NULL;  		wake_up_process(mss->tsk); @@ -253,16 +185,17 @@ static void ss_wakeup(struct list_head *h, int kill)  static void expunge_all(struct msg_queue *msq, int res)  { -	struct list_head *tmp; +	struct msg_receiver *msr, *t; -	tmp = msq->q_receivers.next; -	while (tmp != &msq->q_receivers) { -		struct msg_receiver *msr; - -		msr = list_entry(tmp, struct msg_receiver, r_list); -		tmp = tmp->next; -		msr->r_msg = NULL; +	list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { +		msr->r_msg = NULL; /* initialize expunge ordering */  		wake_up_process(msr->r_tsk); +		/* +		 * Ensure that the wakeup is visible before setting r_msg as +		 * the receiving end depends on it: either spinning on a nil, +		 * or dealing with -EAGAIN cases. See lockless receive part 1 +		 * and 2 in do_msgrcv(). +		 */  		smp_mb();  		msr->r_msg = ERR_PTR(res);  	} @@ -273,34 +206,30 @@ static void expunge_all(struct msg_queue *msq, int res)   * removes the message queue from message queue ID IDR, and cleans up all the   * messages associated with this queue.   * - * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held - * before freeque() is called. msg_ids.rw_mutex remains locked on exit. + * msg_ids.rwsem (writer) and the spinlock for this message queue are held + * before freeque() is called. msg_ids.rwsem remains locked on exit.   */  static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)  { -	struct list_head *tmp; +	struct msg_msg *msg, *t;  	struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);  	expunge_all(msq, -EIDRM);  	ss_wakeup(&msq->q_senders, 1);  	msg_rmid(ns, msq); -	msg_unlock(msq); - -	tmp = msq->q_messages.next; -	while (tmp != &msq->q_messages) { -		struct msg_msg *msg = list_entry(tmp, struct msg_msg, m_list); +	ipc_unlock_object(&msq->q_perm); +	rcu_read_unlock(); -		tmp = tmp->next; +	list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {  		atomic_dec(&ns->msg_hdrs);  		free_msg(msg);  	}  	atomic_sub(msq->q_cbytes, &ns->msg_bytes); -	security_msg_queue_free(msq); -	ipc_rcu_putref(msq); +	ipc_rcu_putref(msq, msg_rcu_free);  }  /* - * Called with msg_ids.rw_mutex and ipcp locked. + * Called with msg_ids.rwsem and ipcp locked.   */  static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg)  { @@ -312,15 +241,14 @@ static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg)  SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)  {  	struct ipc_namespace *ns; -	struct ipc_ops msg_ops; +	static const struct ipc_ops msg_ops = { +		.getnew = newque, +		.associate = msg_security, +	};  	struct ipc_params msg_params;  	ns = current->nsproxy->ipc_ns; -	msg_ops.getnew = newque; -	msg_ops.associate = msg_security; -	msg_ops.more_checks = NULL; -  	msg_params.key = key;  	msg_params.flg = msgflg; @@ -330,7 +258,7 @@ SYSCALL_DEFINE2(msgget, key_t, key, int, msgflg)  static inline unsigned long  copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)  { -	switch(version) { +	switch (version) {  	case IPC_64:  		return copy_to_user(buf, in, sizeof(*in));  	case IPC_OLD: @@ -375,7 +303,7 @@ copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)  static inline unsigned long  copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)  { -	switch(version) { +	switch (version) {  	case IPC_64:  		if (copy_from_user(out, buf, sizeof(*out)))  			return -EFAULT; @@ -387,9 +315,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)  		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))  			return -EFAULT; -		out->msg_perm.uid      	= tbuf_old.msg_perm.uid; -		out->msg_perm.gid      	= tbuf_old.msg_perm.gid; -		out->msg_perm.mode     	= tbuf_old.msg_perm.mode; +		out->msg_perm.uid	= tbuf_old.msg_perm.uid; +		out->msg_perm.gid	= tbuf_old.msg_perm.gid; +		out->msg_perm.mode	= tbuf_old.msg_perm.mode;  		if (tbuf_old.msg_qbytes == 0)  			out->msg_qbytes	= tbuf_old.msg_lqbytes; @@ -404,9 +332,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version)  }  /* - * This function handles some msgctl commands which require the rw_mutex + * This function handles some msgctl commands which require the rwsem   * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function.   */  static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,  		       struct msqid_ds __user *buf, int version) @@ -421,31 +349,42 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,  			return -EFAULT;  	} -	ipcp = ipcctl_pre_down(&msg_ids(ns), msqid, cmd, -			       &msqid64.msg_perm, msqid64.msg_qbytes); -	if (IS_ERR(ipcp)) -		return PTR_ERR(ipcp); +	down_write(&msg_ids(ns).rwsem); +	rcu_read_lock(); + +	ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd, +				      &msqid64.msg_perm, msqid64.msg_qbytes); +	if (IS_ERR(ipcp)) { +		err = PTR_ERR(ipcp); +		goto out_unlock1; +	}  	msq = container_of(ipcp, struct msg_queue, q_perm);  	err = security_msg_queue_msgctl(msq, cmd);  	if (err) -		goto out_unlock; +		goto out_unlock1;  	switch (cmd) {  	case IPC_RMID: +		ipc_lock_object(&msq->q_perm); +		/* freeque unlocks the ipc object and rcu */  		freeque(ns, ipcp);  		goto out_up;  	case IPC_SET:  		if (msqid64.msg_qbytes > ns->msg_ctlmnb &&  		    !capable(CAP_SYS_RESOURCE)) {  			err = -EPERM; -			goto out_unlock; +			goto out_unlock1;  		} +		ipc_lock_object(&msq->q_perm); +		err = ipc_update_perm(&msqid64.msg_perm, ipcp); +		if (err) +			goto out_unlock0; +  		msq->q_qbytes = msqid64.msg_qbytes; -		ipc_update_perm(&msqid64.msg_perm, ipcp);  		msq->q_ctime = get_seconds();  		/* sleeping receivers might be excluded by  		 * stricter permissions. @@ -458,25 +397,23 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd,  		break;  	default:  		err = -EINVAL; +		goto out_unlock1;  	} -out_unlock: -	msg_unlock(msq); + +out_unlock0: +	ipc_unlock_object(&msq->q_perm); +out_unlock1: +	rcu_read_unlock();  out_up: -	up_write(&msg_ids(ns).rw_mutex); +	up_write(&msg_ids(ns).rwsem);  	return err;  } -SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) +static int msgctl_nolock(struct ipc_namespace *ns, int msqid, +			 int cmd, int version, void __user *buf)  { +	int err;  	struct msg_queue *msq; -	int err, version; -	struct ipc_namespace *ns; - -	if (msqid < 0 || cmd < 0) -		return -EINVAL; - -	version = ipc_parse_version(&cmd); -	ns = current->nsproxy->ipc_ns;  	switch (cmd) {  	case IPC_INFO: @@ -487,6 +424,7 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)  		if (!buf)  			return -EFAULT; +  		/*  		 * We must not return kernel stack data.  		 * due to padding, it's not enough @@ -502,7 +440,7 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)  		msginfo.msgmnb = ns->msg_ctlmnb;  		msginfo.msgssz = MSGSSZ;  		msginfo.msgseg = MSGSEG; -		down_read(&msg_ids(ns).rw_mutex); +		down_read(&msg_ids(ns).rwsem);  		if (cmd == MSG_INFO) {  			msginfo.msgpool = msg_ids(ns).in_use;  			msginfo.msgmap = atomic_read(&ns->msg_hdrs); @@ -513,12 +451,13 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)  			msginfo.msgtql = MSGTQL;  		}  		max_id = ipc_get_maxid(&msg_ids(ns)); -		up_read(&msg_ids(ns).rw_mutex); +		up_read(&msg_ids(ns).rwsem);  		if (copy_to_user(buf, &msginfo, sizeof(struct msginfo)))  			return -EFAULT;  		return (max_id < 0) ? 0 : max_id;  	} -	case MSG_STAT:	/* msqid is an index rather than a msg queue id */ + +	case MSG_STAT:  	case IPC_STAT:  	{  		struct msqid64_ds tbuf; @@ -527,27 +466,33 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)  		if (!buf)  			return -EFAULT; +		memset(&tbuf, 0, sizeof(tbuf)); + +		rcu_read_lock();  		if (cmd == MSG_STAT) { -			msq = msg_lock(ns, msqid); -			if (IS_ERR(msq)) -				return PTR_ERR(msq); +			msq = msq_obtain_object(ns, msqid); +			if (IS_ERR(msq)) { +				err = PTR_ERR(msq); +				goto out_unlock; +			}  			success_return = msq->q_perm.id;  		} else { -			msq = msg_lock_check(ns, msqid); -			if (IS_ERR(msq)) -				return PTR_ERR(msq); +			msq = msq_obtain_object_check(ns, msqid); +			if (IS_ERR(msq)) { +				err = PTR_ERR(msq); +				goto out_unlock; +			}  			success_return = 0;  		} +  		err = -EACCES; -		if (ipcperms(&msq->q_perm, S_IRUGO)) +		if (ipcperms(ns, &msq->q_perm, S_IRUGO))  			goto out_unlock;  		err = security_msg_queue_msgctl(msq, cmd);  		if (err)  			goto out_unlock; -		memset(&tbuf, 0, sizeof(tbuf)); -  		kernel_to_ipc64_perm(&msq->q_perm, &tbuf.msg_perm);  		tbuf.msg_stime  = msq->q_stime;  		tbuf.msg_rtime  = msq->q_rtime; @@ -557,71 +502,97 @@ SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf)  		tbuf.msg_qbytes = msq->q_qbytes;  		tbuf.msg_lspid  = msq->q_lspid;  		tbuf.msg_lrpid  = msq->q_lrpid; -		msg_unlock(msq); +		rcu_read_unlock(); +  		if (copy_msqid_to_user(buf, &tbuf, version))  			return -EFAULT;  		return success_return;  	} -	case IPC_SET: -	case IPC_RMID: -		err = msgctl_down(ns, msqid, cmd, buf, version); -		return err; +  	default: -		return  -EINVAL; +		return -EINVAL;  	} +	return err;  out_unlock: -	msg_unlock(msq); +	rcu_read_unlock();  	return err;  } +SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) +{ +	int version; +	struct ipc_namespace *ns; + +	if (msqid < 0 || cmd < 0) +		return -EINVAL; + +	version = ipc_parse_version(&cmd); +	ns = current->nsproxy->ipc_ns; + +	switch (cmd) { +	case IPC_INFO: +	case MSG_INFO: +	case MSG_STAT:	/* msqid is an index rather than a msg queue id */ +	case IPC_STAT: +		return msgctl_nolock(ns, msqid, cmd, version, buf); +	case IPC_SET: +	case IPC_RMID: +		return msgctl_down(ns, msqid, cmd, buf, version); +	default: +		return  -EINVAL; +	} +} +  static int testmsg(struct msg_msg *msg, long type, int mode)  { -	switch(mode) -	{ -		case SEARCH_ANY: +	switch (mode) { +	case SEARCH_ANY: +	case SEARCH_NUMBER: +		return 1; +	case SEARCH_LESSEQUAL: +		if (msg->m_type <= type)  			return 1; -		case SEARCH_LESSEQUAL: -			if (msg->m_type <=type) -				return 1; -			break; -		case SEARCH_EQUAL: -			if (msg->m_type == type) -				return 1; -			break; -		case SEARCH_NOTEQUAL: -			if (msg->m_type != type) -				return 1; -			break; +		break; +	case SEARCH_EQUAL: +		if (msg->m_type == type) +			return 1; +		break; +	case SEARCH_NOTEQUAL: +		if (msg->m_type != type) +			return 1; +		break;  	}  	return 0;  }  static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)  { -	struct list_head *tmp; - -	tmp = msq->q_receivers.next; -	while (tmp != &msq->q_receivers) { -		struct msg_receiver *msr; +	struct msg_receiver *msr, *t; -		msr = list_entry(tmp, struct msg_receiver, r_list); -		tmp = tmp->next; +	list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) {  		if (testmsg(msg, msr->r_msgtype, msr->r_mode) &&  		    !security_msg_queue_msgrcv(msq, msg, msr->r_tsk,  					       msr->r_msgtype, msr->r_mode)) {  			list_del(&msr->r_list);  			if (msr->r_maxsize < msg->m_ts) { +				/* initialize pipelined send ordering */  				msr->r_msg = NULL;  				wake_up_process(msr->r_tsk); -				smp_mb(); +				smp_mb(); /* see barrier comment below */  				msr->r_msg = ERR_PTR(-E2BIG);  			} else {  				msr->r_msg = NULL;  				msq->q_lrpid = task_pid_vnr(msr->r_tsk);  				msq->q_rtime = get_seconds();  				wake_up_process(msr->r_tsk); +				/* +				 * Ensure that the wakeup is visible before +				 * setting r_msg, as the receiving end depends +				 * on it. See lockless receive part 1 and 2 in +				 * do_msgrcv(). +				 */  				smp_mb();  				msr->r_msg = msg; @@ -629,6 +600,7 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg)  			}  		}  	} +  	return 0;  } @@ -654,22 +626,31 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,  	msg->m_type = mtype;  	msg->m_ts = msgsz; -	msq = msg_lock_check(ns, msqid); +	rcu_read_lock(); +	msq = msq_obtain_object_check(ns, msqid);  	if (IS_ERR(msq)) {  		err = PTR_ERR(msq); -		goto out_free; +		goto out_unlock1;  	} +	ipc_lock_object(&msq->q_perm); +  	for (;;) {  		struct msg_sender s;  		err = -EACCES; -		if (ipcperms(&msq->q_perm, S_IWUGO)) -			goto out_unlock_free; +		if (ipcperms(ns, &msq->q_perm, S_IWUGO)) +			goto out_unlock0; + +		/* raced with RMID? */ +		if (!ipc_valid_object(&msq->q_perm)) { +			err = -EIDRM; +			goto out_unlock0; +		}  		err = security_msg_queue_msgsnd(msq, msg, msgflg);  		if (err) -			goto out_unlock_free; +			goto out_unlock0;  		if (msgsz + msq->q_cbytes <= msq->q_qbytes &&  				1 + msq->q_qnum <= msq->q_qbytes) { @@ -679,32 +660,44 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,  		/* queue full, wait: */  		if (msgflg & IPC_NOWAIT) {  			err = -EAGAIN; -			goto out_unlock_free; +			goto out_unlock0;  		} + +		/* enqueue the sender and prepare to block */  		ss_add(msq, &s); -		ipc_rcu_getref(msq); -		msg_unlock(msq); + +		if (!ipc_rcu_getref(msq)) { +			err = -EIDRM; +			goto out_unlock0; +		} + +		ipc_unlock_object(&msq->q_perm); +		rcu_read_unlock();  		schedule(); -		ipc_lock_by_ptr(&msq->q_perm); -		ipc_rcu_putref(msq); -		if (msq->q_perm.deleted) { +		rcu_read_lock(); +		ipc_lock_object(&msq->q_perm); + +		ipc_rcu_putref(msq, ipc_rcu_free); +		/* raced with RMID? */ +		if (!ipc_valid_object(&msq->q_perm)) {  			err = -EIDRM; -			goto out_unlock_free; +			goto out_unlock0;  		} +  		ss_del(&s);  		if (signal_pending(current)) {  			err = -ERESTARTNOHAND; -			goto out_unlock_free; +			goto out_unlock0;  		} -	} +	}  	msq->q_lspid = task_tgid_vnr(current);  	msq->q_stime = get_seconds();  	if (!pipelined_send(msq, msg)) { -		/* noone is waiting for this message, enqueue it */ +		/* no one is waiting for this message, enqueue it */  		list_add_tail(&msg->m_list, &msq->q_messages);  		msq->q_cbytes += msgsz;  		msq->q_qnum++; @@ -715,9 +708,10 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,  	err = 0;  	msg = NULL; -out_unlock_free: -	msg_unlock(msq); -out_free: +out_unlock0: +	ipc_unlock_object(&msq->q_perm); +out_unlock1: +	rcu_read_unlock();  	if (msg != NULL)  		free_msg(msg);  	return err; @@ -735,6 +729,8 @@ SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,  static inline int convert_mode(long *msgtyp, int msgflg)  { +	if (msgflg & MSG_COPY) +		return SEARCH_NUMBER;  	/*  	 *  find message of correct type.  	 *  msgtyp = 0 => get first. @@ -752,62 +748,142 @@ static inline int convert_mode(long *msgtyp, int msgflg)  	return SEARCH_EQUAL;  } -long do_msgrcv(int msqid, long *pmtype, void __user *mtext, -		size_t msgsz, long msgtyp, int msgflg) +static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) +{ +	struct msgbuf __user *msgp = dest; +	size_t msgsz; + +	if (put_user(msg->m_type, &msgp->mtype)) +		return -EFAULT; + +	msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; +	if (store_msg(msgp->mtext, msg, msgsz)) +		return -EFAULT; +	return msgsz; +} + +#ifdef CONFIG_CHECKPOINT_RESTORE +/* + * This function creates new kernel message structure, large enough to store + * bufsz message bytes. + */ +static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) +{ +	struct msg_msg *copy; + +	/* +	 * Create dummy message to copy real message to. +	 */ +	copy = load_msg(buf, bufsz); +	if (!IS_ERR(copy)) +		copy->m_ts = bufsz; +	return copy; +} + +static inline void free_copy(struct msg_msg *copy) +{ +	if (copy) +		free_msg(copy); +} +#else +static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) +{ +	return ERR_PTR(-ENOSYS); +} + +static inline void free_copy(struct msg_msg *copy) +{ +} +#endif + +static struct msg_msg *find_msg(struct msg_queue *msq, long *msgtyp, int mode) +{ +	struct msg_msg *msg, *found = NULL; +	long count = 0; + +	list_for_each_entry(msg, &msq->q_messages, m_list) { +		if (testmsg(msg, *msgtyp, mode) && +		    !security_msg_queue_msgrcv(msq, msg, current, +					       *msgtyp, mode)) { +			if (mode == SEARCH_LESSEQUAL && msg->m_type != 1) { +				*msgtyp = msg->m_type - 1; +				found = msg; +			} else if (mode == SEARCH_NUMBER) { +				if (*msgtyp == count) +					return msg; +			} else +				return msg; +			count++; +		} +	} + +	return found ?: ERR_PTR(-EAGAIN); +} + +long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, +	       long (*msg_handler)(void __user *, struct msg_msg *, size_t))  { -	struct msg_queue *msq; -	struct msg_msg *msg;  	int mode; +	struct msg_queue *msq;  	struct ipc_namespace *ns; +	struct msg_msg *msg, *copy = NULL; + +	ns = current->nsproxy->ipc_ns; -	if (msqid < 0 || (long) msgsz < 0) +	if (msqid < 0 || (long) bufsz < 0)  		return -EINVAL; + +	if (msgflg & MSG_COPY) { +		if ((msgflg & MSG_EXCEPT) || !(msgflg & IPC_NOWAIT)) +			return -EINVAL; +		copy = prepare_copy(buf, min_t(size_t, bufsz, ns->msg_ctlmax)); +		if (IS_ERR(copy)) +			return PTR_ERR(copy); +	}  	mode = convert_mode(&msgtyp, msgflg); -	ns = current->nsproxy->ipc_ns; -	msq = msg_lock_check(ns, msqid); -	if (IS_ERR(msq)) +	rcu_read_lock(); +	msq = msq_obtain_object_check(ns, msqid); +	if (IS_ERR(msq)) { +		rcu_read_unlock(); +		free_copy(copy);  		return PTR_ERR(msq); +	}  	for (;;) {  		struct msg_receiver msr_d; -		struct list_head *tmp;  		msg = ERR_PTR(-EACCES); -		if (ipcperms(&msq->q_perm, S_IRUGO)) -			goto out_unlock; +		if (ipcperms(ns, &msq->q_perm, S_IRUGO)) +			goto out_unlock1; -		msg = ERR_PTR(-EAGAIN); -		tmp = msq->q_messages.next; -		while (tmp != &msq->q_messages) { -			struct msg_msg *walk_msg; - -			walk_msg = list_entry(tmp, struct msg_msg, m_list); -			if (testmsg(walk_msg, msgtyp, mode) && -			    !security_msg_queue_msgrcv(msq, walk_msg, current, -						       msgtyp, mode)) { - -				msg = walk_msg; -				if (mode == SEARCH_LESSEQUAL && -						walk_msg->m_type != 1) { -					msg = walk_msg; -					msgtyp = walk_msg->m_type - 1; -				} else { -					msg = walk_msg; -					break; -				} -			} -			tmp = tmp->next; +		ipc_lock_object(&msq->q_perm); + +		/* raced with RMID? */ +		if (!ipc_valid_object(&msq->q_perm)) { +			msg = ERR_PTR(-EIDRM); +			goto out_unlock0;  		} + +		msg = find_msg(msq, &msgtyp, mode);  		if (!IS_ERR(msg)) {  			/*  			 * Found a suitable message.  			 * Unlink it from the queue.  			 */ -			if ((msgsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) { +			if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) {  				msg = ERR_PTR(-E2BIG); -				goto out_unlock; +				goto out_unlock0;  			} +			/* +			 * If we are copying, then do not unlink message and do +			 * not update queue parameters. +			 */ +			if (msgflg & MSG_COPY) { +				msg = copy_msg(msg, copy); +				goto out_unlock0; +			} +  			list_del(&msg->m_list);  			msq->q_qnum--;  			msq->q_rtime = get_seconds(); @@ -816,14 +892,16 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext,  			atomic_sub(msg->m_ts, &ns->msg_bytes);  			atomic_dec(&ns->msg_hdrs);  			ss_wakeup(&msq->q_senders, 0); -			msg_unlock(msq); -			break; + +			goto out_unlock0;  		} +  		/* No message waiting. Wait for a message */  		if (msgflg & IPC_NOWAIT) {  			msg = ERR_PTR(-ENOMSG); -			goto out_unlock; +			goto out_unlock0;  		} +  		list_add_tail(&msr_d.r_list, &msq->q_receivers);  		msr_d.r_tsk = current;  		msr_d.r_msgtype = msgtyp; @@ -831,22 +909,23 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext,  		if (msgflg & MSG_NOERROR)  			msr_d.r_maxsize = INT_MAX;  		else -			msr_d.r_maxsize = msgsz; +			msr_d.r_maxsize = bufsz;  		msr_d.r_msg = ERR_PTR(-EAGAIN); -		current->state = TASK_INTERRUPTIBLE; -		msg_unlock(msq); +		__set_current_state(TASK_INTERRUPTIBLE); +		ipc_unlock_object(&msq->q_perm); +		rcu_read_unlock();  		schedule();  		/* Lockless receive, part 1:  		 * Disable preemption.  We don't hold a reference to the queue  		 * and getting a reference would defeat the idea of a lockless  		 * operation, thus the code relies on rcu to guarantee the -		 * existance of msq: +		 * existence of msq:  		 * Prior to destruction, expunge_all(-EIRDM) changes r_msg.  		 * Thus if r_msg is -EAGAIN, then the queue not yet destroyed.  		 * rcu_read_lock() prevents preemption between reading r_msg -		 * and the spin_lock() inside ipc_lock_by_ptr(). +		 * and acquiring the q_perm.lock in ipc_lock_object().  		 */  		rcu_read_lock(); @@ -855,7 +934,7 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext,  		 * wake_up_process(). There is a race with exit(), see  		 * ipc/mqueue.c for the details.  		 */ -		msg = (struct msg_msg*)msr_d.r_msg; +		msg = (struct msg_msg *)msr_d.r_msg;  		while (msg == NULL) {  			cpu_relax();  			msg = (struct msg_msg *)msr_d.r_msg; @@ -865,63 +944,106 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext,  		 * If there is a message or an error then accept it without  		 * locking.  		 */ -		if (msg != ERR_PTR(-EAGAIN)) { -			rcu_read_unlock(); -			break; -		} +		if (msg != ERR_PTR(-EAGAIN)) +			goto out_unlock1;  		/* Lockless receive, part 3:  		 * Acquire the queue spinlock.  		 */ -		ipc_lock_by_ptr(&msq->q_perm); -		rcu_read_unlock(); +		ipc_lock_object(&msq->q_perm);  		/* Lockless receive, part 4:  		 * Repeat test after acquiring the spinlock.  		 */ -		msg = (struct msg_msg*)msr_d.r_msg; +		msg = (struct msg_msg *)msr_d.r_msg;  		if (msg != ERR_PTR(-EAGAIN)) -			goto out_unlock; +			goto out_unlock0;  		list_del(&msr_d.r_list);  		if (signal_pending(current)) {  			msg = ERR_PTR(-ERESTARTNOHAND); -out_unlock: -			msg_unlock(msq); -			break; +			goto out_unlock0;  		} + +		ipc_unlock_object(&msq->q_perm);  	} -	if (IS_ERR(msg)) -		return PTR_ERR(msg); -	msgsz = (msgsz > msg->m_ts) ? msg->m_ts : msgsz; -	*pmtype = msg->m_type; -	if (store_msg(mtext, msg, msgsz)) -		msgsz = -EFAULT; +out_unlock0: +	ipc_unlock_object(&msq->q_perm); +out_unlock1: +	rcu_read_unlock(); +	if (IS_ERR(msg)) { +		free_copy(copy); +		return PTR_ERR(msg); +	} +	bufsz = msg_handler(buf, msg, bufsz);  	free_msg(msg); -	return msgsz; +	return bufsz;  }  SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,  		long, msgtyp, int, msgflg)  { -	long err, mtype; +	return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); +} -	err =  do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg); -	if (err < 0) -		goto out; +/* + * Scale msgmni with the available lowmem size: the memory dedicated to msg + * queues should occupy at most 1/MSG_MEM_SCALE of lowmem. + * Also take into account the number of nsproxies created so far. + * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range. + */ +void recompute_msgmni(struct ipc_namespace *ns) +{ +	struct sysinfo i; +	unsigned long allowed; +	int nb_ns; -	if (put_user(mtype, &msgp->mtype)) -		err = -EFAULT; -out: -	return err; +	si_meminfo(&i); +	allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit) +		/ MSGMNB; +	nb_ns = atomic_read(&nr_ipc_ns); +	allowed /= nb_ns; + +	if (allowed < MSGMNI) { +		ns->msg_ctlmni = MSGMNI; +		return; +	} + +	if (allowed > IPCMNI / nb_ns) { +		ns->msg_ctlmni = IPCMNI / nb_ns; +		return; +	} + +	ns->msg_ctlmni = allowed; +} + +void msg_init_ns(struct ipc_namespace *ns) +{ +	ns->msg_ctlmax = MSGMAX; +	ns->msg_ctlmnb = MSGMNB; + +	recompute_msgmni(ns); + +	atomic_set(&ns->msg_bytes, 0); +	atomic_set(&ns->msg_hdrs, 0); +	ipc_init_ids(&ns->ids[IPC_MSG_IDS]); +} + +#ifdef CONFIG_IPC_NS +void msg_exit_ns(struct ipc_namespace *ns) +{ +	free_ipcs(ns, &msg_ids(ns), freeque); +	idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);  } +#endif  #ifdef CONFIG_PROC_FS  static int sysvipc_msg_proc_show(struct seq_file *s, void *it)  { +	struct user_namespace *user_ns = seq_user_ns(s);  	struct msg_queue *msq = it;  	return seq_printf(s, @@ -933,12 +1055,24 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it)  			msq->q_qnum,  			msq->q_lspid,  			msq->q_lrpid, -			msq->q_perm.uid, -			msq->q_perm.gid, -			msq->q_perm.cuid, -			msq->q_perm.cgid, +			from_kuid_munged(user_ns, msq->q_perm.uid), +			from_kgid_munged(user_ns, msq->q_perm.gid), +			from_kuid_munged(user_ns, msq->q_perm.cuid), +			from_kgid_munged(user_ns, msq->q_perm.cgid),  			msq->q_stime,  			msq->q_rtime,  			msq->q_ctime);  }  #endif + +void __init msg_init(void) +{ +	msg_init_ns(&init_ipc_ns); + +	printk(KERN_INFO "msgmni has been set to %d\n", +		init_ipc_ns.msg_ctlmni); + +	ipc_init_proc_interface("sysvipc/msg", +				"       key      msqid perms      cbytes       qnum lspid lrpid   uid   gid  cuid  cgid      stime      rtime      ctime\n", +				IPC_MSG_IDS, sysvipc_msg_proc_show); +}  | 
