diff options
Diffstat (limited to 'ipc')
| -rw-r--r-- | ipc/msg.c | 32 | ||||
| -rw-r--r-- | ipc/sem.c | 256 | ||||
| -rw-r--r-- | ipc/shm.c | 17 | ||||
| -rw-r--r-- | ipc/util.c | 59 | ||||
| -rw-r--r-- | ipc/util.h | 10 | 
5 files changed, 250 insertions, 124 deletions
| diff --git a/ipc/msg.c b/ipc/msg.c index b0d541d42677..558aa91186b6 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -165,6 +165,15 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s)  	ipc_rmid(&msg_ids(ns), &s->q_perm);  } +static void msg_rcu_free(struct rcu_head *head) +{ +	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); +	struct msg_queue *msq = ipc_rcu_to_struct(p); + +	security_msg_queue_free(msq); +	ipc_rcu_free(head); +} +  /**   * newque - Create a new msg queue   * @ns: namespace @@ -189,15 +198,14 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params)  	msq->q_perm.security = NULL;  	retval = security_msg_queue_alloc(msq);  	if (retval) { -		ipc_rcu_putref(msq); +		ipc_rcu_putref(msq, ipc_rcu_free);  		return retval;  	}  	/* ipc_addid() locks msq upon success. */  	id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni);  	if (id < 0) { -		security_msg_queue_free(msq); -		ipc_rcu_putref(msq); +		ipc_rcu_putref(msq, msg_rcu_free);  		return id;  	} @@ -276,8 +284,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)  		free_msg(msg);  	}  	atomic_sub(msq->q_cbytes, &ns->msg_bytes); -	security_msg_queue_free(msq); -	ipc_rcu_putref(msq); +	ipc_rcu_putref(msq, msg_rcu_free);  }  /* @@ -688,6 +695,12 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,  		if (ipcperms(ns, &msq->q_perm, S_IWUGO))  			goto out_unlock0; +		/* raced with RMID? */ +		if (msq->q_perm.deleted) { +			err = -EIDRM; +			goto out_unlock0; +		} +  		err = security_msg_queue_msgsnd(msq, msg, msgflg);  		if (err)  			goto out_unlock0; @@ -717,7 +730,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext,  		rcu_read_lock();  		ipc_lock_object(&msq->q_perm); -		ipc_rcu_putref(msq); +		ipc_rcu_putref(msq, ipc_rcu_free);  		if (msq->q_perm.deleted) {  			err = -EIDRM;  			goto out_unlock0; @@ -894,6 +907,13 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl  			goto out_unlock1;  		ipc_lock_object(&msq->q_perm); + +		/* raced with RMID? */ +		if (msq->q_perm.deleted) { +			msg = ERR_PTR(-EIDRM); +			goto out_unlock0; +		} +  		msg = find_msg(msq, &msgtyp, mode);  		if (!IS_ERR(msg)) {  			/* diff --git a/ipc/sem.c b/ipc/sem.c index 69b6a21f3844..db9d241af133 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -243,71 +243,122 @@ static void merge_queues(struct sem_array *sma)  	}  } +static void sem_rcu_free(struct rcu_head *head) +{ +	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); +	struct sem_array *sma = ipc_rcu_to_struct(p); + +	security_sem_free(sma); +	ipc_rcu_free(head); +} + +/* + * Wait until all currently ongoing simple ops have completed. + * Caller must own sem_perm.lock. + * New simple ops cannot start, because simple ops first check + * that sem_perm.lock is free. + * that a) sem_perm.lock is free and b) complex_count is 0. + */ +static void sem_wait_array(struct sem_array *sma) +{ +	int i; +	struct sem *sem; + +	if (sma->complex_count)  { +		/* The thread that increased sma->complex_count waited on +		 * all sem->lock locks. Thus we don't need to wait again. +		 */ +		return; +	} + +	for (i = 0; i < sma->sem_nsems; i++) { +		sem = sma->sem_base + i; +		spin_unlock_wait(&sem->lock); +	} +} +  /*   * If the request contains only one semaphore operation, and there are   * no complex transactions pending, lock only the semaphore involved.   * Otherwise, lock the entire semaphore array, since we either have   * multiple semaphores in our own semops, or we need to look at   * semaphores from other pending complex operations. - * - * Carefully guard against sma->complex_count changing between zero - * and non-zero while we are spinning for the lock. The value of - * sma->complex_count cannot change while we are holding the lock, - * so sem_unlock should be fine. - * - * The global lock path checks that all the local locks have been released, - * checking each local lock once. This means that the local lock paths - * cannot start their critical sections while the global lock is held.   */  static inline int sem_lock(struct sem_array *sma, struct sembuf *sops,  			      int nsops)  { -	int locknum; - again: -	if (nsops == 1 && !sma->complex_count) { -		struct sem *sem = sma->sem_base + sops->sem_num; +	struct sem *sem; -		/* Lock just the semaphore we are interested in. */ -		spin_lock(&sem->lock); +	if (nsops != 1) { +		/* Complex operation - acquire a full lock */ +		ipc_lock_object(&sma->sem_perm); -		/* -		 * If sma->complex_count was set while we were spinning, -		 * we may need to look at things we did not lock here. +		/* And wait until all simple ops that are processed +		 * right now have dropped their locks.  		 */ -		if (unlikely(sma->complex_count)) { -			spin_unlock(&sem->lock); -			goto lock_array; -		} +		sem_wait_array(sma); +		return -1; +	} + +	/* +	 * Only one semaphore affected - try to optimize locking. +	 * The rules are: +	 * - optimized locking is possible if no complex operation +	 *   is either enqueued or processed right now. +	 * - The test for enqueued complex ops is simple: +	 *      sma->complex_count != 0 +	 * - Testing for complex ops that are processed right now is +	 *   a bit more difficult. Complex ops acquire the full lock +	 *   and first wait that the running simple ops have completed. +	 *   (see above) +	 *   Thus: If we own a simple lock and the global lock is free +	 *	and complex_count is now 0, then it will stay 0 and +	 *	thus just locking sem->lock is sufficient. +	 */ +	sem = sma->sem_base + sops->sem_num; +	if (sma->complex_count == 0) {  		/* -		 * Another process is holding the global lock on the -		 * sem_array; we cannot enter our critical section, -		 * but have to wait for the global lock to be released. +		 * It appears that no complex operation is around. +		 * Acquire the per-semaphore lock.  		 */ -		if (unlikely(spin_is_locked(&sma->sem_perm.lock))) { -			spin_unlock(&sem->lock); -			spin_unlock_wait(&sma->sem_perm.lock); -			goto again; +		spin_lock(&sem->lock); + +		/* Then check that the global lock is free */ +		if (!spin_is_locked(&sma->sem_perm.lock)) { +			/* spin_is_locked() is not a memory barrier */ +			smp_mb(); + +			/* Now repeat the test of complex_count: +			 * It can't change anymore until we drop sem->lock. +			 * Thus: if is now 0, then it will stay 0. +			 */ +			if (sma->complex_count == 0) { +				/* fast path successful! */ +				return sops->sem_num; +			}  		} +		spin_unlock(&sem->lock); +	} -		locknum = sops->sem_num; +	/* slow path: acquire the full lock */ +	ipc_lock_object(&sma->sem_perm); + +	if (sma->complex_count == 0) { +		/* False alarm: +		 * There is no complex operation, thus we can switch +		 * back to the fast path. +		 */ +		spin_lock(&sem->lock); +		ipc_unlock_object(&sma->sem_perm); +		return sops->sem_num;  	} else { -		int i; -		/* -		 * Lock the semaphore array, and wait for all of the -		 * individual semaphore locks to go away.  The code -		 * above ensures no new single-lock holders will enter -		 * their critical section while the array lock is held. +		/* Not a false alarm, thus complete the sequence for a +		 * full lock.  		 */ - lock_array: -		ipc_lock_object(&sma->sem_perm); -		for (i = 0; i < sma->sem_nsems; i++) { -			struct sem *sem = sma->sem_base + i; -			spin_unlock_wait(&sem->lock); -		} -		locknum = -1; +		sem_wait_array(sma); +		return -1;  	} -	return locknum;  }  static inline void sem_unlock(struct sem_array *sma, int locknum) @@ -374,12 +425,7 @@ static inline struct sem_array *sem_obtain_object_check(struct ipc_namespace *ns  static inline void sem_lock_and_putref(struct sem_array *sma)  {  	sem_lock(sma, NULL, -1); -	ipc_rcu_putref(sma); -} - -static inline void sem_putref(struct sem_array *sma) -{ -	ipc_rcu_putref(sma); +	ipc_rcu_putref(sma, ipc_rcu_free);  }  static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) @@ -458,14 +504,13 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)  	sma->sem_perm.security = NULL;  	retval = security_sem_alloc(sma);  	if (retval) { -		ipc_rcu_putref(sma); +		ipc_rcu_putref(sma, ipc_rcu_free);  		return retval;  	}  	id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni);  	if (id < 0) { -		security_sem_free(sma); -		ipc_rcu_putref(sma); +		ipc_rcu_putref(sma, sem_rcu_free);  		return id;  	}  	ns->used_sems += nsems; @@ -873,6 +918,24 @@ again:  }  /** + * set_semotime(sma, sops) - set sem_otime + * @sma: semaphore array + * @sops: operations that modified the array, may be NULL + * + * sem_otime is replicated to avoid cache line trashing. + * This function sets one instance to the current time. + */ +static void set_semotime(struct sem_array *sma, struct sembuf *sops) +{ +	if (sops == NULL) { +		sma->sem_base[0].sem_otime = get_seconds(); +	} else { +		sma->sem_base[sops[0].sem_num].sem_otime = +							get_seconds(); +	} +} + +/**   * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue   * @sma: semaphore array   * @sops: operations that were performed @@ -922,17 +985,10 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop  			}  		}  	} -	if (otime) { -		if (sops == NULL) { -			sma->sem_base[0].sem_otime = get_seconds(); -		} else { -			sma->sem_base[sops[0].sem_num].sem_otime = -								get_seconds(); -		} -	} +	if (otime) +		set_semotime(sma, sops);  } -  /* The following counts are associated to each semaphore:   *   semncnt        number of tasks waiting on semval being nonzero   *   semzcnt        number of tasks waiting on semval being zero @@ -1047,8 +1103,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)  	wake_up_sem_queue_do(&tasks);  	ns->used_sems -= sma->sem_nsems; -	security_sem_free(sma); -	ipc_rcu_putref(sma); +	ipc_rcu_putref(sma, sem_rcu_free);  }  static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, int version) @@ -1227,6 +1282,12 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,  	sem_lock(sma, NULL, -1); +	if (sma->sem_perm.deleted) { +		sem_unlock(sma, -1); +		rcu_read_unlock(); +		return -EIDRM; +	} +  	curr = &sma->sem_base[semnum];  	ipc_assert_locked_object(&sma->sem_perm); @@ -1281,28 +1342,28 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,  		int i;  		sem_lock(sma, NULL, -1); +		if (sma->sem_perm.deleted) { +			err = -EIDRM; +			goto out_unlock; +		}  		if(nsems > SEMMSL_FAST) {  			if (!ipc_rcu_getref(sma)) { -				sem_unlock(sma, -1); -				rcu_read_unlock();  				err = -EIDRM; -				goto out_free; +				goto out_unlock;  			}  			sem_unlock(sma, -1);  			rcu_read_unlock();  			sem_io = ipc_alloc(sizeof(ushort)*nsems);  			if(sem_io == NULL) { -				sem_putref(sma); +				ipc_rcu_putref(sma, ipc_rcu_free);  				return -ENOMEM;  			}  			rcu_read_lock();  			sem_lock_and_putref(sma);  			if (sma->sem_perm.deleted) { -				sem_unlock(sma, -1); -				rcu_read_unlock();  				err = -EIDRM; -				goto out_free; +				goto out_unlock;  			}  		}  		for (i = 0; i < sma->sem_nsems; i++) @@ -1320,28 +1381,28 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,  		struct sem_undo *un;  		if (!ipc_rcu_getref(sma)) { -			rcu_read_unlock(); -			return -EIDRM; +			err = -EIDRM; +			goto out_rcu_wakeup;  		}  		rcu_read_unlock();  		if(nsems > SEMMSL_FAST) {  			sem_io = ipc_alloc(sizeof(ushort)*nsems);  			if(sem_io == NULL) { -				sem_putref(sma); +				ipc_rcu_putref(sma, ipc_rcu_free);  				return -ENOMEM;  			}  		}  		if (copy_from_user (sem_io, p, nsems*sizeof(ushort))) { -			sem_putref(sma); +			ipc_rcu_putref(sma, ipc_rcu_free);  			err = -EFAULT;  			goto out_free;  		}  		for (i = 0; i < nsems; i++) {  			if (sem_io[i] > SEMVMX) { -				sem_putref(sma); +				ipc_rcu_putref(sma, ipc_rcu_free);  				err = -ERANGE;  				goto out_free;  			} @@ -1349,10 +1410,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,  		rcu_read_lock();  		sem_lock_and_putref(sma);  		if (sma->sem_perm.deleted) { -			sem_unlock(sma, -1); -			rcu_read_unlock();  			err = -EIDRM; -			goto out_free; +			goto out_unlock;  		}  		for (i = 0; i < nsems; i++) @@ -1376,6 +1435,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,  		goto out_rcu_wakeup;  	sem_lock(sma, NULL, -1); +	if (sma->sem_perm.deleted) { +		err = -EIDRM; +		goto out_unlock; +	}  	curr = &sma->sem_base[semnum];  	switch (cmd) { @@ -1629,7 +1692,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)  	/* step 2: allocate new undo structure */  	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);  	if (!new) { -		sem_putref(sma); +		ipc_rcu_putref(sma, ipc_rcu_free);  		return ERR_PTR(-ENOMEM);  	} @@ -1781,6 +1844,10 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,  	if (error)  		goto out_rcu_wakeup; +	error = -EIDRM; +	locknum = sem_lock(sma, sops, nsops); +	if (sma->sem_perm.deleted) +		goto out_unlock_free;  	/*  	 * semid identifiers are not unique - find_alloc_undo may have  	 * allocated an undo structure, it was invalidated by an RMID @@ -1788,19 +1855,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,  	 * This case can be detected checking un->semid. The existence of  	 * "un" itself is guaranteed by rcu.  	 */ -	error = -EIDRM; -	locknum = sem_lock(sma, sops, nsops);  	if (un && un->semid == -1)  		goto out_unlock_free;  	error = perform_atomic_semop(sma, sops, nsops, un,  					task_tgid_vnr(current)); -	if (error <= 0) { -		if (alter && error == 0) +	if (error == 0) { +		/* If the operation was successful, then do +		 * the required updates. +		 */ +		if (alter)  			do_smart_update(sma, sops, nsops, 1, &tasks); - -		goto out_unlock_free; +		else +			set_semotime(sma, sops);  	} +	if (error <= 0) +		goto out_unlock_free;  	/* We need to sleep on this operation, so we put the current  	 * task into the pending queue and go to sleep. @@ -1997,6 +2067,12 @@ void exit_sem(struct task_struct *tsk)  		}  		sem_lock(sma, NULL, -1); +		/* exit_sem raced with IPC_RMID, nothing to do */ +		if (sma->sem_perm.deleted) { +			sem_unlock(sma, -1); +			rcu_read_unlock(); +			continue; +		}  		un = __lookup_undo(ulp, semid);  		if (un == NULL) {  			/* exit_sem raced with IPC_RMID+semget() that created @@ -2059,6 +2135,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)  	struct sem_array *sma = it;  	time_t sem_otime; +	/* +	 * The proc interface isn't aware of sem_lock(), it calls +	 * ipc_lock_object() directly (in sysvipc_find_ipc). +	 * In order to stay compatible with sem_lock(), we must wait until +	 * all simple semop() calls have left their critical regions. +	 */ +	sem_wait_array(sma); +  	sem_otime = get_semotime(sma);  	return seq_printf(s, diff --git a/ipc/shm.c b/ipc/shm.c index 2821cdf93adb..d69739610fd4 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -167,6 +167,15 @@ static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)  	ipc_lock_object(&ipcp->shm_perm);  } +static void shm_rcu_free(struct rcu_head *head) +{ +	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); +	struct shmid_kernel *shp = ipc_rcu_to_struct(p); + +	security_shm_free(shp); +	ipc_rcu_free(head); +} +  static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)  {  	ipc_rmid(&shm_ids(ns), &s->shm_perm); @@ -208,8 +217,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)  		user_shm_unlock(file_inode(shp->shm_file)->i_size,  						shp->mlock_user);  	fput (shp->shm_file); -	security_shm_free(shp); -	ipc_rcu_putref(shp); +	ipc_rcu_putref(shp, shm_rcu_free);  }  /* @@ -497,7 +505,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)  	shp->shm_perm.security = NULL;  	error = security_shm_alloc(shp);  	if (error) { -		ipc_rcu_putref(shp); +		ipc_rcu_putref(shp, ipc_rcu_free);  		return error;  	} @@ -566,8 +574,7 @@ no_id:  		user_shm_unlock(size, shp->mlock_user);  	fput(file);  no_file: -	security_shm_free(shp); -	ipc_rcu_putref(shp); +	ipc_rcu_putref(shp, shm_rcu_free);  	return error;  } diff --git a/ipc/util.c b/ipc/util.c index e829da9ed01f..7684f41bce76 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -17,12 +17,27 @@   *            Pavel Emelianov <xemul@openvz.org>   *   * General sysv ipc locking scheme: - *  when doing ipc id lookups, take the ids->rwsem - *      rcu_read_lock() - *          obtain the ipc object (kern_ipc_perm) - *          perform security, capabilities, auditing and permission checks, etc. - *          acquire the ipc lock (kern_ipc_perm.lock) throught ipc_lock_object() - *             perform data updates (ie: SET, RMID, LOCK/UNLOCK commands) + *	rcu_read_lock() + *          obtain the ipc object (kern_ipc_perm) by looking up the id in an idr + *	    tree. + *	    - perform initial checks (capabilities, auditing and permission, + *	      etc). + *	    - perform read-only operations, such as STAT, INFO commands. + *	      acquire the ipc lock (kern_ipc_perm.lock) through + *	      ipc_lock_object() + *		- perform data updates, such as SET, RMID commands and + *		  mechanism-specific operations (semop/semtimedop, + *		  msgsnd/msgrcv, shmat/shmdt). + *	    drop the ipc lock, through ipc_unlock_object(). + *	rcu_read_unlock() + * + *  The ids->rwsem must be taken when: + *	- creating, removing and iterating the existing entries in ipc + *	  identifier sets. + *	- iterating through files under /proc/sysvipc/ + * + *  Note that sems have a special fast path that avoids kern_ipc_perm.lock - + *  see sem_lock().   */  #include <linux/mm.h> @@ -474,11 +489,6 @@ void ipc_free(void* ptr, int size)  		kfree(ptr);  } -struct ipc_rcu { -	struct rcu_head rcu; -	atomic_t refcount; -} ____cacheline_aligned_in_smp; -  /**   *	ipc_rcu_alloc	-	allocate ipc and rcu space    *	@size: size desired @@ -505,27 +515,24 @@ int ipc_rcu_getref(void *ptr)  	return atomic_inc_not_zero(&p->refcount);  } -/** - * ipc_schedule_free - free ipc + rcu space - * @head: RCU callback structure for queued work - */ -static void ipc_schedule_free(struct rcu_head *head) -{ -	vfree(container_of(head, struct ipc_rcu, rcu)); -} - -void ipc_rcu_putref(void *ptr) +void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head))  {  	struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1;  	if (!atomic_dec_and_test(&p->refcount))  		return; -	if (is_vmalloc_addr(ptr)) { -		call_rcu(&p->rcu, ipc_schedule_free); -	} else { -		kfree_rcu(p, rcu); -	} +	call_rcu(&p->rcu, func); +} + +void ipc_rcu_free(struct rcu_head *head) +{ +	struct ipc_rcu *p = container_of(head, struct ipc_rcu, rcu); + +	if (is_vmalloc_addr(p)) +		vfree(p); +	else +		kfree(p);  }  /** diff --git a/ipc/util.h b/ipc/util.h index c5f3338ba1fa..f2f5036f2eed 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -47,6 +47,13 @@ static inline void msg_exit_ns(struct ipc_namespace *ns) { }  static inline void shm_exit_ns(struct ipc_namespace *ns) { }  #endif +struct ipc_rcu { +	struct rcu_head rcu; +	atomic_t refcount; +} ____cacheline_aligned_in_smp; + +#define ipc_rcu_to_struct(p)  ((void *)(p+1)) +  /*   * Structure that holds the parameters needed by the ipc operations   * (see after) @@ -120,7 +127,8 @@ void ipc_free(void* ptr, int size);   */  void* ipc_rcu_alloc(int size);  int ipc_rcu_getref(void *ptr); -void ipc_rcu_putref(void *ptr); +void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head)); +void ipc_rcu_free(struct rcu_head *head);  struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);  struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id); | 
