From 03f595668017f1a1fb971c02fc37140bc6e7bb1c Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 4 Jan 2013 15:34:50 -0800 Subject: ipc: add sysctl to specify desired next object id Add 3 new variables and sysctls to tune them (by one "next_id" variable for messages, semaphores and shared memory respectively). This variable can be used to set desired id for next allocated IPC object. By default it's equal to -1 and old behaviour is preserved. If this variable is non-negative, then desired idr will be extracted from it and used as a start value to search for free IDR slot. Notes: 1) this patch doesn't guarantee that the new object will have desired id. So it's up to user space how to handle new object with wrong id. 2) After a sucessful id allocation attempt, "next_id" will be set back to -1 (if it was non-negative). [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Stanislav Kinsbursky Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Al Viro Cc: KOSAKI Motohiro Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index fe771978e877..ae221a7b5092 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -24,6 +24,7 @@ struct ipc_ids { unsigned short seq_max; struct rw_semaphore rw_mutex; struct idr ipcs_idr; + int next_id; }; struct ipc_namespace { -- cgit v1.2.3 From f9dd87f4738c7555aca2cdf8cb2b2326cafb0cad Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 4 Jan 2013 15:34:52 -0800 Subject: ipc: message queue receive cleanup Move all message related manipulation into one function msg_fill(). Actually, two functions because of the compat one. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Stanislav Kinsbursky Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Al Viro Cc: KOSAKI Motohiro Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/msg.h | 5 +++-- ipc/compat.c | 45 +++++++++++++++++++-------------------------- ipc/msg.c | 44 +++++++++++++++++++++++--------------------- 3 files changed, 45 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/linux/msg.h b/include/linux/msg.h index 7a4b9e97d29a..fc5743a554e6 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -34,7 +34,8 @@ struct msg_queue { /* Helper routines for sys_msgsnd and sys_msgrcv */ extern long do_msgsnd(int msqid, long mtype, void __user *mtext, size_t msgsz, int msgflg); -extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext, - size_t msgsz, long msgtyp, int msgflg); +extern long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, + int msgflg, + long (*msg_fill)(void __user *, struct msg_msg *, size_t)); #endif /* _LINUX_MSG_H */ diff --git a/ipc/compat.c b/ipc/compat.c index ad9518eb26e0..eb3ea16d2d1d 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -306,6 +306,20 @@ static long do_compat_semctl(int first, int second, int third, u32 pad) return err; } +long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) +{ + struct compat_msgbuf __user *msgp = dest; + size_t msgsz; + + if (put_user(msg->m_type, &msgp->mtype)) + return -EFAULT; + + msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; + if (store_msg(msgp->mtext, msg, msgsz)) + return -EFAULT; + return msgsz; +} + #ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC long compat_sys_semctl(int first, int second, int third, void __user *uptr) { @@ -337,10 +351,6 @@ long compat_sys_msgsnd(int first, int second, int third, void __user *uptr) long compat_sys_msgrcv(int first, int second, int msgtyp, int third, int version, void __user *uptr) { - struct compat_msgbuf __user *up; - long type; - int err; - if (first < 0) return -EINVAL; if (second < 0) @@ -348,23 +358,14 @@ long compat_sys_msgrcv(int first, int second, int msgtyp, int third, if (!version) { struct compat_ipc_kludge ipck; - err = -EINVAL; if (!uptr) - goto out; - err = -EFAULT; + return -EINVAL; if (copy_from_user (&ipck, uptr, sizeof(ipck))) - goto out; + return -EFAULT; uptr = compat_ptr(ipck.msgp); msgtyp = ipck.msgtyp; } - up = uptr; - err = do_msgrcv(first, &type, up->mtext, second, msgtyp, third); - if (err < 0) - goto out; - if (put_user(type, &up->mtype)) - err = -EFAULT; -out: - return err; + return do_msgrcv(first, uptr, second, msgtyp, third, compat_do_msg_fill); } #else long compat_sys_semctl(int semid, int semnum, int cmd, int arg) @@ -385,16 +386,8 @@ long compat_sys_msgsnd(int msqid, struct compat_msgbuf __user *msgp, long compat_sys_msgrcv(int msqid, struct compat_msgbuf __user *msgp, compat_ssize_t msgsz, long msgtyp, int msgflg) { - long err, mtype; - - err = do_msgrcv(msqid, &mtype, msgp->mtext, (ssize_t)msgsz, msgtyp, msgflg); - if (err < 0) - goto out; - - if (put_user(mtype, &msgp->mtype)) - err = -EFAULT; - out: - return err; + return do_msgrcv(msqid, msgp, (ssize_t)msgsz, msgtyp, msgflg, + compat_do_msg_fill); } #endif diff --git a/ipc/msg.c b/ipc/msg.c index 2f272fa76595..cefc24f46e3e 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -755,15 +755,30 @@ static inline int convert_mode(long *msgtyp, int msgflg) return SEARCH_EQUAL; } -long do_msgrcv(int msqid, long *pmtype, void __user *mtext, - size_t msgsz, long msgtyp, int msgflg) +static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) +{ + struct msgbuf __user *msgp = dest; + size_t msgsz; + + if (put_user(msg->m_type, &msgp->mtype)) + return -EFAULT; + + msgsz = (bufsz > msg->m_ts) ? msg->m_ts : bufsz; + if (store_msg(msgp->mtext, msg, msgsz)) + return -EFAULT; + return msgsz; +} + +long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, + int msgflg, + long (*msg_handler)(void __user *, struct msg_msg *, size_t)) { struct msg_queue *msq; struct msg_msg *msg; int mode; struct ipc_namespace *ns; - if (msqid < 0 || (long) msgsz < 0) + if (msqid < 0 || (long) bufsz < 0) return -EINVAL; mode = convert_mode(&msgtyp, msgflg); ns = current->nsproxy->ipc_ns; @@ -804,7 +819,7 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext, * Found a suitable message. * Unlink it from the queue. */ - if ((msgsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) { + if ((bufsz < msg->m_ts) && !(msgflg & MSG_NOERROR)) { msg = ERR_PTR(-E2BIG); goto out_unlock; } @@ -831,7 +846,7 @@ long do_msgrcv(int msqid, long *pmtype, void __user *mtext, if (msgflg & MSG_NOERROR) msr_d.r_maxsize = INT_MAX; else - msr_d.r_maxsize = msgsz; + msr_d.r_maxsize = bufsz; msr_d.r_msg = ERR_PTR(-EAGAIN); current->state = TASK_INTERRUPTIBLE; msg_unlock(msq); @@ -894,29 +909,16 @@ out_unlock: if (IS_ERR(msg)) return PTR_ERR(msg); - msgsz = (msgsz > msg->m_ts) ? msg->m_ts : msgsz; - *pmtype = msg->m_type; - if (store_msg(mtext, msg, msgsz)) - msgsz = -EFAULT; - + bufsz = msg_handler(buf, msg, bufsz); free_msg(msg); - return msgsz; + return bufsz; } SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, long, msgtyp, int, msgflg) { - long err, mtype; - - err = do_msgrcv(msqid, &mtype, msgp->mtext, msgsz, msgtyp, msgflg); - if (err < 0) - goto out; - - if (put_user(mtype, &msgp->mtype)) - err = -EFAULT; -out: - return err; + return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); } #ifdef CONFIG_PROC_FS -- cgit v1.2.3 From 4a674f34ba04a002244edaf891b5da7fc1473ae8 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 4 Jan 2013 15:34:55 -0800 Subject: ipc: introduce message queue copy feature This patch is required for checkpoint/restore in userspace. c/r requires some way to get all pending IPC messages without deleting them from the queue (checkpoint can fail and in this case tasks will be resumed, so queue have to be valid). To achive this, new operation flag MSG_COPY for sys_msgrcv() system call was introduced. If this flag was specified, then mtype is interpreted as number of the message to copy. If MSG_COPY is set, then kernel will allocate dummy message with passed size, and then use new copy_msg() helper function to copy desired message (instead of unlinking it from the queue). Notes: 1) Return -ENOSYS if MSG_COPY is specified, but CONFIG_CHECKPOINT_RESTORE is not set. Signed-off-by: Stanislav Kinsbursky Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Al Viro Cc: KOSAKI Motohiro Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/msg.h | 1 + ipc/msg.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++-- ipc/msgutil.c | 38 ++++++++++++++++++++++++++++ ipc/util.h | 1 + 4 files changed, 102 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/msg.h b/include/uapi/linux/msg.h index 78dbd2f996a3..22d95c6854e0 100644 --- a/include/uapi/linux/msg.h +++ b/include/uapi/linux/msg.h @@ -10,6 +10,7 @@ /* msgrcv options */ #define MSG_NOERROR 010000 /* no error if message is too big */ #define MSG_EXCEPT 020000 /* recv any msg except of specified type.*/ +#define MSG_COPY 040000 /* copy (not remove) all queue messages */ /* Obsolete, used only for backwards compatibility and libc5 compiles */ struct msqid_ds { diff --git a/ipc/msg.c b/ipc/msg.c index cefc24f46e3e..d20ffc7d3f24 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -769,6 +769,45 @@ static long do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) return msgsz; } +#ifdef CONFIG_CHECKPOINT_RESTORE +static inline struct msg_msg *fill_copy(unsigned long copy_nr, + unsigned long msg_nr, + struct msg_msg *msg, + struct msg_msg *copy) +{ + if (copy_nr == msg_nr) + return copy_msg(msg, copy); + return NULL; +} + +static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz, + int msgflg, long *msgtyp, + unsigned long *copy_number) +{ + struct msg_msg *copy; + + *copy_number = *msgtyp; + *msgtyp = 0; + /* + * Create dummy message to copy real message to. + */ + copy = load_msg(buf, bufsz); + if (!IS_ERR(copy)) + copy->m_ts = bufsz; + return copy; +} + +static inline void free_copy(int msgflg, struct msg_msg *copy) +{ + if (msgflg & MSG_COPY) + free_msg(copy); +} +#else +#define free_copy(msgflg, copy) do {} while (0) +#define prepare_copy(buf, sz, msgflg, msgtyp, copy_nr) ERR_PTR(-ENOSYS) +#define fill_copy(copy_nr, msg_nr, msg, copy) NULL +#endif + long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, long (*msg_handler)(void __user *, struct msg_msg *, size_t)) @@ -777,19 +816,29 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, struct msg_msg *msg; int mode; struct ipc_namespace *ns; + struct msg_msg *copy; + unsigned long __maybe_unused copy_number; if (msqid < 0 || (long) bufsz < 0) return -EINVAL; + if (msgflg & MSG_COPY) { + copy = prepare_copy(buf, bufsz, msgflg, &msgtyp, ©_number); + if (IS_ERR(copy)) + return PTR_ERR(copy); + } mode = convert_mode(&msgtyp, msgflg); ns = current->nsproxy->ipc_ns; msq = msg_lock_check(ns, msqid); - if (IS_ERR(msq)) + if (IS_ERR(msq)) { + free_copy(msgflg, copy); return PTR_ERR(msq); + } for (;;) { struct msg_receiver msr_d; struct list_head *tmp; + long msg_counter = 0; msg = ERR_PTR(-EACCES); if (ipcperms(ns, &msq->q_perm, S_IRUGO)) @@ -809,8 +858,15 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, if (mode == SEARCH_LESSEQUAL && walk_msg->m_type != 1) { msgtyp = walk_msg->m_type - 1; + } else if (msgflg & MSG_COPY) { + msg = fill_copy(copy_number, + msg_counter, + walk_msg, copy); + if (msg) + break; } else break; + msg_counter++; } tmp = tmp->next; } @@ -823,6 +879,8 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, msg = ERR_PTR(-E2BIG); goto out_unlock; } + if (msgflg & MSG_COPY) + goto out_unlock; list_del(&msg->m_list); msq->q_qnum--; msq->q_rtime = get_seconds(); @@ -906,8 +964,10 @@ out_unlock: break; } } - if (IS_ERR(msg)) + if (IS_ERR(msg)) { + free_copy(msgflg, copy); return PTR_ERR(msg); + } bufsz = msg_handler(buf, msg, bufsz); free_msg(msg); diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 6471f1bdae96..7eecdad40efc 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -102,7 +102,45 @@ out_err: free_msg(msg); return ERR_PTR(err); } +#ifdef CONFIG_CHECKPOINT_RESTORE +struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst) +{ + struct msg_msgseg *dst_pseg, *src_pseg; + int len = src->m_ts; + int alen; + + BUG_ON(dst == NULL); + if (src->m_ts > dst->m_ts) + return ERR_PTR(-EINVAL); + + alen = len; + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; + + dst->next = NULL; + dst->security = NULL; + memcpy(dst + 1, src + 1, alen); + + len -= alen; + dst_pseg = dst->next; + src_pseg = src->next; + while (len > 0) { + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; + memcpy(dst_pseg + 1, src_pseg + 1, alen); + dst_pseg = dst_pseg->next; + len -= alen; + src_pseg = src_pseg->next; + } + + dst->m_type = src->m_type; + dst->m_ts = src->m_ts; + + return dst; +} +#endif int store_msg(void __user *dest, struct msg_msg *msg, int len) { int alen; diff --git a/ipc/util.h b/ipc/util.h index a61e0ca2bffd..eeb79a1fbd83 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -140,6 +140,7 @@ int ipc_parse_version (int *cmd); extern void free_msg(struct msg_msg *msg); extern struct msg_msg *load_msg(const void __user *src, int len); +extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst); extern int store_msg(void __user *dest, struct msg_msg *msg, int len); extern void recompute_msgmni(struct ipc_namespace *); -- cgit v1.2.3 From 3a665531a3b7c2ad2c87903b24646be6916340e4 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Fri, 4 Jan 2013 15:34:56 -0800 Subject: selftests: IPC message queue copy feature test This test can be used to check wheither kernel supports IPC message queue copy and restore features (required by CRIU project). Signed-off-by: Stanislav Kinsbursky Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Al Viro Cc: KOSAKI Motohiro Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/msg.h | 3 +- ipc/compat.c | 3 +- tools/testing/selftests/ipc/Makefile | 25 ++++ tools/testing/selftests/ipc/msgque.c | 246 +++++++++++++++++++++++++++++++++++ 4 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/ipc/Makefile create mode 100644 tools/testing/selftests/ipc/msgque.c (limited to 'include') diff --git a/include/linux/msg.h b/include/linux/msg.h index fc5743a554e6..391af8d11cce 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -36,6 +36,7 @@ extern long do_msgsnd(int msqid, long mtype, void __user *mtext, size_t msgsz, int msgflg); extern long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgflg, - long (*msg_fill)(void __user *, struct msg_msg *, size_t)); + long (*msg_fill)(void __user *, struct msg_msg *, + size_t)); #endif /* _LINUX_MSG_H */ diff --git a/ipc/compat.c b/ipc/compat.c index eb3ea16d2d1d..2547f29dcd1b 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -365,7 +365,8 @@ long compat_sys_msgrcv(int first, int second, int msgtyp, int third, uptr = compat_ptr(ipck.msgp); msgtyp = ipck.msgtyp; } - return do_msgrcv(first, uptr, second, msgtyp, third, compat_do_msg_fill); + return do_msgrcv(first, uptr, second, msgtyp, third, + compat_do_msg_fill); } #else long compat_sys_semctl(int semid, int semnum, int cmd, int arg) diff --git a/tools/testing/selftests/ipc/Makefile b/tools/testing/selftests/ipc/Makefile new file mode 100644 index 000000000000..5386fd7c43ae --- /dev/null +++ b/tools/testing/selftests/ipc/Makefile @@ -0,0 +1,25 @@ +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) +ifeq ($(ARCH),i386) + ARCH := X86 + CFLAGS := -DCONFIG_X86_32 -D__i386__ +endif +ifeq ($(ARCH),x86_64) + ARCH := X86 + CFLAGS := -DCONFIG_X86_64 -D__x86_64__ +endif + +CFLAGS += -I../../../../usr/include/ + +all: +ifeq ($(ARCH),X86) + gcc $(CFLAGS) msgque.c -o msgque_test +else + echo "Not an x86 target, can't build msgque selftest" +endif + +run_tests: all + ./msgque_test + +clean: + rm -fr ./msgque_test diff --git a/tools/testing/selftests/ipc/msgque.c b/tools/testing/selftests/ipc/msgque.c new file mode 100644 index 000000000000..d66418237d21 --- /dev/null +++ b/tools/testing/selftests/ipc/msgque.c @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include +#include + +#define MAX_MSG_SIZE 32 + +struct msg1 { + int msize; + long mtype; + char mtext[MAX_MSG_SIZE]; +}; + +#define TEST_STRING "Test sysv5 msg" +#define MSG_TYPE 1 + +#define ANOTHER_TEST_STRING "Yet another test sysv5 msg" +#define ANOTHER_MSG_TYPE 26538 + +struct msgque_data { + key_t key; + int msq_id; + int qbytes; + int qnum; + int mode; + struct msg1 *messages; +}; + +int restore_queue(struct msgque_data *msgque) +{ + int fd, ret, id, i; + char buf[32]; + + fd = open("/proc/sys/kernel/msg_next_id", O_WRONLY); + if (fd == -1) { + printf("Failed to open /proc/sys/kernel/msg_next_id\n"); + return -errno; + } + sprintf(buf, "%d", msgque->msq_id); + + ret = write(fd, buf, strlen(buf)); + if (ret != strlen(buf)) { + printf("Failed to write to /proc/sys/kernel/msg_next_id\n"); + return -errno; + } + + id = msgget(msgque->key, msgque->mode | IPC_CREAT | IPC_EXCL); + if (id == -1) { + printf("Failed to create queue\n"); + return -errno; + } + + if (id != msgque->msq_id) { + printf("Restored queue has wrong id (%d instead of %d)\n", + id, msgque->msq_id); + ret = -EFAULT; + goto destroy; + } + + for (i = 0; i < msgque->qnum; i++) { + if (msgsnd(msgque->msq_id, &msgque->messages[i].mtype, + msgque->messages[i].msize, IPC_NOWAIT) != 0) { + printf("msgsnd failed (%m)\n"); + ret = -errno; + goto destroy; + }; + } + return 0; + +destroy: + if (msgctl(id, IPC_RMID, 0)) + printf("Failed to destroy queue: %d\n", -errno); + return ret; +} + +int check_and_destroy_queue(struct msgque_data *msgque) +{ + struct msg1 message; + int cnt = 0, ret; + + while (1) { + ret = msgrcv(msgque->msq_id, &message.mtype, MAX_MSG_SIZE, + 0, IPC_NOWAIT); + if (ret < 0) { + if (errno == ENOMSG) + break; + printf("Failed to read IPC message: %m\n"); + ret = -errno; + goto err; + } + if (ret != msgque->messages[cnt].msize) { + printf("Wrong message size: %d (expected %d)\n", ret, + msgque->messages[cnt].msize); + ret = -EINVAL; + goto err; + } + if (message.mtype != msgque->messages[cnt].mtype) { + printf("Wrong message type\n"); + ret = -EINVAL; + goto err; + } + if (memcmp(message.mtext, msgque->messages[cnt].mtext, ret)) { + printf("Wrong message content\n"); + ret = -EINVAL; + goto err; + } + cnt++; + } + + if (cnt != msgque->qnum) { + printf("Wrong message number\n"); + ret = -EINVAL; + goto err; + } + + ret = 0; +err: + if (msgctl(msgque->msq_id, IPC_RMID, 0)) { + printf("Failed to destroy queue: %d\n", -errno); + return -errno; + } + return ret; +} + +int dump_queue(struct msgque_data *msgque) +{ + struct msqid64_ds ds; + int kern_id; + int i, ret; + + for (kern_id = 0; kern_id < 256; kern_id++) { + ret = msgctl(kern_id, MSG_STAT, &ds); + if (ret < 0) { + if (errno == -EINVAL) + continue; + printf("Failed to get stats for IPC queue with id %d\n", + kern_id); + return -errno; + } + + if (ret == msgque->msq_id) + break; + } + + msgque->messages = malloc(sizeof(struct msg1) * ds.msg_qnum); + if (msgque->messages == NULL) { + printf("Failed to get stats for IPC queue\n"); + return -ENOMEM; + } + + msgque->qnum = ds.msg_qnum; + msgque->mode = ds.msg_perm.mode; + msgque->qbytes = ds.msg_qbytes; + + for (i = 0; i < msgque->qnum; i++) { + ret = msgrcv(msgque->msq_id, &msgque->messages[i].mtype, + MAX_MSG_SIZE, i, IPC_NOWAIT | MSG_COPY); + if (ret < 0) { + printf("Failed to copy IPC message: %m (%d)\n", errno); + return -errno; + } + msgque->messages[i].msize = ret; + } + return 0; +} + +int fill_msgque(struct msgque_data *msgque) +{ + struct msg1 msgbuf; + + msgbuf.mtype = MSG_TYPE; + memcpy(msgbuf.mtext, TEST_STRING, sizeof(TEST_STRING)); + if (msgsnd(msgque->msq_id, &msgbuf.mtype, sizeof(TEST_STRING), + IPC_NOWAIT) != 0) { + printf("First message send failed (%m)\n"); + return -errno; + }; + + msgbuf.mtype = ANOTHER_MSG_TYPE; + memcpy(msgbuf.mtext, ANOTHER_TEST_STRING, sizeof(ANOTHER_TEST_STRING)); + if (msgsnd(msgque->msq_id, &msgbuf.mtype, sizeof(ANOTHER_TEST_STRING), + IPC_NOWAIT) != 0) { + printf("Second message send failed (%m)\n"); + return -errno; + }; + return 0; +} + +int main(int argc, char **argv) +{ + int msg, pid, err; + struct msgque_data msgque; + + msgque.key = ftok(argv[0], 822155650); + if (msgque.key == -1) { + printf("Can't make key\n"); + return -errno; + } + + msgque.msq_id = msgget(msgque.key, IPC_CREAT | IPC_EXCL | 0666); + if (msgque.msq_id == -1) { + printf("Can't create queue\n"); + goto err_out; + } + + err = fill_msgque(&msgque); + if (err) { + printf("Failed to fill queue\n"); + goto err_destroy; + } + + err = dump_queue(&msgque); + if (err) { + printf("Failed to dump queue\n"); + goto err_destroy; + } + + err = check_and_destroy_queue(&msgque); + if (err) { + printf("Failed to check and destroy queue\n"); + goto err_out; + } + + err = restore_queue(&msgque); + if (err) { + printf("Failed to restore queue\n"); + goto err_destroy; + } + + err = check_and_destroy_queue(&msgque); + if (err) { + printf("Failed to test queue\n"); + goto err_out; + } + return 0; + +err_destroy: + if (msgctl(msgque.msq_id, IPC_RMID, 0)) { + printf("Failed to destroy queue: %d\n", -errno); + return -errno; + } +err_out: + return err; +} -- cgit v1.2.3 From a458431e176ddb27e8ef8b98c2a681b217337393 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 4 Jan 2013 15:35:08 -0800 Subject: mm: fix zone_watermark_ok_safe() accounting of isolated pages Commit 702d1a6e0766 ("memory-hotplug: fix kswapd looping forever problem") added an isolated pageblocks counter (nr_pageblock_isolate in struct zone) and used it to adjust free pages counter in zone_watermark_ok_safe() to prevent kswapd looping forever problem. Then later, commit 2139cbe627b8 ("cma: fix counting of isolated pages") fixed accounting of isolated pages in global free pages counter. It made the previous zone_watermark_ok_safe() fix unnecessary and potentially harmful (cause now isolated pages may be accounted twice making free pages counter incorrect). This patch removes the special isolated pageblocks counter altogether which fixes zone_watermark_ok_safe() free pages check. Reported-by: Tomasz Stanislawski Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kyungmin Park Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Aaditya Kumar Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 -------- mm/page_alloc.c | 27 --------------------------- mm/page_isolation.c | 26 ++------------------------ 3 files changed, 2 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4bec5be82cab..73b64a38b984 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -503,14 +503,6 @@ struct zone { * rarely used fields: */ const char *name; -#ifdef CONFIG_MEMORY_ISOLATION - /* - * the number of MIGRATE_ISOLATE *pageblock*. - * We need this for free page counting. Look at zone_watermark_ok_safe. - * It's protected by zone->lock - */ - int nr_pageblock_isolate; -#endif } ____cacheline_internodealigned_in_smp; typedef enum { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ba5e37127fc..bc6cc0e913bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -221,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -/* - * NOTE: - * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. - * Instead, use {un}set_pageblock_isolate. - */ void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -1655,20 +1650,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -#ifdef CONFIG_MEMORY_ISOLATION -static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) -{ - if (unlikely(zone->nr_pageblock_isolate)) - return zone->nr_pageblock_isolate * pageblock_nr_pages; - return 0; -} -#else -static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) -{ - return 0; -} -#endif - bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags) { @@ -1684,14 +1665,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - /* - * If the zone has MIGRATE_ISOLATE type free pages, we should consider - * it. nr_zone_isolate_freepages is never accurate so kswapd might not - * sleep although it could do so. But this is more desirable for memory - * hotplug than sleeping which can cause a livelock in the direct - * reclaim path. - */ - free_pages -= nr_zone_isolate_freepages(z); return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, free_pages); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 9d2264ea4606..383bdbb98b04 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -8,28 +8,6 @@ #include #include "internal.h" -/* called while holding zone->lock */ -static void set_pageblock_isolate(struct page *page) -{ - if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) - return; - - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - page_zone(page)->nr_pageblock_isolate++; -} - -/* called while holding zone->lock */ -static void restore_pageblock_isolate(struct page *page, int migratetype) -{ - struct zone *zone = page_zone(page); - if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) - return; - - BUG_ON(zone->nr_pageblock_isolate <= 0); - set_pageblock_migratetype(page, migratetype); - zone->nr_pageblock_isolate--; -} - int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) { struct zone *zone; @@ -80,7 +58,7 @@ out: unsigned long nr_pages; int migratetype = get_pageblock_migratetype(page); - set_pageblock_isolate(page); + set_pageblock_migratetype(page, MIGRATE_ISOLATE); nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages, migratetype); @@ -103,7 +81,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) goto out; nr_pages = move_freepages_block(zone, page, migratetype); __mod_zone_freepage_state(zone, nr_pages, migratetype); - restore_pageblock_isolate(page, migratetype); + set_pageblock_migratetype(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } -- cgit v1.2.3 From 53a59fc67f97374758e63a9c785891ec62324c81 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 4 Jan 2013 15:35:12 -0800 Subject: mm: limit mmu_gather batching to fix soft lockups on !CONFIG_PREEMPT Since commit e303297e6c3a ("mm: extended batches for generic mmu_gather") we are batching pages to be freed until either tlb_next_batch cannot allocate a new batch or we are done. This works just fine most of the time but we can get in troubles with non-preemptible kernel (CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY) on large machines where too aggressive batching might lead to soft lockups during process exit path (exit_mmap) because there are no scheduling points down the free_pages_and_swap_cache path and so the freeing can take long enough to trigger the soft lockup. The lockup is harmless except when the system is setup to panic on softlockup which is not that unusual. The simplest way to work around this issue is to limit the maximum number of batches in a single mmu_gather. 10k of collected pages should be safe to prevent from soft lockups (we would have 2ms for one) even if they are all freed without an explicit scheduling point. This patch doesn't add any new explicit scheduling points because it relies on zap_pmd_range during page tables zapping which calls cond_resched per PMD. The following lockup has been reported for 3.0 kernel with a huge process (in order of hundreds gigs but I do know any more details). BUG: soft lockup - CPU#56 stuck for 22s! [kernel:31053] Modules linked in: af_packet nfs lockd fscache auth_rpcgss nfs_acl sunrpc mptctl mptbase autofs4 binfmt_misc dm_round_robin dm_multipath bonding cpufreq_conservative cpufreq_userspace cpufreq_powersave pcc_cpufreq mperf microcode fuse loop osst sg sd_mod crc_t10dif st qla2xxx scsi_transport_fc scsi_tgt netxen_nic i7core_edac iTCO_wdt joydev e1000e serio_raw pcspkr edac_core iTCO_vendor_support acpi_power_meter rtc_cmos hpwdt hpilo button container usbhid hid dm_mirror dm_region_hash dm_log linear uhci_hcd ehci_hcd usbcore usb_common scsi_dh_emc scsi_dh_alua scsi_dh_hp_sw scsi_dh_rdac scsi_dh dm_snapshot pcnet32 mii edd dm_mod raid1 ext3 mbcache jbd fan thermal processor thermal_sys hwmon cciss scsi_mod Supported: Yes CPU 56 Pid: 31053, comm: kernel Not tainted 3.0.31-0.9-default #1 HP ProLiant DL580 G7 RIP: 0010: _raw_spin_unlock_irqrestore+0x8/0x10 RSP: 0018:ffff883ec1037af0 EFLAGS: 00000206 RAX: 0000000000000e00 RBX: ffffea01a0817e28 RCX: ffff88803ffd9e80 RDX: 0000000000000200 RSI: 0000000000000206 RDI: 0000000000000206 RBP: 0000000000000002 R08: 0000000000000001 R09: ffff887ec724a400 R10: 0000000000000000 R11: dead000000200200 R12: ffffffff8144c26e R13: 0000000000000030 R14: 0000000000000297 R15: 000000000000000e FS: 00007ed834282700(0000) GS:ffff88c03f200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 000000000068b240 CR3: 0000003ec13c5000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kernel (pid: 31053, threadinfo ffff883ec1036000, task ffff883ebd5d4100) Call Trace: release_pages+0xc5/0x260 free_pages_and_swap_cache+0x9d/0xc0 tlb_flush_mmu+0x5c/0x80 tlb_finish_mmu+0xe/0x50 exit_mmap+0xbd/0x120 mmput+0x49/0x120 exit_mm+0x122/0x160 do_exit+0x17a/0x430 do_group_exit+0x3d/0xb0 get_signal_to_deliver+0x247/0x480 do_signal+0x71/0x1b0 do_notify_resume+0x98/0xb0 int_signal+0x12/0x17 DWARF2 unwinder stuck at int_signal+0x12/0x17 Signed-off-by: Michal Hocko Cc: [3.0+] Cc: Mel Gorman Cc: Rik van Riel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 9 +++++++++ mm/memory.c | 5 +++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index ed6642ad03e0..25f01d0bc149 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -78,6 +78,14 @@ struct mmu_gather_batch { #define MAX_GATHER_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) +/* + * Limit the maximum number of mmu_gather batches to reduce a risk of soft + * lockups for non-preemptible kernels on huge machines when a lot of memory + * is zapped during unmapping. + * 10K pages freed at once should be safe even without a preemption point. + */ +#define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) + /* struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ @@ -96,6 +104,7 @@ struct mmu_gather { struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; + unsigned int batch_count; }; #define HAVE_GENERIC_MMU_GATHER diff --git a/mm/memory.c b/mm/memory.c index e0a9b0ce4f10..49fb1cf08611 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -184,10 +184,14 @@ static int tlb_next_batch(struct mmu_gather *tlb) return 1; } + if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) + return 0; + batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) return 0; + tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; @@ -216,6 +220,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; + tlb->batch_count = 0; #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; -- cgit v1.2.3