From 77b92f5b13c7f025ad46dbbf3feccd85b2954feb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:39:40 -0700 Subject: [PATCH] Fix URLs in Kconfig files From: Rusty Russell From: "Petri T. Koistinen" 1) Various URLs in the Kconfig files are out of date: update them. 2) URLs should be of form <http://url/>. 3) References to files in the source should be of form <file:path/to/file> 4) Email addresses should be of form <foo@bar.com> --- kernel/power/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 033eea403f26..68bc8a8603ea 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -9,9 +9,9 @@ config PM Power Management is most important for battery powered laptop computers; if you have a laptop, check out the Linux Laptop home - page on the WWW at - <http://www.cs.utexas.edu/users/kharker/linux-laptop/> and the - Battery Powered Linux mini-HOWTO, available from + page on the WWW at <http://www.linux-on-laptops.com/> or + Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/> + and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. Note that, even if you say N here, Linux on the x86 architecture -- cgit v1.2.3 From 0eb217f9b539fccf5aafaba8c9a06e170825f68b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:40:05 -0700 Subject: [PATCH] generalise system_running From: Olof Johansson It's currently a boolean, but that means that system_running goes to zero again when shutting down. So we then use code (in the page allocator) which is only designed to be used during bootup - it is marked __init. So we need to be able to distinguish early boot state from late shutdown state. Rename system_running to system_state and give it the three appropriate states. 
--- arch/ppc/platforms/pmac_nvram.c | 8 ++++---- include/linux/kernel.h | 8 +++++++- init/main.c | 8 ++------ kernel/kmod.c | 2 +- kernel/printk.c | 3 ++- kernel/sched.c | 3 ++- kernel/sys.c | 8 ++++---- mm/page_alloc.c | 2 +- 8 files changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/ppc/platforms/pmac_nvram.c b/arch/ppc/platforms/pmac_nvram.c index f381f3f745f9..3b3f984fb929 100644 --- a/arch/ppc/platforms/pmac_nvram.c +++ b/arch/ppc/platforms/pmac_nvram.c @@ -154,11 +154,11 @@ static unsigned char __pmac pmu_nvram_read_byte(int addr) struct adb_request req; DECLARE_COMPLETION(req_complete); - req.arg = system_running ? &req_complete : NULL; + req.arg = system_state == SYSTEM_RUNNING ? &req_complete : NULL; if (pmu_request(&req, pmu_nvram_complete, 3, PMU_READ_NVRAM, (addr >> 8) & 0xff, addr & 0xff)) return 0xff; - if (system_running) + if (system_state == SYSTEM_RUNNING) wait_for_completion(&req_complete); while (!req.complete) pmu_poll(); @@ -170,11 +170,11 @@ static void __pmac pmu_nvram_write_byte(int addr, unsigned char val) struct adb_request req; DECLARE_COMPLETION(req_complete); - req.arg = system_running ? &req_complete : NULL; + req.arg = system_state == SYSTEM_RUNNING ? 
&req_complete : NULL; if (pmu_request(&req, pmu_nvram_complete, 4, PMU_WRITE_NVRAM, (addr >> 8) & 0xff, addr & 0xff, val)) return; - if (system_running) + if (system_state == SYSTEM_RUNNING) wait_for_completion(&req_complete); while (!req.complete) pmu_poll(); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e11e79199357..c1171e77c76b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -109,9 +109,15 @@ static inline void console_verbose(void) extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_on_oops; -extern int system_running; +extern int system_state; /* See values below */ extern int tainted; extern const char *print_tainted(void); + +/* Values used for system_state */ +#define SYSTEM_BOOTING 0 +#define SYSTEM_RUNNING 1 +#define SYSTEM_SHUTDOWN 2 + #define TAINT_PROPRIETARY_MODULE (1<<0) #define TAINT_FORCED_MODULE (1<<1) #define TAINT_UNSAFE_SMP (1<<2) diff --git a/init/main.c b/init/main.c index 9d1ed1de14c5..348ce7db30f3 100644 --- a/init/main.c +++ b/init/main.c @@ -94,11 +94,7 @@ extern void driver_init(void); extern void tc_init(void); #endif -/* - * Are we up and running (ie do we have all the infrastructure - * set up) - */ -int system_running; +int system_state; /* SYSTEM_BOOTING/RUNNING/SHUTDOWN */ /* * Boot command-line arguments @@ -613,7 +609,7 @@ static int init(void * unused) */ free_initmem(); unlock_kernel(); - system_running = 1; + system_state = SYSTEM_RUNNING; if (sys_open("/dev/console", O_RDWR, 0) < 0) printk("Warning: unable to open an initial console.\n"); diff --git a/kernel/kmod.c b/kernel/kmod.c index 5261de82029b..0002fcd4c554 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -249,7 +249,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait) }; DECLARE_WORK(work, __call_usermodehelper, &sub_info); - if (!system_running) + if (system_state != SYSTEM_RUNNING) return -EBUSY; if (path[0] 
== '\0') diff --git a/kernel/printk.c b/kernel/printk.c index a7be1f922f34..5f2b3c9bbd6e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -522,7 +522,8 @@ asmlinkage int printk(const char *fmt, ...) log_level_unknown = 1; } - if (!cpu_online(smp_processor_id()) && !system_running) { + if (!cpu_online(smp_processor_id()) && + system_state != SYSTEM_RUNNING) { /* * Some console drivers may assume that per-cpu resources have * been allocated. So don't allow them to be called by this diff --git a/kernel/sched.c b/kernel/sched.c index d5f21712ffbb..9e19d4c0d4a9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2982,7 +2982,8 @@ void __might_sleep(char *file, int line) #if defined(in_atomic) static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && system_running) { + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING) { if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; diff --git a/kernel/sys.c b/kernel/sys.c index 33a14e13079e..bc498b12edcc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -436,7 +436,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user switch (cmd) { case LINUX_REBOOT_CMD_RESTART: notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "Restarting system.\n"); machine_restart(NULL); @@ -452,7 +452,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_HALT: notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "System halted.\n"); machine_halt(); @@ -462,7 +462,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_POWER_OFF: notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); - system_running = 0; + 
system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); @@ -478,7 +478,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user buffer[sizeof(buffer) - 1] = '\0'; notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); machine_restart(buffer); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5d035d836c15..9764a4e78e45 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -734,7 +734,7 @@ fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int orde struct page * page; #ifdef CONFIG_NUMA - if (unlikely(!system_running)) + if (unlikely(system_state == SYSTEM_BOOTING)) return get_boot_pages(gfp_mask, order); #endif page = alloc_pages(gfp_mask, order); -- cgit v1.2.3 From b283f09cf8f51c29bf90e42e22099f76d0f33378 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:41:20 -0700 Subject: [PATCH] Fix get_wchan() FIXME wrt. order of functions From: William Lee Irwin III This addresses the issue with get_wchan() that the various functions acting as scheduling-related primitives are not, in fact, contiguous in the text segment. It creates an ELF section for scheduling primitives to be placed in, and places currently-detected (i.e. skipped during stack decoding) scheduling primitives and others like io_schedule() and down(), which are currently missed by get_wchan() code, into this section also. The net effects are more reliability of get_wchan()'s results and the new ability, made use of by this code, to arbitrarily place scheduling primitives in the source code without disturbing get_wchan()'s accuracy. Suggestions by Arnd Bergmann and Matthew Wilcox regarding reducing the invasiveness of the patch were incorporated during prior rounds of review. I've at least tried to sweep all arches in this patch. 
--- arch/alpha/kernel/process.c | 2 -- arch/alpha/kernel/semaphore.c | 9 ++++---- arch/alpha/kernel/vmlinux.lds.S | 1 + arch/arm/kernel/process.c | 2 -- arch/arm/kernel/semaphore.c | 8 ++++--- arch/arm/kernel/vmlinux.lds.S | 1 + arch/arm26/kernel/process.c | 2 -- arch/arm26/kernel/semaphore.c | 8 ++++--- arch/arm26/kernel/vmlinux-arm26-xip.lds.in | 1 + arch/arm26/kernel/vmlinux-arm26.lds.in | 1 + arch/cris/arch-v10/kernel/process.c | 3 +-- arch/cris/arch-v10/vmlinux.lds.S | 1 + arch/cris/kernel/semaphore.c | 5 ++-- arch/h8300/kernel/process.c | 3 --- arch/h8300/kernel/semaphore.c | 5 ++-- arch/h8300/kernel/vmlinux.lds.S | 1 + arch/i386/kernel/process.c | 2 -- arch/i386/kernel/semaphore.c | 17 +++++++------- arch/i386/kernel/vmlinux.lds.S | 1 + arch/ia64/kernel/process.c | 2 -- arch/ia64/kernel/semaphore.c | 7 +++--- arch/ia64/kernel/vmlinux.lds.S | 1 + arch/m68k/kernel/process.c | 5 ---- arch/m68k/kernel/semaphore.c | 5 ++-- arch/m68k/kernel/vmlinux-std.lds | 1 + arch/m68k/kernel/vmlinux-sun3.lds | 1 + arch/m68knommu/kernel/process.c | 5 ---- arch/m68knommu/kernel/semaphore.c | 5 ++-- arch/m68knommu/kernel/vmlinux.lds.S | 1 + arch/mips/kernel/process.c | 2 -- arch/mips/kernel/semaphore.c | 5 ++-- arch/mips/kernel/vmlinux.lds.S | 1 + arch/parisc/kernel/semaphore.c | 5 ++-- arch/parisc/kernel/vmlinux.lds.S | 1 + arch/ppc/kernel/process.c | 2 -- arch/ppc/kernel/semaphore.c | 5 ++-- arch/ppc/kernel/vmlinux.lds.S | 1 + arch/ppc64/kernel/process.c | 2 -- arch/ppc64/kernel/semaphore.c | 5 ++-- arch/ppc64/kernel/vmlinux.lds.S | 1 + arch/s390/kernel/process.c | 2 -- arch/s390/kernel/semaphore.c | 5 ++-- arch/s390/kernel/vmlinux.lds.S | 1 + arch/sh/kernel/process.c | 4 +--- arch/sh/kernel/semaphore.c | 5 ++-- arch/sh/kernel/vmlinux.lds.S | 1 + arch/sparc/kernel/process.c | 4 +--- arch/sparc/kernel/semaphore.c | 5 ++-- arch/sparc/kernel/vmlinux.lds.S | 1 + arch/sparc/lib/rwsem.S | 3 ++- arch/sparc64/kernel/process.c | 4 +--- arch/sparc64/kernel/semaphore.c | 9 ++++---- 
arch/sparc64/kernel/vmlinux.lds.S | 1 + arch/sparc64/lib/rwsem.c | 5 ++-- arch/v850/kernel/process.c | 3 --- arch/v850/kernel/semaphore.c | 5 ++-- arch/v850/kernel/vmlinux.lds.S | 1 + arch/x86_64/kernel/process.c | 2 -- arch/x86_64/kernel/semaphore.c | 5 ++-- arch/x86_64/kernel/vmlinux.lds.S | 1 + arch/x86_64/lib/thunk.S | 3 ++- include/asm-generic/vmlinux.lds.h | 5 ++++ include/linux/init.h | 2 ++ include/linux/sched.h | 2 ++ kernel/sched.c | 37 ++++++++++++++++-------------- kernel/timer.c | 4 ++-- lib/rwsem.c | 5 ++-- 67 files changed, 137 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index e427bae12ffe..297e4b48bfe2 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -513,8 +513,6 @@ thread_saved_pc(task_t *t) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/alpha/kernel/semaphore.c b/arch/alpha/kernel/semaphore.c index b52a0df303fe..4d60a0ccd6f7 100644 --- a/arch/alpha/kernel/semaphore.c +++ b/arch/alpha/kernel/semaphore.c @@ -7,6 +7,7 @@ #include #include +#include /* * This is basically the PPC semaphore scheme ported to use @@ -60,7 +61,7 @@ static inline int __sem_update_count(struct semaphore *sem, int incr) * Either form may be used in conjunction with "up()". 
*/ -void +void __sched __down_failed(struct semaphore *sem) { struct task_struct *tsk = current; @@ -101,7 +102,7 @@ __down_failed(struct semaphore *sem) #endif } -int +int __sched __down_failed_interruptible(struct semaphore *sem) { struct task_struct *tsk = current; @@ -159,7 +160,7 @@ __up_wakeup(struct semaphore *sem) wake_up(&sem->wait); } -void +void __sched down(struct semaphore *sem) { #if WAITQUEUE_DEBUG @@ -173,7 +174,7 @@ down(struct semaphore *sem) __down(sem); } -int +int __sched down_interruptible(struct semaphore *sem) { #if WAITQUEUE_DEBUG diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 7afd00d5d46b..d159b8f0d022 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -17,6 +17,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } :kernel diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 863c4076daad..8423921e821a 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -414,8 +414,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/arm/kernel/semaphore.c b/arch/arm/kernel/semaphore.c index a50902e8bec7..da39eb3dca31 100644 --- a/arch/arm/kernel/semaphore.c +++ b/arch/arm/kernel/semaphore.c @@ -13,6 +13,7 @@ */ #include #include +#include #include @@ -54,7 +55,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -87,7 +88,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -176,7 +177,8 @@ int __down_trylock(struct semaphore * sem) * registers (r0 to r3 and lr), but not ip, as we use it as a return * value in some cases.. */ -asm(" .align 5 \n\ +asm(" .section .sched.text \n\ + .align 5 \n\ .globl __down_failed \n\ __down_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 56af3401b34d..a5db0ddca6a4 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -73,6 +73,7 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) *(.rodata) diff --git a/arch/arm26/kernel/process.c b/arch/arm26/kernel/process.c index 09a2f52ad8a8..ce23571617a1 100644 --- a/arch/arm26/kernel/process.c +++ b/arch/arm26/kernel/process.c @@ -400,8 +400,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/arm26/kernel/semaphore.c b/arch/arm26/kernel/semaphore.c index e7964ce1d0d9..60591a738592 100644 --- a/arch/arm26/kernel/semaphore.c +++ b/arch/arm26/kernel/semaphore.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -56,7 +57,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +90,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -178,7 +179,8 @@ int __down_trylock(struct semaphore * sem) * registers (r0 to r3 and lr), but not ip, as we use it as a return * value in some cases.. 
*/ -asm(" .align 5 \n\ +asm(" .section .sched.text \n\ + .align 5 \n\ .globl __down_failed \n\ __down_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ diff --git a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in index 602a77c022d7..61eedf0bc42f 100644 --- a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in +++ b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in @@ -66,6 +66,7 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) *(.rodata) diff --git a/arch/arm26/kernel/vmlinux-arm26.lds.in b/arch/arm26/kernel/vmlinux-arm26.lds.in index 8782fe36f0a8..2393f3805a49 100644 --- a/arch/arm26/kernel/vmlinux-arm26.lds.in +++ b/arch/arm26/kernel/vmlinux-arm26.lds.in @@ -67,6 +67,7 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) *(.rodata) diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 62e3a4fbf33a..c785b54e6cbd 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_ETRAX_GPIO void etrax_gpio_wake_up_check(void); /* drivers/gpio.c */ @@ -216,8 +217,6 @@ asmlinkage int sys_execve(const char *fname, char **argv, char **envp, * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/cris/arch-v10/vmlinux.lds.S b/arch/cris/arch-v10/vmlinux.lds.S index b2c27e147f29..6b73a2c0dad8 100644 --- a/arch/cris/arch-v10/vmlinux.lds.S +++ b/arch/cris/arch-v10/vmlinux.lds.S @@ -25,6 +25,7 @@ SECTIONS __stext = .; .text : { *(.text) + SCHED_TEXT *(.fixup) *(.text.__*) } diff --git a/arch/cris/kernel/semaphore.c b/arch/cris/kernel/semaphore.c index d62b355e1706..b884263d3cd4 100644 --- a/arch/cris/kernel/semaphore.c +++ b/arch/cris/kernel/semaphore.c @@ -4,6 +4,7 @@ */ #include +#include #include /* @@ -94,7 +95,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -104,7 +105,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int ret = 0; DOWN_VAR diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index bd6ccd542399..8640ea20dba0 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -264,8 +264,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -289,7 +287,6 @@ unsigned long get_wchan(struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
*/ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; diff --git a/arch/h8300/kernel/semaphore.c b/arch/h8300/kernel/semaphore.c index 690efce1e437..1ebb79baaa8c 100644 --- a/arch/h8300/kernel/semaphore.c +++ b/arch/h8300/kernel/semaphore.c @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -95,7 +96,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -106,7 +107,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index 60787f07eb2b..3a643954a8fe 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -82,6 +82,7 @@ SECTIONS #endif __stext = . ; *(.text) + SCHED_TEXT . = ALIGN(0x4) ; *(.exit.text) *(.text.*) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 3495f1aedf67..7fed9d3823ed 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -632,8 +632,6 @@ out: /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) #define top_esp (THREAD_SIZE - sizeof(unsigned long)) diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c index 5acd544f0cbd..073912cfcf44 100644 --- a/arch/i386/kernel/semaphore.c +++ b/arch/i386/kernel/semaphore.c @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -53,7 +54,7 @@ asmlinkage void __up(struct semaphore *sem) wake_up(&sem->wait); } -asmlinkage void __down(struct semaphore * sem) +asmlinkage void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -90,7 +91,7 @@ asmlinkage void __down(struct semaphore * sem) tsk->state = TASK_RUNNING; } -asmlinkage int __down_interruptible(struct semaphore * sem) +asmlinkage int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -187,7 +188,7 @@ asmlinkage int __down_trylock(struct semaphore * sem) * value.. 
*/ asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed\n" "__down_failed:\n\t" @@ -210,7 +211,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed_interruptible\n" "__down_failed_interruptible:\n\t" @@ -231,7 +232,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed_trylock\n" "__down_failed_trylock:\n\t" @@ -252,7 +253,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __up_wakeup\n" "__up_wakeup:\n\t" @@ -271,7 +272,7 @@ asm( */ #if defined(CONFIG_SMP) asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __write_lock_failed\n" "__write_lock_failed:\n\t" @@ -285,7 +286,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __read_lock_failed\n" "__read_lock_failed:\n\t" diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 3623d7e2934a..0253c586547b 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -16,6 +16,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x9090 diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index a1d09d5c91c4..0d245cbcd1f6 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -660,8 +660,6 @@ get_wchan (struct task_struct *p) /* * These bracket the sleeping functions.. */ - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); # define first_sched ((unsigned long) scheduling_functions_start_here) # define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c index f3926a3c4d73..2724ef3fbae2 100644 --- a/arch/ia64/kernel/semaphore.c +++ b/arch/ia64/kernel/semaphore.c @@ -24,6 +24,7 @@ * where we want to avoid any extra jumps and calls. 
*/ #include +#include #include #include @@ -44,8 +45,7 @@ __up (struct semaphore *sem) wake_up(&sem->wait); } -void -__down (struct semaphore *sem) +void __sched __down (struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,8 +82,7 @@ __down (struct semaphore *sem) tsk->state = TASK_RUNNING; } -int -__down_interruptible (struct semaphore * sem) +int __sched __down_interruptible (struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index e5589e49d9da..5c45718a9c82 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -41,6 +41,7 @@ SECTIONS { *(.text.ivt) *(.text) + SCHED_TEXT *(.gnu.linkonce.t*) } .text2 : AT(ADDR(.text2) - LOAD_OFFSET) diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 8d72a5c5b0c7..fc2c753c332b 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -65,8 +65,6 @@ asmlinkage void ret_from_fork(void); */ unsigned long thread_saved_pc(struct task_struct *tsk) { - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; /* Check whether the thread is blocked in resume() */ if (sw->retpc > (unsigned long)scheduling_functions_start_here && @@ -387,8 +385,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -407,7 +403,6 @@ unsigned long get_wchan(struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
*/ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; diff --git a/arch/m68k/kernel/semaphore.c b/arch/m68k/kernel/semaphore.c index 690efce1e437..1ebb79baaa8c 100644 --- a/arch/m68k/kernel/semaphore.c +++ b/arch/m68k/kernel/semaphore.c @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -95,7 +96,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -106,7 +107,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index bd41fc992169..6dc62684c7b9 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -12,6 +12,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x4e75 diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 2e81cde14987..f293e567192c 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -13,6 +13,7 @@ SECTIONS .text : { *(.head) *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x4e75 diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index c8b87371641a..896d596a1bd8 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -406,8 +406,6 @@ out: /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -426,7 +424,6 @@ unsigned long get_wchan(struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. */ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; @@ -439,8 +436,6 @@ unsigned long get_wchan(struct task_struct *p) */ unsigned long thread_saved_pc(struct task_struct *tsk) { - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; /* Check whether the thread is blocked in resume() */ diff --git a/arch/m68knommu/kernel/semaphore.c b/arch/m68knommu/kernel/semaphore.c index 33d704fcf883..c083f4772add 100644 --- a/arch/m68knommu/kernel/semaphore.c +++ b/arch/m68knommu/kernel/semaphore.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -96,7 +97,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -107,7 +108,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index 1ab8a31ef964..a362870b6e4e 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -191,6 +191,7 @@ SECTIONS { .text : { _stext = . ; *(.text) + SCHED_TEXT *(.text.lock) . 
= ALIGN(16); /* Exception table */ diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index f8ba26770bf4..f4ab9c66b27f 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -283,8 +283,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/mips/kernel/semaphore.c b/arch/mips/kernel/semaphore.c index 11b937f20604..51c3e772c029 100644 --- a/arch/mips/kernel/semaphore.c +++ b/arch/mips/kernel/semaphore.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifdef CONFIG_CPU_HAS_LLDSCD @@ -104,7 +105,7 @@ static inline int waking_non_zero(struct semaphore *sem) * Either form may be used in conjunction with "up()". */ -void __down_failed(struct semaphore * sem) +void __sched __down_failed(struct semaphore * sem) { struct task_struct *tsk = current; wait_queue_t wait; @@ -227,7 +228,7 @@ static inline int waking_non_zero_interruptible(struct semaphore *sem, #endif /* !CONFIG_CPU_HAS_LLDSCD */ -int __down_failed_interruptible(struct semaphore * sem) +int __sched __down_failed_interruptible(struct semaphore * sem) { struct task_struct *tsk = current; wait_queue_t wait; diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index b72639f8db65..098cfaa23c0e 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -28,6 +28,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } =0 diff --git a/arch/parisc/kernel/semaphore.c b/arch/parisc/kernel/semaphore.c index ffb4851451fc..ee806bcc3726 100644 --- a/arch/parisc/kernel/semaphore.c +++ b/arch/parisc/kernel/semaphore.c @@ -5,6 +5,7 @@ #include #include #include 
+#include /* * Semaphores are complex as we wish to avoid using two variables. @@ -58,7 +59,7 @@ void __up(struct semaphore *sem) sem->count += (sem->count < 0) ? 1 : - 1; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DOWN_HEAD @@ -74,7 +75,7 @@ void __down(struct semaphore * sem) UPDATE_COUNT } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DOWN_HEAD diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 14d0882a19d2..e5d5aeef96e5 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -50,6 +50,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text ALIGN(16) : { *(.text*) + SCHED_TEXT *(.PARISC.unwind) *(.fixup) *(.lock.text) /* out-of-line lock text */ diff --git a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c index ada32baeda19..3363a030e00f 100644 --- a/arch/ppc/kernel/process.c +++ b/arch/ppc/kernel/process.c @@ -661,8 +661,6 @@ void __init ll_puts(const char *s) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/ppc/kernel/semaphore.c b/arch/ppc/kernel/semaphore.c index 7bf51fba5c14..2fe429b27c14 100644 --- a/arch/ppc/kernel/semaphore.c +++ b/arch/ppc/kernel/semaphore.c @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -69,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __down(struct semaphore *sem) +void __sched __down(struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -99,7 +100,7 @@ void __down(struct semaphore *sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S index 81b95d449a22..b710d55c5b08 100644 --- a/arch/ppc/kernel/vmlinux.lds.S +++ b/arch/ppc/kernel/vmlinux.lds.S @@ -31,6 +31,7 @@ SECTIONS .text : { *(.text) + SCHED_TEXT *(.fixup) *(.got1) __got2_start = .; diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c index cec7225a6ac1..f74b14d7e58e 100644 --- a/arch/ppc64/kernel/process.c +++ b/arch/ppc64/kernel/process.c @@ -475,8 +475,6 @@ static inline int validate_sp(unsigned long sp, struct task_struct *p) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched (*(unsigned long *)scheduling_functions_start_here) #define last_sched (*(unsigned long *)scheduling_functions_end_here) diff --git a/arch/ppc64/kernel/semaphore.c b/arch/ppc64/kernel/semaphore.c index c977029e2465..d723632d59f3 100644 --- a/arch/ppc64/kernel/semaphore.c +++ b/arch/ppc64/kernel/semaphore.c @@ -17,6 +17,7 @@ */ #include +#include #include #include #include @@ -70,7 +71,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __down(struct semaphore *sem) +void __sched __down(struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -99,7 +100,7 @@ void __down(struct semaphore *sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/ppc64/kernel/vmlinux.lds.S b/arch/ppc64/kernel/vmlinux.lds.S index a8531b1f9ef2..1d9b61143aaa 100644 --- a/arch/ppc64/kernel/vmlinux.lds.S +++ b/arch/ppc64/kernel/vmlinux.lds.S @@ -13,6 +13,7 @@ SECTIONS /* Read-only sections, merged into text segment: */ .text : { *(.text .text.*) + SCHED_TEXT *(.fixup) . = ALIGN(4096); _etext = .; diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 3676307d1d8a..050585ab5d2a 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -384,8 +384,6 @@ void dump_thread(struct pt_regs * regs, struct user * dump) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/s390/kernel/semaphore.c b/arch/s390/kernel/semaphore.c index 8203f5e0228d..8dfb690c159f 100644 --- a/arch/s390/kernel/semaphore.c +++ b/arch/s390/kernel/semaphore.c @@ -11,6 +11,7 @@ */ #include #include +#include #include @@ -60,7 +61,7 @@ void __up(struct semaphore *sem) * count > 0: decrement count, wake up queue and exit. * count <= 0: set count to -1, go to sleep. */ -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,7 +83,7 @@ void __down(struct semaphore * sem) * count > 0: wake up queue and exit. * count <= 0: set count to 0, wake up queue and exit. 
*/ -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index c9ca7a8e93b3..b4534b2867c3 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -23,6 +23,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x0700 diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index 773006661b50..7d45ea0acd09 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -464,8 +464,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -481,7 +479,7 @@ unsigned long get_wchan(struct task_struct *p) * The same comment as on the Alpha applies here, too ... 
*/ pc = thread_saved_pc(p); - if (pc >= (unsigned long) interruptible_sleep_on && pc < (unsigned long) add_timer) { + if (pc >= first_sched && pc < last_sched) { schedule_frame = ((unsigned long *)(long)p->thread.sp)[1]; return (unsigned long)((unsigned long *)schedule_frame)[1]; } diff --git a/arch/sh/kernel/semaphore.c b/arch/sh/kernel/semaphore.c index 0943ad666a67..a3c24dcbf01d 100644 --- a/arch/sh/kernel/semaphore.c +++ b/arch/sh/kernel/semaphore.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -103,7 +104,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -113,7 +114,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int ret = 0; DOWN_VAR diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 2cc86534c130..da0f5d728b3e 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -22,6 +22,7 @@ SECTIONS } = 0 .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x0009 diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index beae70a970e4..70261b211997 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -694,9 +695,6 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) return retval; } -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); - unsigned long get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; diff --git a/arch/sparc/kernel/semaphore.c b/arch/sparc/kernel/semaphore.c index 5a8f3d176a8f..77e63b92ca30 100644 --- a/arch/sparc/kernel/semaphore.c +++ 
b/arch/sparc/kernel/semaphore.c @@ -4,6 +4,7 @@ #include #include +#include #include @@ -45,7 +46,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -78,7 +79,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index 0862360d865d..8d4bbfaf304c 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -12,6 +12,7 @@ SECTIONS .text 0xf0004000 : { *(.text) + SCHED_TEXT *(.gnu.warning) } =0 _etext = .; diff --git a/arch/sparc/lib/rwsem.S b/arch/sparc/lib/rwsem.S index 98b757cb67c6..e7578dc600b8 100644 --- a/arch/sparc/lib/rwsem.S +++ b/arch/sparc/lib/rwsem.S @@ -8,7 +8,7 @@ #include #include - .text + .section .sched.text .align 4 .globl ___down_read @@ -113,6 +113,7 @@ ___down_write: ba 2b restore %l5, %g0, %g5 + .text .globl ___up_read ___up_read: rd %psr, %g3 diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index 1be2b97e4672..0caf962e8155 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -823,9 +824,6 @@ out: return error; } -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); - unsigned long get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c index a9e66d666ceb..9ddfcb9a1900 100644 --- a/arch/sparc64/kernel/semaphore.c +++ b/arch/sparc64/kernel/semaphore.c @@ -8,6 +8,7 @@ #include #include +#include /* * Atomically update 
sem->count. @@ -90,7 +91,7 @@ void up(struct semaphore *sem) : "g5", "g7", "memory", "cc"); } -static void __down(struct semaphore * sem) +static void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -108,7 +109,7 @@ static void __down(struct semaphore * sem) wake_up(&sem->wait); } -void down(struct semaphore *sem) +void __sched down(struct semaphore *sem) { might_sleep(); /* This atomically does: @@ -192,7 +193,7 @@ int down_trylock(struct semaphore *sem) return ret; } -static int __down_interruptible(struct semaphore * sem) +static int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -216,7 +217,7 @@ static int __down_interruptible(struct semaphore * sem) return retval; } -int down_interruptible(struct semaphore *sem) +int __sched down_interruptible(struct semaphore *sem) { int ret = 0; diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S index ad95e88a3cbc..8faeee09fab2 100644 --- a/arch/sparc64/kernel/vmlinux.lds.S +++ b/arch/sparc64/kernel/vmlinux.lds.S @@ -15,6 +15,7 @@ SECTIONS .text 0x0000000000404000 : { *(.text) + SCHED_TEXT *(.gnu.warning) } =0 _etext = .; diff --git a/arch/sparc64/lib/rwsem.c b/arch/sparc64/lib/rwsem.c index 8e1dfdda91fa..e19968dbc2d1 100644 --- a/arch/sparc64/lib/rwsem.c +++ b/arch/sparc64/lib/rwsem.c @@ -6,6 +6,7 @@ #include #include +#include #include extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem)); @@ -13,7 +14,7 @@ extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *)); extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *)); -void __down_read(struct rw_semaphore *sem) +void __sched __down_read(struct rw_semaphore *sem) { __asm__ __volatile__( "! 
beginning __down_read\n" @@ -72,7 +73,7 @@ int __down_read_trylock(struct rw_semaphore *sem) } EXPORT_SYMBOL(__down_read_trylock); -void __down_write(struct rw_semaphore *sem) +void __sched __down_write(struct rw_semaphore *sem) { __asm__ __volatile__( "! beginning __down_write\n\t" diff --git a/arch/v850/kernel/process.c b/arch/v850/kernel/process.c index 5c29ae51a303..977d75772d81 100644 --- a/arch/v850/kernel/process.c +++ b/arch/v850/kernel/process.c @@ -203,8 +203,6 @@ int sys_execve (char *name, char **argv, char **envp, struct pt_regs *regs) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here (void); -extern void scheduling_functions_end_here (void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -228,7 +226,6 @@ unsigned long get_wchan (struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
*/ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; diff --git a/arch/v850/kernel/semaphore.c b/arch/v850/kernel/semaphore.c index b78d714384db..2d20886863d8 100644 --- a/arch/v850/kernel/semaphore.c +++ b/arch/v850/kernel/semaphore.c @@ -15,6 +15,7 @@ #include #include +#include #include @@ -56,7 +57,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +90,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S index 028c224fa66a..07ab0f292d1c 100644 --- a/arch/v850/kernel/vmlinux.lds.S +++ b/arch/v850/kernel/vmlinux.lds.S @@ -64,6 +64,7 @@ #define TEXT_CONTENTS \ __stext = . ; \ *(.text) \ + SCHED_TEXT *(.exit.text) /* 2.5 convention */ \ *(.text.exit) /* 2.4 convention */ \ *(.text.lock) \ diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 7b2414765ca3..d1d9471581a8 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -576,8 +576,6 @@ asmlinkage long sys_vfork(struct pt_regs regs) /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c index 5e517814dd07..2bcd4a7ec38d 100644 --- a/arch/x86_64/kernel/semaphore.c +++ b/arch/x86_64/kernel/semaphore.c @@ -14,6 +14,7 @@ */ #include #include +#include #include #include @@ -54,7 +55,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -91,7 +92,7 @@ void __down(struct semaphore * sem) tsk->state = TASK_RUNNING; } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 7b9e1beb360e..c612e4d213a1 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -15,6 +15,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x9090 diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index 876cb937f9f1..acc1e2ca7ed7 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -35,6 +35,7 @@ .endm + .section .sched.text #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed @@ -65,7 +66,7 @@ restore_norax: #ifdef CONFIG_SMP /* Support for read/write spinlocks. 
*/ - + .text /* rax: pointer to rwlock_t */ ENTRY(__write_lock_failed) lock diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 59c2b950e8b8..a4b6c768cf49 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -51,3 +51,8 @@ *(.security_initcall.init) \ __security_initcall_end = .; \ } + +#define SCHED_TEXT \ + __scheduling_functions_start_here = .; \ + *(.sched.text) \ + __scheduling_functions_end_here = .; diff --git a/include/linux/init.h b/include/linux/init.h index 45069e275b3d..c6842477243c 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -46,6 +46,8 @@ #define __exitdata __attribute__ ((__section__(".exit.data"))) #define __exit_call __attribute_used__ __attribute__ ((__section__ (".exitcall.exit"))) +#define __sched __attribute__((__section__(".sched.text"))) + #ifdef MODULE #define __exit __attribute__ ((__section__(".exit.text"))) #else diff --git a/include/linux/sched.h b/include/linux/sched.h index f5fa0c07a7f8..054b3c0d5962 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -170,6 +170,8 @@ extern void update_one_process(struct task_struct *p, unsigned long user, unsigned long system, int cpu); extern void scheduler_tick(int user_tick, int system); extern unsigned long cache_decay_ticks; +extern const unsigned long scheduling_functions_start_here; +extern const unsigned long scheduling_functions_end_here; #define MAX_SCHEDULE_TIMEOUT LONG_MAX diff --git a/kernel/sched.c b/kernel/sched.c index 9e19d4c0d4a9..b42029abe679 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -225,6 +225,13 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +extern unsigned long __scheduling_functions_start_here; +extern unsigned long __scheduling_functions_end_here; +const unsigned long scheduling_functions_start_here = + (unsigned long)&__scheduling_functions_start_here; +const unsigned 
long scheduling_functions_end_here = + (unsigned long)&__scheduling_functions_end_here; + /* * Default context-switch locking: */ @@ -1587,12 +1594,10 @@ out: rebalance_tick(rq, 0); } -void scheduling_functions_start_here(void) { } - /* * schedule() is the main scheduler function. */ -asmlinkage void schedule(void) +asmlinkage void __sched schedule(void) { long *switch_count; task_t *prev, *next; @@ -1731,7 +1736,7 @@ EXPORT_SYMBOL(schedule); * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void preempt_schedule(void) +asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); @@ -1869,7 +1874,7 @@ void fastcall complete_all(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } -void fastcall wait_for_completion(struct completion *x) +void fastcall __sched wait_for_completion(struct completion *x) { might_sleep(); spin_lock_irq(&x->wait.lock); @@ -1907,7 +1912,7 @@ EXPORT_SYMBOL(wait_for_completion); __remove_wait_queue(q, &wait); \ spin_unlock_irqrestore(&q->lock, flags); -void fastcall interruptible_sleep_on(wait_queue_head_t *q) +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) { SLEEP_ON_VAR @@ -1920,7 +1925,7 @@ void fastcall interruptible_sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -1935,7 +1940,7 @@ long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) EXPORT_SYMBOL(interruptible_sleep_on_timeout); -void fastcall sleep_on(wait_queue_head_t *q) +void fastcall __sched sleep_on(wait_queue_head_t *q) { SLEEP_ON_VAR @@ -1948,7 +1953,7 @@ void fastcall sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(sleep_on); -long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout) 
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -1963,8 +1968,6 @@ long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout) EXPORT_SYMBOL(sleep_on_timeout); -void scheduling_functions_end_here(void) { } - void set_user_nice(task_t *p, long nice) { unsigned long flags; @@ -2424,7 +2427,7 @@ asmlinkage long sys_sched_yield(void) return 0; } -void __cond_resched(void) +void __sched __cond_resched(void) { set_current_state(TASK_RUNNING); schedule(); @@ -2438,7 +2441,7 @@ EXPORT_SYMBOL(__cond_resched); * this is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ -void yield(void) +void __sched yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); @@ -2453,7 +2456,7 @@ EXPORT_SYMBOL(yield); * But don't do that if it is a deliberate, throttling IO wait (this task * has set its backing_dev_info: the queue against which it should throttle) */ -void io_schedule(void) +void __sched io_schedule(void) { struct runqueue *rq = this_rq(); @@ -2464,7 +2467,7 @@ void io_schedule(void) EXPORT_SYMBOL(io_schedule); -long io_schedule_timeout(long timeout) +long __sched io_schedule_timeout(long timeout) { struct runqueue *rq = this_rq(); long ret; @@ -3010,7 +3013,7 @@ EXPORT_SYMBOL(__might_sleep); * * Called inside preempt_disable(). 
*/ -void __preempt_spin_lock(spinlock_t *lock) +void __sched __preempt_spin_lock(spinlock_t *lock) { if (preempt_count() > 1) { _raw_spin_lock(lock); @@ -3026,7 +3029,7 @@ void __preempt_spin_lock(spinlock_t *lock) EXPORT_SYMBOL(__preempt_spin_lock); -void __preempt_write_lock(rwlock_t *lock) +void __sched __preempt_write_lock(rwlock_t *lock) { if (preempt_count() > 1) { _raw_write_lock(lock); diff --git a/kernel/timer.c b/kernel/timer.c index f53e0749b0d2..cbcb5522866d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -996,7 +996,7 @@ static void process_timeout(unsigned long __data) * * In all cases the return value is guaranteed to be non-negative. */ -fastcall signed long schedule_timeout(signed long timeout) +fastcall signed long __sched schedule_timeout(signed long timeout) { struct timer_list timer; unsigned long expire; @@ -1056,7 +1056,7 @@ asmlinkage long sys_gettid(void) return current->pid; } -static long nanosleep_restart(struct restart_block *restart) +static long __sched nanosleep_restart(struct restart_block *restart) { unsigned long expire = restart->arg0, now = jiffies; struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; diff --git a/lib/rwsem.c b/lib/rwsem.c index 95469d7fb796..85dcae7e9337 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -5,6 +5,7 @@ */ #include #include +#include #include struct rwsem_waiter { @@ -162,7 +163,7 @@ static inline struct rw_semaphore *rwsem_down_failed_common(struct rw_semaphore /* * wait for the read lock to be granted */ -struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem) +struct rw_semaphore fastcall __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; @@ -178,7 +179,7 @@ struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem) /* * wait for the write lock to be granted */ -struct rw_semaphore fastcall *rwsem_down_write_failed(struct rw_semaphore *sem) +struct rw_semaphore fastcall __sched 
*rwsem_down_write_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; -- cgit v1.2.3 From c334f752d8e9d3847d4459d06f7544dea9a49923 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:53:50 -0700 Subject: [PATCH] posix message queues: code move From: Manfred Spraul cleanup of sysv ipc as a preparation for posix message queues: - replace !CONFIG_SYSVIPC wrappers for copy_semundo and exit_sem with static inline wrappers. Now the whole ipc/util.c file is only used if CONFIG_SYSVIPC is set, use makefile magic instead of #ifdef. - remove the prototypes for copy_semundo and exit_sem from kernel/fork.c - they belong into a header file. - create a new msgutil.c with the helper functions for message queues. - cleanup the helper functions: run Lindent, add __user tags. --- include/linux/msg.h | 3 -- include/linux/sem.h | 17 ++++++- ipc/Makefile | 4 +- ipc/msg.c | 105 ------------------------------------------- ipc/msgutil.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++ ipc/util.c | 19 -------- ipc/util.h | 10 +++++ kernel/fork.c | 4 +- 8 files changed, 155 insertions(+), 134 deletions(-) create mode 100644 ipc/msgutil.c (limited to 'kernel') diff --git a/include/linux/msg.h b/include/linux/msg.h index b235e862a3dd..2c4c6aa643ff 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -74,9 +74,6 @@ struct msg_msg { /* the actual message follows immediately */ }; -#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) -#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) - /* one msq_queue structure for each present queue on the system */ struct msg_queue { struct kern_ipc_perm q_perm; diff --git a/include/linux/sem.h b/include/linux/sem.h index b337c509ac29..aaf45764a56e 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -134,7 +134,22 @@ struct sysv_sem { struct sem_undo_list *undo_list; }; -void exit_sem(struct task_struct *p); +#ifdef CONFIG_SYSVIPC + +extern int copy_semundo(unsigned long clone_flags, struct 
task_struct *tsk); +extern void exit_sem(struct task_struct *tsk); + +#else +static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +{ + return 0; +} + +static inline void exit_sem(struct task_struct *tsk) +{ + return; +} +#endif #endif /* __KERNEL__ */ diff --git a/ipc/Makefile b/ipc/Makefile index ccc6c64c2493..6cd32a30f03f 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -2,7 +2,5 @@ # Makefile for the linux ipc. # -obj-y := util.o - obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o -obj-$(CONFIG_SYSVIPC) += msg.o sem.o shm.o +obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o diff --git a/ipc/msg.c b/ipc/msg.c index 709ff71bf5c1..37e2d3bb17cb 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -51,11 +51,6 @@ struct msg_sender { struct task_struct* tsk; }; -struct msg_msgseg { - struct msg_msgseg* next; - /* the next part of the message follows immediately */ -}; - #define SEARCH_ANY 1 #define SEARCH_EQUAL 2 #define SEARCH_NOTEQUAL 3 @@ -129,106 +124,6 @@ static int newque (key_t key, int msgflg) return msg_buildid(id,msq->q_perm.seq); } -static void free_msg(struct msg_msg* msg) -{ - struct msg_msgseg* seg; - - security_msg_msg_free(msg); - - seg = msg->next; - kfree(msg); - while(seg != NULL) { - struct msg_msgseg* tmp = seg->next; - kfree(seg); - seg = tmp; - } -} - -static struct msg_msg* load_msg(void* src, int len) -{ - struct msg_msg* msg; - struct msg_msgseg** pseg; - int err; - int alen; - - alen = len; - if(alen > DATALEN_MSG) - alen = DATALEN_MSG; - - msg = (struct msg_msg *) kmalloc (sizeof(*msg) + alen, GFP_KERNEL); - if(msg==NULL) - return ERR_PTR(-ENOMEM); - - msg->next = NULL; - msg->security = NULL; - - if (copy_from_user(msg+1, src, alen)) { - err = -EFAULT; - goto out_err; - } - - len -= alen; - src = ((char*)src)+alen; - pseg = &msg->next; - while(len > 0) { - struct msg_msgseg* seg; - alen = len; - if(alen > DATALEN_SEG) - alen = DATALEN_SEG; - seg = (struct msg_msgseg *) kmalloc (sizeof(*seg) + alen, GFP_KERNEL); - 
if(seg==NULL) { - err=-ENOMEM; - goto out_err; - } - *pseg = seg; - seg->next = NULL; - if(copy_from_user (seg+1, src, alen)) { - err = -EFAULT; - goto out_err; - } - pseg = &seg->next; - len -= alen; - src = ((char*)src)+alen; - } - - err = security_msg_msg_alloc(msg); - if (err) - goto out_err; - - return msg; - -out_err: - free_msg(msg); - return ERR_PTR(err); -} - -static int store_msg(void* dest, struct msg_msg* msg, int len) -{ - int alen; - struct msg_msgseg *seg; - - alen = len; - if(alen > DATALEN_MSG) - alen = DATALEN_MSG; - if(copy_to_user (dest, msg+1, alen)) - return -1; - - len -= alen; - dest = ((char*)dest)+alen; - seg = msg->next; - while(len > 0) { - alen = len; - if(alen > DATALEN_SEG) - alen = DATALEN_SEG; - if(copy_to_user (dest, seg+1, alen)) - return -1; - len -= alen; - dest = ((char*)dest)+alen; - seg=seg->next; - } - return 0; -} - static inline void ss_add(struct msg_queue* msq, struct msg_sender* mss) { mss->tsk=current; diff --git a/ipc/msgutil.c b/ipc/msgutil.c new file mode 100644 index 000000000000..e48d777de2a3 --- /dev/null +++ b/ipc/msgutil.c @@ -0,0 +1,127 @@ +/* + * linux/ipc/msgutil.c + * Copyright (C) 1999, 2004 Manfred Spraul + * + * This file is released under GNU General Public Licence version 2 or + * (at your option) any later version. + * + * See the file COPYING for more details.
+ */ + +#include +#include +#include +#include +#include +#include + +#include "util.h" + +struct msg_msgseg { + struct msg_msgseg* next; + /* the next part of the message follows immediately */ +}; + +#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) +#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) + +struct msg_msg *load_msg(void __user *src, int len) +{ + struct msg_msg *msg; + struct msg_msgseg **pseg; + int err; + int alen; + + alen = len; + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; + + msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL); + if (msg == NULL) + return ERR_PTR(-ENOMEM); + + msg->next = NULL; + msg->security = NULL; + + if (copy_from_user(msg + 1, src, alen)) { + err = -EFAULT; + goto out_err; + } + + len -= alen; + src = ((char *)src) + alen; + pseg = &msg->next; + while (len > 0) { + struct msg_msgseg *seg; + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; + seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen, + GFP_KERNEL); + if (seg == NULL) { + err = -ENOMEM; + goto out_err; + } + *pseg = seg; + seg->next = NULL; + if (copy_from_user(seg + 1, src, alen)) { + err = -EFAULT; + goto out_err; + } + pseg = &seg->next; + len -= alen; + src = ((char *)src) + alen; + } + + err = security_msg_msg_alloc(msg); + if (err) + goto out_err; + + return msg; + +out_err: + free_msg(msg); + return ERR_PTR(err); +} + +int store_msg(void __user *dest, struct msg_msg *msg, int len) +{ + int alen; + struct msg_msgseg *seg; + + alen = len; + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; + if (copy_to_user(dest, msg + 1, alen)) + return -1; + + len -= alen; + dest = ((char *)dest) + alen; + seg = msg->next; + while (len > 0) { + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; + if (copy_to_user(dest, seg + 1, alen)) + return -1; + len -= alen; + dest = ((char *)dest) + alen; + seg = seg->next; + } + return 0; +} + +void free_msg(struct msg_msg *msg) +{ + struct msg_msgseg *seg; + + security_msg_msg_free(msg); 
+ + seg = msg->next; + kfree(msg); + while (seg != NULL) { + struct msg_msgseg *tmp = seg->next; + kfree(seg); + seg = tmp; + } +} diff --git a/ipc/util.c b/ipc/util.c index 6d94883edae0..f74c5eef57d0 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -25,8 +25,6 @@ #include #include -#if defined(CONFIG_SYSVIPC) - #include "util.h" /** @@ -531,20 +529,3 @@ int ipc_parse_version (int *cmd) } #endif /* __ia64__ */ - -#else -/* - * Dummy functions when SYSV IPC isn't configured - */ - -int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) -{ - return 0; -} - -void exit_sem(struct task_struct *tsk) -{ - return; -} - -#endif /* CONFIG_SYSVIPC */ diff --git a/ipc/util.h b/ipc/util.h index 79c8fc901317..e6434942c097 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -4,6 +4,10 @@ * * ipc helper functions (c) 1999 Manfred Spraul */ + +#ifndef _IPC_UTIL_H +#define _IPC_UTIL_H + #define USHRT_MAX 0xffff #define SEQ_MULTIPLIER (IPCMNI) @@ -62,3 +66,9 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); #else int ipc_parse_version (int *cmd); #endif + +extern void free_msg(struct msg_msg *msg); +extern struct msg_msg *load_msg(void __user *src, int len); +extern int store_msg(void __user *dest, struct msg_msg *msg, int len); + +#endif diff --git a/kernel/fork.c b/kernel/fork.c index 3b17a249c50d..a1f20cabbdd3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -39,9 +40,6 @@ #include #include -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); -extern void exit_sem(struct task_struct *tsk); - /* The idle threads do not count.. * Protected by write_lock_irq(&tasklist_lock) */ -- cgit v1.2.3 From c50142a5433ed504fff2b1af152f8f7628830dfb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:54:03 -0700 Subject: [PATCH] posix message queues: syscall stubs From: Manfred Spraul Add -ENOSYS stubs for the posix message queue syscalls. 
The API is a direct mapping of the api from the unix spec, with two exceptions: - mq_close() doesn't exist. Message queue file descriptors can be closed with close(). - mq_notify(SIGEV_THREAD) cannot be implemented in the kernel. The kernel returns a pollable file descriptor . User space must poll (or read) this descriptor and call the notifier function if the file descriptor is signaled. --- arch/i386/kernel/entry.S | 9 +++++++++ include/asm-i386/unistd.h | 11 ++++++++++- include/linux/mqueue.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/syscalls.h | 9 +++++++++ kernel/sys.c | 6 ++++++ 5 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 include/linux/mqueue.h (limited to 'kernel') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3024740ba84c..14e64d3ea25c 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -882,5 +882,14 @@ ENTRY(sys_call_table) .long sys_utimes .long sys_fadvise64_64 .long sys_ni_syscall /* sys_vserver */ + .long sys_ni_syscall /* sys_mbind */ + .long sys_ni_syscall /* 275 sys_get_mempolicy */ + .long sys_ni_syscall /* sys_set_mempolicy */ + .long sys_mq_open + .long sys_mq_unlink + .long sys_mq_timedsend + .long sys_mq_timedreceive /* 280 */ + .long sys_mq_notify + .long sys_mq_getsetattr syscall_table_size=(.-sys_call_table) diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index a2d58a99491e..620a232084f3 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -279,8 +279,17 @@ #define __NR_utimes 271 #define __NR_fadvise64_64 272 #define __NR_vserver 273 +#define __NR_mbind 274 +#define __NR_get_mempolicy 275 +#define __NR_set_mempolicy 276 +#define __NR_mq_open 277 +#define __NR_mq_unlink (__NR_mq_open+1) +#define __NR_mq_timedsend (__NR_mq_open+2) +#define __NR_mq_timedreceive (__NR_mq_open+3) +#define __NR_mq_notify (__NR_mq_open+4) +#define __NR_mq_getsetattr (__NR_mq_open+5) -#define NR_syscalls 274 +#define NR_syscalls 283 /* 
user-visible error numbers are in the range -1 - -124: see */ diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h new file mode 100644 index 000000000000..c0c5fcc89f0e --- /dev/null +++ b/include/linux/mqueue.h @@ -0,0 +1,36 @@ +/* Copyright (C) 2003 Krzysztof Benedyczak & Michal Wronski + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + It is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this software; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef _LINUX_MQUEUE_H +#define _LINUX_MQUEUE_H + +#define MQ_PRIO_MAX 32768 + +typedef int mqd_t; + +struct mq_attr { + long mq_flags; /* message queue flags */ + long mq_maxmsg; /* maximum number of messages */ + long mq_msgsize; /* maximum message size */ + long mq_curmsgs; /* number of messages currently queued */ +}; + +#define NOTIFY_NONE 0 +#define NOTIFY_WOKENUP 1 +#define NOTIFY_REMOVED 2 + +#endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index aaf87aeacafb..7ee5f67abb5f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -48,6 +48,8 @@ struct timex; struct timezone; struct tms; struct utimbuf; +typedef int mqd_t; +struct mq_attr; #include #include @@ -450,6 +452,13 @@ asmlinkage long sys_shmget(key_t key, size_t size, int flag); asmlinkage long sys_shmdt(char __user *shmaddr); asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); +asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr); +asmlinkage long sys_mq_unlink(const char __user *name); +asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); +asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); +asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); +asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); + asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn); asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/kernel/sys.c b/kernel/sys.c index bc498b12edcc..7d1bf5c57aca 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -260,6 +260,12 @@ cond_syscall(sys_msgctl) 
cond_syscall(sys_shmget) cond_syscall(sys_shmdt) cond_syscall(sys_shmctl) +cond_syscall(sys_mq_open) +cond_syscall(sys_mq_unlink) +cond_syscall(sys_mq_timedsend) +cond_syscall(sys_mq_timedreceive) +cond_syscall(sys_mq_notify) +cond_syscall(sys_mq_getsetattr) /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read) -- cgit v1.2.3 From be94d44e818a56406016111fc48a1084b9f8e435 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:54:16 -0700 Subject: [PATCH] posix message queues: implementation From: Manfred Spraul Actual implementation of the posix message queues, written by Krzysztof Benedyczak and Michal Wronski. The complete implementation is dependent on CONFIG_POSIX_MQUEUE. It passed the openposix test suite with two exceptions: one mq_unlink test was bad and tested undefined behavior. And Linux succeeds mq_close(open(,,,)). The spec mandates EBADF, but we have decided to ignore that: we would have to add a new syscall just for the right error code. The patch intentionally doesn't use all helpers from fs/libfs for kernel-only filesystems: step 5 allows user space mounts of the file system. Signal changes: The patch redefines SI_MESGQ using __SI_CODE: The generic Linux ABI uses a negative value (i.e. from user) for SI_MESGQ, but the kernel internal value must be positive to pass check_kill_value. Additionally, the patch adds support into copy_siginfo_to_user to copy the "new" signal type to user space. Changes in signal code caused by POSIX message queues patch: General & rationale: mqueues generated signals (only upon notification) must have si_code == SI_MESGQ. In fact such a signal is sent from one process which caused notification (== sent message to empty message queue) to another which requested it. Both processes can be of course unrelated in terms of uids/euids. So SI_MESGQ signals must be classified as SI_FROMKERNEL to pass check_kill_permissions (needless to say, these signals ARE from the kernel).
Signals generated by message queues notification need the same fields in siginfo struct's union _sifields as POSIX.1b signals and we can reuse its union entry. SI_MESGQ was previously defined to -3 in kernel and also in glibc. So in userspace SI_MESGQ must be still visible as -3. Solution: SI_MESGQ is defined in the same style as SI_TIMER using __SI_CODE macro. Details: Fortunately copy_siginfo_to_user copies si_code as short. So we can use remaining part of int value freely. __SI_CODE does the work. SI_MESGQ is in kernel: 6<<16 | (-3 & 0xffff) which is > 0 but to userspace is copied (short) SI_MESGQ == -3 Actual changes: Changes in include/asm-generic/siginfo.h __SI_MESGQ added in signal.h to represent inside-kernel prefix of SI_MESGQ. SI_MESGQ is redefined from -3 to __SI_CODE(__SI_MESGQ, -3) Except for the mips architecture, those changes should be arch independent (asm-generic/siginfo.h is included in arch versions). On mips SI_MESGQ is redefined to -4 in order to be compatible with IRIX. But the same schema can be used. Change in copy_siginfo_to_user: We only add one line to get the same copy semantics as for __SI_RT. This change isn't very portable - some archs have their own copy_siginfo_to_user. All those should have a similar change (but possibly not one-line, as the __SI_RT case was sometimes ignored because it wasn't used yet, e.g. see ia64 signal.c). Update: mq: only fail with invalid timespec if mq_timed{send,receive} needs to block From: Jakub Jelinek POSIX requires EINVAL to be set if: "The process or thread would have blocked, and the abs_timeout parameter specified a nanoseconds field value less than zero or greater than or equal to 1000 million." but 2.6.5-mm3 returns -EINVAL even if the process or thread would not block (if the queue is not empty for timedreceive or not full for timedsend).
--- CREDITS | 17 + Documentation/filesystems/proc.txt | 25 + include/asm-generic/siginfo.h | 4 +- init/Kconfig | 18 + ipc/Makefile | 2 + ipc/mqueue.c | 1165 ++++++++++++++++++++++++++++++++++++ kernel/signal.c | 1 + 7 files changed, 1231 insertions(+), 1 deletion(-) create mode 100644 ipc/mqueue.c (limited to 'kernel') diff --git a/CREDITS b/CREDITS index dc9b943d10f1..52128c120f63 100644 --- a/CREDITS +++ b/CREDITS @@ -289,6 +289,15 @@ S: Via Delle Palme, 9 S: Terni 05100 S: Italy +N: Krzysztof Benedyczak +E: golbi@mat.uni.torun.pl +W: http://www.mat.uni.torun.pl/~golbi +D: POSIX message queues fs (with M. Wronski) +S: ul. Podmiejska 52 +S: Radunica +S: 83-000 Pruszcz Gdanski +S: Poland + N: Randolph Bentson E: bentson@grieg.seaslug.org W: http://www.aa.net/~bentson/ @@ -3485,6 +3494,14 @@ S: 12725 SW Millikan Way, Suite 400 S: Beaverton, OR 97005 S: USA +N: Michal Wronski +E: wrona@mat.uni.torun.pl +W: http://www.mat.uni.torun.pl/~wrona +D: POSIX message queues fs (with K. Benedyczak) +S: ul. Teczowa 23/12 +S: 80-680 Gdansk-Sobieszewo +S: Poland + N: Frank Xia E: qx@math.columbia.edu D: Xiafs filesystem [defunct] diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 22fd3adcc96e..378722d5bb70 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -38,6 +38,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem ------------------------------------------------------------------------------ Preface @@ -1814,6 +1815,30 @@ The /proc/net/ipx_route table holds a list of IPX routes. For each route it gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. 
+2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem +---------------------------------------------------------- + +The "mqueue" filesystem provides the necessary kernel features to enable the +creation of a user space library that implements the POSIX message queues +API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System +Interfaces specification.) + +The "mqueue" filesystem contains values for determining/setting the amount of +resources used by the file system. + +/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the +maximum number of message queues allowed on the system. + +/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the +maximum number of messages in a queue value. In fact it is the limiting value +for another (user) limit which is set in mq_open invocation. This attribute of +a queue must be less than or equal to msg_max. + +/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the +maximum message size value (it is every message queue's attribute set during +its creation).
+ + ------------------------------------------------------------------------------ Summary ------------------------------------------------------------------------------ diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h index e95efd9e00c6..fe02b1a4d286 100644 --- a/include/asm-generic/siginfo.h +++ b/include/asm-generic/siginfo.h @@ -123,6 +123,7 @@ typedef struct siginfo { #define __SI_FAULT (3 << 16) #define __SI_CHLD (4 << 16) #define __SI_RT (5 << 16) +#define __SI_MESGQ (6 << 16) #define __SI_CODE(T,N) ((T) | ((N) & 0xffff)) #else #define __SI_KILL 0 @@ -131,6 +132,7 @@ typedef struct siginfo { #define __SI_FAULT 0 #define __SI_CHLD 0 #define __SI_RT 0 +#define __SI_MESGQ 0 #define __SI_CODE(T,N) (N) #endif @@ -142,7 +144,7 @@ typedef struct siginfo { #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */ #define SI_QUEUE -1 /* sent by sigqueue */ #define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */ -#define SI_MESGQ -3 /* sent by real time mesq state change */ +#define SI_MESGQ __SI_CODE(__SI_MESGQ,-3) /* sent by real time mesq state change */ #define SI_ASYNCIO -4 /* sent by AIO completion */ #define SI_SIGIO -5 /* sent by queued SIGIO */ #define SI_TKILL -6 /* sent by tkill system call */ diff --git a/init/Kconfig b/init/Kconfig index c10fec8ebe9e..9eff25e8f6ed 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -90,6 +90,24 @@ config SYSVIPC section 6.4 of the Linux Programmer's Guide, available from . +config POSIX_MQUEUE + bool "POSIX Message Queues" + depends on EXPERIMENTAL + ---help--- + POSIX variant of message queues is a part of IPC. In POSIX message + queues every message has a priority which decides about succession + of receiving it by a process. If you want to compile and run + programs written e.g. for Solaris with use of its POSIX message + queues (functions mq_*) say Y here. 
To use this feature you will + also need mqueue library, available from + + + POSIX message queues are visible as a filesystem called 'mqueue' + and can be mounted somewhere if you want to do filesystem + operations on message queues. + + If unsure, say Y. + config BSD_PROCESS_ACCT bool "BSD Process Accounting" help diff --git a/ipc/Makefile b/ipc/Makefile index 6cd32a30f03f..913790207d85 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -4,3 +4,5 @@ obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o +obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o + diff --git a/ipc/mqueue.c b/ipc/mqueue.c new file mode 100644 index 000000000000..4de249718675 --- /dev/null +++ b/ipc/mqueue.c @@ -0,0 +1,1165 @@ +/* + * POSIX message queues filesystem for Linux. + * + * Copyright (C) 2003,2004 Krzysztof Benedyczak (golbi@mat.uni.torun.pl) + * Michal Wronski (wrona@mat.uni.torun.pl) + * + * Spinlocks: Mohamed Abbas (abbas.mohamed@intel.com) + * Lockless receive & send, fd based notify: + * Manfred Spraul (manfred@colorfullife.com) + * + * This file is released under the GPL. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +#define MQUEUE_MAGIC 0x19800202 +#define DIRENT_SIZE 20 +#define FILENT_SIZE 80 + +#define SEND 0 +#define RECV 1 + +#define STATE_NONE 0 +#define STATE_PENDING 1 +#define STATE_READY 2 + +#define NP_NONE ((void*)NOTIFY_NONE) +#define NP_WOKENUP ((void*)NOTIFY_WOKENUP) +#define NP_REMOVED ((void*)NOTIFY_REMOVED) +/* used by sysctl */ +#define FS_MQUEUE 1 +#define CTL_QUEUESMAX 2 +#define CTL_MSGMAX 3 +#define CTL_MSGSIZEMAX 4 + +/* default values */ +#define DFLT_QUEUESMAX 64 /* max number of message queues */ +#define DFLT_MSGMAX 40 /* max number of messages in each queue */ +#define HARD_MSGMAX (131072/sizeof(void*)) +#define DFLT_MSGSIZEMAX 16384 /* max message size */ + +struct ext_wait_queue { /* queue of sleeping tasks */ + struct task_struct *task; + struct list_head list; + struct msg_msg *msg; /* ptr of loaded message */ + int state; /* one of STATE_* values */ +}; + +struct mqueue_inode_info { + struct mq_attr attr; + struct msg_msg **messages; + + pid_t notify_owner; /* != 0 means notification registered */ + struct sigevent notify; + struct file *notify_filp; + + /* for tasks waiting for free space and messages, respectively */ + struct ext_wait_queue e_wait_q[2]; + wait_queue_head_t wait_q; + + unsigned long qsize; /* size of queue in memory (sum of all msgs) */ + spinlock_t lock; + struct inode vfs_inode; +}; + +static struct inode_operations mqueue_dir_inode_operations; +static struct file_operations mqueue_file_operations; +static struct file_operations mqueue_notify_fops; +static struct super_operations mqueue_super_ops; +static void remove_notification(struct mqueue_inode_info *info); + +static spinlock_t mq_lock; +static kmem_cache_t *mqueue_inode_cachep; +static struct vfsmount *mqueue_mnt; + +static unsigned int queues_count; +static unsigned int queues_max = DFLT_QUEUESMAX; +static unsigned int msg_max = DFLT_MSGMAX; +static 
unsigned int msgsize_max = DFLT_MSGSIZEMAX; + +static struct ctl_table_header * mq_sysctl_table; + +static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) +{ + return container_of(inode, struct mqueue_inode_info, vfs_inode); +} + +static struct inode *mqueue_get_inode(struct super_block *sb, int mode) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) { + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_mtime = inode->i_ctime = inode->i_atime = + CURRENT_TIME; + + if (S_ISREG(mode)) { + struct mqueue_inode_info *info; + + inode->i_fop = &mqueue_file_operations; + inode->i_size = FILENT_SIZE; + /* mqueue specific info */ + info = MQUEUE_I(inode); + spin_lock_init(&info->lock); + init_waitqueue_head(&info->wait_q); + INIT_LIST_HEAD(&info->e_wait_q[0].list); + INIT_LIST_HEAD(&info->e_wait_q[1].list); + info->notify_owner = 0; + info->qsize = 0; + info->attr.mq_curmsgs = 0; + info->messages = NULL; + } else if (S_ISDIR(mode)) { + inode->i_nlink++; + inode->i_op = &mqueue_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + } + } + return inode; +} + +static int mqueue_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + + sb->s_flags = MS_NOUSER; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = MQUEUE_MAGIC; + sb->s_op = &mqueue_super_ops; + + inode = mqueue_get_inode(sb, S_IFDIR | S_IRWXUGO); + if (!inode) + return -ENOMEM; + + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) { + iput(inode); + return -ENOMEM; + } + + return 0; +} + +static struct super_block *mqueue_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return get_sb_single(fs_type, flags, data, mqueue_fill_super); +} + +static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct mqueue_inode_info 
*p = (struct mqueue_inode_info *) foo; + + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&p->vfs_inode); +} + +static struct inode *mqueue_alloc_inode(struct super_block *sb) +{ + struct mqueue_inode_info *ei; + + ei = kmem_cache_alloc(mqueue_inode_cachep, SLAB_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void mqueue_destroy_inode(struct inode *inode) +{ + kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode)); +} + +static void mqueue_delete_inode(struct inode *inode) +{ + struct mqueue_inode_info *info; + int i; + + if (S_ISDIR(inode->i_mode)) { + clear_inode(inode); + return; + } + info = MQUEUE_I(inode); + spin_lock(&info->lock); + for (i = 0; i < info->attr.mq_curmsgs; i++) + free_msg(info->messages[i]); + kfree(info->messages); + spin_unlock(&info->lock); + + clear_inode(inode); + + spin_lock(&mq_lock); + queues_count--; + spin_unlock(&mq_lock); +} + +static int mqueue_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct inode *inode; + int error; + + spin_lock(&mq_lock); + if (queues_count >= queues_max && !capable(CAP_SYS_RESOURCE)) { + error = -ENOSPC; + goto out_lock; + } + queues_count++; + spin_unlock(&mq_lock); + + inode = mqueue_get_inode(dir->i_sb, mode); + if (!inode) { + error = -ENOMEM; + spin_lock(&mq_lock); + queues_count--; + goto out_lock; + } + + d_instantiate(dentry, inode); + dget(dentry); + return 0; +out_lock: + spin_unlock(&mq_lock); + return error; +} + +static int mqueue_flush_file(struct file *filp) +{ + struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); + + spin_lock(&info->lock); + if (current->tgid == info->notify_owner) + remove_notification(info); + + spin_unlock(&info->lock); + return 0; +} + +/* Adds current to info->e_wait_q[sr] before element with smaller prio */ +static void wq_add(struct mqueue_inode_info *info, int sr, + struct ext_wait_queue *ewp) +{ + struct ext_wait_queue *walk; 
+ + ewp->task = current; + + list_for_each_entry(walk, &info->e_wait_q[sr].list, list) { + if (walk->task->static_prio <= current->static_prio) { + list_add_tail(&ewp->list, &walk->list); + return; + } + } + list_add_tail(&ewp->list, &info->e_wait_q[sr].list); +} + +/* + * Puts current task to sleep. Caller must hold queue lock. After return + * lock isn't held. + * sr: SEND or RECV + */ +static int wq_sleep(struct mqueue_inode_info *info, int sr, + long timeout, struct ext_wait_queue *ewp) +{ + int retval; + signed long time; + + wq_add(info, sr, ewp); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + spin_unlock(&info->lock); + time = schedule_timeout(timeout); + + while (ewp->state == STATE_PENDING) + cpu_relax(); + + if (ewp->state == STATE_READY) { + retval = 0; + goto out; + } + spin_lock(&info->lock); + if (ewp->state == STATE_READY) { + retval = 0; + goto out_unlock; + } + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + if (time == 0) { + retval = -ETIMEDOUT; + break; + } + } + list_del(&ewp->list); +out_unlock: + spin_unlock(&info->lock); +out: + return retval; +} + +/* + * Returns waiting task that should be serviced first or NULL if none exists + */ +static struct ext_wait_queue *wq_get_first_waiter( + struct mqueue_inode_info *info, int sr) +{ + struct list_head *ptr; + + ptr = info->e_wait_q[sr].list.prev; + if (ptr == &info->e_wait_q[sr].list) + return NULL; + return list_entry(ptr, struct ext_wait_queue, list); +} + +/* Auxiliary functions to manipulate messages' list */ +static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info) +{ + int k; + + k = info->attr.mq_curmsgs - 1; + while (k >= 0 && info->messages[k]->m_type >= ptr->m_type) { + info->messages[k + 1] = info->messages[k]; + k--; + } + info->attr.mq_curmsgs++; + info->qsize += ptr->m_ts; + info->messages[k + 1] = ptr; +} + +static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) +{ + info->qsize -= 
info->messages[--info->attr.mq_curmsgs]->m_ts; + return info->messages[info->attr.mq_curmsgs]; +} + +/* + * The next function is only to split too long sys_mq_timedsend + */ +static void __do_notify(struct mqueue_inode_info *info) +{ + /* notification + * invoked when there is registered process and there isn't process + * waiting synchronously for message AND state of queue changed from + * empty to not empty. Here we are sure that no one is waiting + * synchronously. */ + if (info->notify_owner && info->attr.mq_curmsgs == 1) { + /* sends signal */ + if (info->notify.sigev_notify == SIGEV_SIGNAL) { + struct siginfo sig_i; + + sig_i.si_signo = info->notify.sigev_signo; + sig_i.si_errno = 0; + sig_i.si_code = SI_MESGQ; + sig_i.si_value = info->notify.sigev_value; + sig_i.si_pid = current->tgid; + sig_i.si_uid = current->uid; + + kill_proc_info(info->notify.sigev_signo, + &sig_i, info->notify_owner); + } else if (info->notify.sigev_notify == SIGEV_THREAD) { + info->notify_filp->private_data = (void*)NP_WOKENUP; + wake_up(&info->wait_q); + } + /* after notification unregisters process */ + info->notify_owner = 0; + } +} + +static long prepare_timeout(const struct timespec __user *u_arg) +{ + struct timespec ts, nowts; + long timeout; + + if (u_arg) { + if (unlikely(copy_from_user(&ts, u_arg, + sizeof(struct timespec)))) + return -EFAULT; + + if (unlikely(ts.tv_nsec < 0 || ts.tv_sec < 0 + || ts.tv_nsec >= NSEC_PER_SEC)) + return -EINVAL; + nowts = CURRENT_TIME; + /* first subtract as jiffies can't be too big */ + ts.tv_sec -= nowts.tv_sec; + if (ts.tv_nsec < nowts.tv_nsec) { + ts.tv_nsec += NSEC_PER_SEC; + ts.tv_sec--; + } + ts.tv_nsec -= nowts.tv_nsec; + if (ts.tv_sec < 0) + return 0; + + timeout = timespec_to_jiffies(&ts) + 1; + } else + return MAX_SCHEDULE_TIMEOUT; + + return timeout; +} + +/* + * File descriptor based notification, intended to be used to implement + * SIGEV_THREAD: + * SIGEV_THREAD means that a notification function should be called in the + * 
context of a new thread. The kernel can't do that. Therefore mq_notify + * calls with SIGEV_THREAD return a new file descriptor. A user space helper + * must create a new thread and then read from the given file descriptor. + * The read always returns one byte. If it's NOTIFY_WOKENUP, then it must + * call the notification function. If it's NOTIFY_REMOVED, then the + * notification was removed. The file descriptor supports poll, thus one + * supervisor thread can manage multiple message queue notifications. + * + * The implementation must support multiple outstanding notifications: + * It's possible that a new notification is added and signaled before user + * space calls mqueue_notify_read for the previous notification. + * Therefore the notification state is stored in the private_data field of + * the file descriptor. + */ +static unsigned int mqueue_notify_poll(struct file *filp, + struct poll_table_struct *poll_tab) +{ + struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); + int retval; + + poll_wait(filp, &info->wait_q, poll_tab); + + if (filp->private_data == NP_NONE) + retval = 0; + else + retval = POLLIN | POLLRDNORM; + return retval; +} + +static ssize_t mqueue_notify_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); + char result; + + if (!count) + return 0; + if (*ppos != 0) + return 0; + spin_lock(&info->lock); + while (filp->private_data == NP_NONE) { + DEFINE_WAIT(wait); + if (filp->f_flags & O_NONBLOCK) { + spin_unlock(&info->lock); + return -EAGAIN; + } + prepare_to_wait(&info->wait_q, &wait, TASK_INTERRUPTIBLE); + spin_unlock(&info->lock); + schedule(); + finish_wait(&info->wait_q, &wait); + spin_lock(&info->lock); + } + spin_unlock(&info->lock); + result = (char)(unsigned long)filp->private_data; + if (put_user(result, buf)) + return -EFAULT; + *ppos = 1; + return 1; +} + +static int mqueue_notify_release(struct inode *inode, struct file 
*filp) +{ + struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); + + spin_lock(&info->lock); + if (info->notify_owner && info->notify_filp == filp) + info->notify_owner = 0; + filp->private_data = NP_REMOVED; + spin_unlock(&info->lock); + + return 0; +} + +static void remove_notification(struct mqueue_inode_info *info) +{ + if (info->notify.sigev_notify == SIGEV_THREAD) { + info->notify_filp->private_data = NP_REMOVED; + wake_up(&info->wait_q); + } + info->notify_owner = 0; +} + +/* + * Invoked when creating a new queue via sys_mq_open + */ +static struct file *do_create(struct dentry *dir, struct dentry *dentry, + int oflag, mode_t mode, struct mq_attr __user *u_attr) +{ + struct file *filp; + struct inode *inode; + struct mqueue_inode_info *info; + struct msg_msg **msgs = NULL; + struct mq_attr attr; + int ret; + + if (u_attr != NULL) { + if (copy_from_user(&attr, u_attr, sizeof(attr))) + return ERR_PTR(-EFAULT); + + if (attr.mq_maxmsg <= 0 || attr.mq_msgsize <= 0) + return ERR_PTR(-EINVAL); + if (capable(CAP_SYS_RESOURCE)) { + if (attr.mq_maxmsg > HARD_MSGMAX) + return ERR_PTR(-EINVAL); + } else { + if (attr.mq_maxmsg > msg_max || + attr.mq_msgsize > msgsize_max) + return ERR_PTR(-EINVAL); + } + } else { + attr.mq_maxmsg = DFLT_MSGMAX; + attr.mq_msgsize = DFLT_MSGSIZEMAX; + } + msgs = kmalloc(attr.mq_maxmsg * sizeof(*msgs), GFP_KERNEL); + if (!msgs) + return ERR_PTR(-ENOMEM); + + ret = vfs_create(dir->d_inode, dentry, mode, NULL); + if (ret) { + kfree(msgs); + return ERR_PTR(ret); + } + + inode = dentry->d_inode; + info = MQUEUE_I(inode); + + info->attr.mq_maxmsg = attr.mq_maxmsg; + info->attr.mq_msgsize = attr.mq_msgsize; + info->messages = msgs; + + filp = dentry_open(dentry, mqueue_mnt, oflag); + if (!IS_ERR(filp)) + dget(dentry); + + return filp; +} + +/* Opens existing queue */ +static struct file *do_open(struct dentry *dentry, int oflag) +{ +static int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE, + MAY_READ | MAY_WRITE }; + struct file 
*filp; + + if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) + return ERR_PTR(-EINVAL); + + if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL)) + return ERR_PTR(-EACCES); + + filp = dentry_open(dentry, mqueue_mnt, oflag); + + if (!IS_ERR(filp)) + dget(dentry); + + return filp; +} + +asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode, + struct mq_attr __user *u_attr) +{ + struct dentry *dentry; + struct file *filp; + char *name; + int fd, error; + + if (IS_ERR(name = getname(u_name))) + return PTR_ERR(name); + + fd = get_unused_fd(); + if (fd < 0) + goto out_putname; + + down(&mqueue_mnt->mnt_root->d_inode->i_sem); + dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name)); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_err; + } + mntget(mqueue_mnt); + + if (oflag & O_CREAT) { + if (dentry->d_inode) { /* entry already exists */ + filp = (oflag & O_EXCL) ? ERR_PTR(-EEXIST) : + do_open(dentry, oflag); + } else { + filp = do_create(mqueue_mnt->mnt_root, dentry, + oflag, mode, u_attr); + } + } else + filp = (dentry->d_inode) ? 
do_open(dentry, oflag) : + ERR_PTR(-ENOENT); + + dput(dentry); + + if (IS_ERR(filp)) { + error = PTR_ERR(filp); + goto out_putfd; + } + + fd_install(fd, filp); + goto out_upsem; + +out_putfd: + mntput(mqueue_mnt); + put_unused_fd(fd); +out_err: + fd = error; +out_upsem: + up(&mqueue_mnt->mnt_root->d_inode->i_sem); +out_putname: + putname(name); + return fd; +} + +asmlinkage long sys_mq_unlink(const char __user *u_name) +{ + int err; + char *name; + struct dentry *dentry; + struct inode *inode = NULL; + + name = getname(u_name); + if (IS_ERR(name)) + return PTR_ERR(name); + + down(&mqueue_mnt->mnt_root->d_inode->i_sem); + dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name)); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_unlock; + } + + if (!dentry->d_inode) { + err = -ENOENT; + goto out_err; + } + + if (permission(dentry->d_inode, MAY_WRITE, NULL)) { + err = -EACCES; + goto out_err; + } + inode = dentry->d_inode; + if (inode) + atomic_inc(&inode->i_count); + + err = vfs_unlink(dentry->d_parent->d_inode, dentry); +out_err: + dput(dentry); + +out_unlock: + up(&mqueue_mnt->mnt_root->d_inode->i_sem); + putname(name); + if (inode) + iput(inode); + + return err; +} + +/* Pipelined send and receive functions. + * + * If a receiver finds no waiting message, then it registers itself in the + * list of waiting receivers. A sender checks that list before adding the new + * message into the message array. If there is a waiting receiver, then it + * bypasses the message array and directly hands the message over to the + * receiver. + * The receiver accepts the message and returns without grabbing the queue + * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers + * are necessary. The same algorithm is used for sysv semaphores, see + * ipc/sem.c fore more details. + * + * The same algorithm is used for senders. 
+ */ + +/* pipelined_send() - send a message directly to the task waiting in + * sys_mq_timedreceive() (without inserting message into a queue). */ +static inline void pipelined_send(struct mqueue_inode_info *info, + struct msg_msg *message, + struct ext_wait_queue *receiver) +{ + receiver->msg = message; + list_del(&receiver->list); + receiver->state = STATE_PENDING; + wake_up_process(receiver->task); + wmb(); + receiver->state = STATE_READY; +} + +/* pipelined_receive() - if there is task waiting in sys_mq_timedsend() + * gets its message and put to the queue (we have one free place for sure). */ +static inline void pipelined_receive(struct mqueue_inode_info *info) +{ + struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND); + + if (!sender) + return; + + msg_insert(sender->msg, info); + list_del(&sender->list); + sender->state = STATE_PENDING; + wake_up_process(sender->task); + wmb(); + sender->state = STATE_READY; +} + +asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, + size_t msg_len, unsigned int msg_prio, + const struct timespec __user *u_abs_timeout) +{ + struct file *filp; + struct inode *inode; + struct ext_wait_queue wait; + struct ext_wait_queue *receiver; + struct msg_msg *msg_ptr; + struct mqueue_inode_info *info; + long timeout; + int ret; + + if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX)) + return -EINVAL; + + timeout = prepare_timeout(u_abs_timeout); + + ret = -EBADF; + filp = fget(mqdes); + if (unlikely(!filp)) + goto out; + + inode = filp->f_dentry->d_inode; + if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb)) + goto out_fput; + info = MQUEUE_I(inode); + + if (unlikely((filp->f_flags & O_ACCMODE) == O_RDONLY)) + goto out_fput; + + if (unlikely(msg_len > info->attr.mq_msgsize)) { + ret = -EMSGSIZE; + goto out_fput; + } + + /* First try to allocate memory, before doing anything with + * existing queues. 
*/ + msg_ptr = load_msg((void *)u_msg_ptr, msg_len); + if (unlikely(IS_ERR(msg_ptr))) { + ret = PTR_ERR(msg_ptr); + goto out_fput; + } + msg_ptr->m_ts = msg_len; + msg_ptr->m_type = msg_prio; + + spin_lock(&info->lock); + + if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) { + if (filp->f_flags & O_NONBLOCK) { + spin_unlock(&info->lock); + ret = -EAGAIN; + } else if (unlikely(timeout < 0)) { + spin_unlock(&info->lock); + ret = timeout; + } else { + wait.task = current; + wait.msg = (void *) msg_ptr; + wait.state = STATE_NONE; + ret = wq_sleep(info, SEND, timeout, &wait); + } + if (ret < 0) /* message was not queued: also covers -EAGAIN and a bad timeout */ + free_msg(msg_ptr); + } else { + receiver = wq_get_first_waiter(info, RECV); + if (receiver) { + pipelined_send(info, msg_ptr, receiver); + } else { + /* adds message to the queue */ + msg_insert(msg_ptr, info); + __do_notify(info); + } + inode->i_atime = inode->i_mtime = inode->i_ctime = + CURRENT_TIME; + spin_unlock(&info->lock); + ret = 0; + } +out_fput: + fput(filp); +out: + return ret; +} + +asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, + size_t msg_len, unsigned int __user *u_msg_prio, + const struct timespec __user *u_abs_timeout) +{ + long timeout; + ssize_t ret; + struct msg_msg *msg_ptr; + struct file *filp; + struct inode *inode; + struct mqueue_inode_info *info; + struct ext_wait_queue wait; + + timeout = prepare_timeout(u_abs_timeout); + + ret = -EBADF; + filp = fget(mqdes); + if (unlikely(!filp)) + goto out; + + inode = filp->f_dentry->d_inode; + if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb)) + goto out_fput; + info = MQUEUE_I(inode); + + if (unlikely((filp->f_flags & O_ACCMODE) == O_WRONLY)) + goto out_fput; + + /* checks if buffer is big enough */ + if (unlikely(msg_len < info->attr.mq_msgsize)) { + ret = -EMSGSIZE; + goto out_fput; + } + + spin_lock(&info->lock); + if (info->attr.mq_curmsgs == 0) { + if (filp->f_flags & O_NONBLOCK) { + spin_unlock(&info->lock); + ret = 
-EAGAIN; + msg_ptr = NULL; + } else if (unlikely(timeout < 
0)) { + spin_unlock(&info->lock); + ret = timeout; + msg_ptr = NULL; + } else { + wait.task = current; + wait.state = STATE_NONE; + ret = wq_sleep(info, RECV, timeout, &wait); + msg_ptr = wait.msg; + } + } else { + msg_ptr = msg_get(info); + + inode->i_atime = inode->i_mtime = inode->i_ctime = + CURRENT_TIME; + + /* There is now free space in queue. */ + pipelined_receive(info); + spin_unlock(&info->lock); + ret = 0; + } + if (ret == 0) { + ret = msg_ptr->m_ts; + + if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) || + store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) { + ret = -EFAULT; + } + free_msg(msg_ptr); + } +out_fput: + fput(filp); +out: + return ret; +} + +/* + * Notes: the case when user wants us to deregister (with NULL as pointer + * or SIGEV_NONE) and he isn't currently owner of notification will be + * silently discarded. It isn't explicitly defined in the POSIX. + */ +asmlinkage long sys_mq_notify(mqd_t mqdes, + const struct sigevent __user *u_notification) +{ + int ret, fd; + struct file *filp, *nfilp; + struct inode *inode; + struct sigevent notification; + struct mqueue_inode_info *info; + + if (u_notification == NULL) { + notification.sigev_notify = SIGEV_NONE; + } else { + if (copy_from_user(&notification, u_notification, + sizeof(struct sigevent))) + return -EFAULT; + + if (unlikely(notification.sigev_notify != SIGEV_NONE && + notification.sigev_notify != SIGEV_SIGNAL && + notification.sigev_notify != SIGEV_THREAD)) + return -EINVAL; + if (notification.sigev_notify == SIGEV_SIGNAL && + (notification.sigev_signo < 0 || + notification.sigev_signo > _NSIG)) { + return -EINVAL; + } + } + + ret = -EBADF; + filp = fget(mqdes); + if (!filp) + goto out; + + inode = filp->f_dentry->d_inode; + if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb)) + goto out_fput; + info = MQUEUE_I(inode); + + ret = 0; + if (notification.sigev_notify == SIGEV_THREAD) { + ret = get_unused_fd(); + if (ret < 0) + goto out_fput; + fd = ret; + nfilp = get_empty_filp(); + if 
(!nfilp) { + ret = -ENFILE; + goto out_dropfd; + } + nfilp->private_data = NP_NONE; + nfilp->f_op = &mqueue_notify_fops; + nfilp->f_vfsmnt = mntget(mqueue_mnt); + nfilp->f_dentry = dget(filp->f_dentry); + nfilp->f_mapping = filp->f_dentry->d_inode->i_mapping; + nfilp->f_mode = FMODE_READ; + } else { + nfilp = NULL; + fd = -1; + } + + spin_lock(&info->lock); + + if (notification.sigev_notify == SIGEV_NONE) { + if (info->notify_owner == current->tgid) { + remove_notification(info); + inode->i_atime = inode->i_ctime = CURRENT_TIME; + } + } else if (info->notify_owner) { + ret = -EBUSY; + } else if (notification.sigev_notify == SIGEV_THREAD) { + info->notify_filp = nfilp; + fd_install(fd, nfilp); + ret = fd; + fd = -1; + nfilp = NULL; + info->notify.sigev_notify = SIGEV_THREAD; + info->notify_owner = current->tgid; + inode->i_atime = inode->i_ctime = CURRENT_TIME; + } else { + info->notify.sigev_signo = notification.sigev_signo; + info->notify.sigev_value = notification.sigev_value; + info->notify.sigev_notify = SIGEV_SIGNAL; + info->notify_owner = current->tgid; + inode->i_atime = inode->i_ctime = CURRENT_TIME; + } + spin_unlock(&info->lock); +out_dropfd: + if (fd != -1) + put_unused_fd(fd); +out_fput: + fput(filp); +out: + return ret; +} + +asmlinkage long sys_mq_getsetattr(mqd_t mqdes, + const struct mq_attr __user *u_mqstat, + struct mq_attr __user *u_omqstat) +{ + int ret; + struct mq_attr mqstat, omqstat; + struct file *filp; + struct inode *inode; + struct mqueue_inode_info *info; + + if (u_mqstat != NULL) { + if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr))) + return -EFAULT; + } + + ret = -EBADF; + filp = fget(mqdes); + if (!filp) + goto out; + + inode = filp->f_dentry->d_inode; + if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb)) + goto out_fput; + info = MQUEUE_I(inode); + + spin_lock(&info->lock); + + omqstat = info->attr; + omqstat.mq_flags = filp->f_flags; + if (u_mqstat) { + if (mqstat.mq_flags & O_NONBLOCK) + filp->f_flags |= O_NONBLOCK; + 
else + filp->f_flags &= ~O_NONBLOCK; + + inode->i_atime = inode->i_ctime = CURRENT_TIME; + } + + spin_unlock(&info->lock); + + ret = 0; + if (u_omqstat != NULL && copy_to_user(u_omqstat, &omqstat, + sizeof(struct mq_attr))) + ret = -EFAULT; + +out_fput: + fput(filp); +out: + return ret; +} + +static struct inode_operations mqueue_dir_inode_operations = { + .lookup = simple_lookup, + .create = mqueue_create, + .unlink = simple_unlink, +}; + +static struct file_operations mqueue_file_operations = { + .flush = mqueue_flush_file, +}; + +static struct file_operations mqueue_notify_fops = { + .poll = mqueue_notify_poll, + .read = mqueue_notify_read, + .release = mqueue_notify_release, +}; + + +static struct super_operations mqueue_super_ops = { + .alloc_inode = mqueue_alloc_inode, + .destroy_inode = mqueue_destroy_inode, + .delete_inode = mqueue_delete_inode, + .drop_inode = generic_delete_inode, +}; + +static struct file_system_type mqueue_fs_type = { + .name = "mqueue", + .get_sb = mqueue_get_sb, + .kill_sb = kill_anon_super, +}; + +static int msg_max_limit_min = DFLT_MSGMAX; +static int msg_max_limit_max = HARD_MSGMAX; + +static int msg_maxsize_limit_min = DFLT_MSGSIZEMAX; +static int msg_maxsize_limit_max = INT_MAX; + +static ctl_table mq_sysctls[] = { + { + .ctl_name = CTL_QUEUESMAX, + .procname = "queues_max", + .data = &queues_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_MSGMAX, + .procname = "msg_max", + .data = &msg_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &msg_max_limit_min, + .extra2 = &msg_max_limit_max, + }, + { + .ctl_name = CTL_MSGSIZEMAX, + .procname = "msgsize_max", + .data = &msgsize_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &msg_maxsize_limit_min, + .extra2 = &msg_maxsize_limit_max, + }, + { .ctl_name = 0 } +}; + +static ctl_table mq_sysctl_dir[] = { + { + .ctl_name = FS_MQUEUE, 
+ .procname = "mqueue", + .mode = 0555, + .child = mq_sysctls, + }, + { .ctl_name = 0 } +}; + +static ctl_table mq_sysctl_root[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = mq_sysctl_dir, + }, + { .ctl_name = 0 } +}; + +static int __init init_mqueue_fs(void) +{ + int error; + + mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache", + sizeof(struct mqueue_inode_info), 0, + SLAB_HWCACHE_ALIGN, init_once, NULL); + if (mqueue_inode_cachep == NULL) + return -ENOMEM; + + mq_sysctl_table = register_sysctl_table(mq_sysctl_root, 0); + if (!mq_sysctl_table) { + error = -ENOMEM; + goto out_cache; + } + + error = register_filesystem(&mqueue_fs_type); + if (error) + goto out_sysctl; + + if (IS_ERR(mqueue_mnt = kern_mount(&mqueue_fs_type))) { + error = PTR_ERR(mqueue_mnt); + goto out_filesystem; + } + + /* internal initialization - not common for vfs */ + queues_count = 0; + spin_lock_init(&mq_lock); + + return 0; + +out_filesystem: + unregister_filesystem(&mqueue_fs_type); +out_sysctl: + unregister_sysctl_table(mq_sysctl_table); +out_cache: + if (kmem_cache_destroy(mqueue_inode_cachep)) { + printk(KERN_INFO + "mqueue_inode_cache: not all structures were freed\n"); + } + return error; +} + +__initcall(init_mqueue_fs); diff --git a/kernel/signal.c b/kernel/signal.c index 32992a71683b..e6b7904df68f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2047,6 +2047,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_stime, &to->si_stime); break; case __SI_RT: /* This is not generated by the kernel as of now. 
*/ + case __SI_MESGQ: /* But this is */ err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); -- cgit v1.2.3 From 87c22e8470366e81aa82bcbadaf147c4ecdfb182 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:55:45 -0700 Subject: [PATCH] compat emulation for posix message queues From: Arnd Bergmann I have tested the code with the open posix test suite and found the same four failures for both 64-bit and compat mode, most tests pass. The patch is against -mc1, but I guess it also applies to the other trees around. What worries me more than mq_attr compatibility is the conversion of struct sigevent, which might turn out really hard when more fields in there are used. AFAICS, the only other part in the kernel ABI is sys_timer_create(), so maybe it's not too late to deprecate the current structure and create a structure that can be used properly for compat syscalls. --- arch/ia64/ia32/ia32_signal.c | 7 +- arch/mips/kernel/signal32.c | 7 +- arch/s390/kernel/compat_signal.c | 5 +- arch/sparc64/kernel/signal32.c | 7 +- arch/x86_64/ia32/ia32_signal.c | 6 +- include/asm-ppc64/ppc32.h | 14 --- include/linux/compat.h | 17 ++++ include/linux/mqueue.h | 4 +- include/linux/posix_types.h | 1 + include/linux/syscalls.h | 1 - include/linux/types.h | 1 + ipc/Makefile | 3 +- ipc/compat_mq.c | 196 +++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 5 + 14 files changed, 251 insertions(+), 23 deletions(-) create mode 100644 ipc/compat_mq.c (limited to 'kernel') diff --git a/arch/ia64/ia32/ia32_signal.c b/arch/ia64/ia32/ia32_signal.c index 8b1374c172b6..bb1e836fb227 100644 --- a/arch/ia64/ia32/ia32_signal.c +++ b/arch/ia64/ia32/ia32_signal.c @@ -114,7 +114,12 @@ copy_siginfo_from_user32 (siginfo_t *to, siginfo_t32 *from) err |= __get_user(to->si_band, &from->si_band); err |= __get_user(to->si_fd, &from->si_fd); break; - /* case __SI_RT: This is not generated by the kernel as of now. 
*/ + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: + err |= __get_user(to->si_pid, &from->si_pid); + err |= __get_user(to->si_uid, &from->si_uid); + err |= __get_user(to->si_int, &from->si_int); + break; } } return err; diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index 5c1489f4fdc2..c52074f84300 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -358,7 +358,12 @@ static int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from) err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_int, &to->si_int); + break; } } return err; diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 44fe6e477e92..373040404a5a 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -74,6 +74,10 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { switch (from->si_code >> 16) { + case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ >> 16: + err |= __put_user(from->si_int, &to->si_int); + /* fallthrough */ case __SI_KILL >> 16: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); @@ -96,7 +100,6 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from) break; default: break; - /* case __SI_RT: This is not generated by the kernel as of now. 
*/ } } return err; diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c index cc3019d6dd65..e2f62a666d8c 100644 --- a/arch/sparc64/kernel/signal32.c +++ b/arch/sparc64/kernel/signal32.c @@ -129,7 +129,12 @@ int copy_siginfo_to_user32(siginfo_t32 __user *to, siginfo_t *from) err |= __put_user(from->si_trapno, &to->si_trapno); err |= __put_user((long)from->si_addr, &to->si_addr); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_int, &to->si_int); + break; } } return err; diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index bce5fbc5be2c..1a828de6a55d 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -85,7 +85,11 @@ int ia32_copy_siginfo_to_user(siginfo_t32 __user *to, siginfo_t *from) err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user((u32)(u64)from->si_ptr, &to->si_ptr); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. 
*/ + case __SI_MESGQ: + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_int, &to->si_int); + break; } } return err; diff --git a/include/asm-ppc64/ppc32.h b/include/asm-ppc64/ppc32.h index 53865a8c4f8d..7338ea298a19 100644 --- a/include/asm-ppc64/ppc32.h +++ b/include/asm-ppc64/ppc32.h @@ -141,20 +141,6 @@ struct ucontext32 { struct mcontext32 uc_mcontext; }; -typedef struct compat_sigevent { - compat_sigval_t sigev_value; - int sigev_signo; - int sigev_notify; - union { - int _pad[SIGEV_PAD_SIZE]; - int _tid; - struct { - compat_uptr_t _function; - compat_uptr_t _attribute; - } _sigev_thread; - } _sigev_un; -} compat_sigevent_t; - struct ipc_kludge_32 { unsigned int msgp; int msgtyp; diff --git a/include/linux/compat.h b/include/linux/compat.h index 7b82209ab4ab..796204f59bd9 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -13,6 +13,7 @@ #include #include +#include #define compat_jiffies_to_clock_t(x) \ (((unsigned long)(x) * COMPAT_USER_HZ) / HZ) @@ -90,6 +91,22 @@ typedef union compat_sigval { compat_uptr_t sival_ptr; } compat_sigval_t; +typedef struct compat_sigevent { + compat_sigval_t sigev_value; + compat_int_t sigev_signo; + compat_int_t sigev_notify; + union { + compat_int_t _pad[SIGEV_PAD_SIZE]; + compat_int_t _tid; + + struct { + compat_uptr_t _function; + compat_uptr_t _attribute; + } _sigev_thread; + } _sigev_un; +} compat_sigevent_t; + + long compat_sys_semctl(int first, int second, int third, void __user *uptr); long compat_sys_msgsnd(int first, int second, int third, void __user *uptr); long compat_sys_msgrcv(int first, int second, int msgtyp, int third, diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h index fdab3b8ee242..fc40b774b913 100644 --- a/include/linux/mqueue.h +++ b/include/linux/mqueue.h @@ -18,9 +18,9 @@ #ifndef _LINUX_MQUEUE_H #define _LINUX_MQUEUE_H -#define MQ_PRIO_MAX 32768 +#include -typedef int mqd_t; +#define MQ_PRIO_MAX 32768 struct mq_attr { long mq_flags; /* message 
queue flags */ diff --git a/include/linux/posix_types.h b/include/linux/posix_types.h index 3ee2ed9de1db..f04c98cf44f3 100644 --- a/include/linux/posix_types.h +++ b/include/linux/posix_types.h @@ -42,6 +42,7 @@ typedef void (*__kernel_sighandler_t)(int); /* Type of a SYSV IPC key. */ typedef int __kernel_key_t; +typedef int __kernel_mqd_t; #include diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7ee5f67abb5f..89ffe55898f2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -48,7 +48,6 @@ struct timex; struct timezone; struct tms; struct utimbuf; -typedef int mqd_t; struct mq_attr; #include diff --git a/include/linux/types.h b/include/linux/types.h index 3b407b06b48f..93f5f3653561 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -31,6 +31,7 @@ typedef __kernel_key_t key_t; typedef __kernel_suseconds_t suseconds_t; typedef __kernel_timer_t timer_t; typedef __kernel_clockid_t clockid_t; +typedef __kernel_mqd_t mqd_t; #ifdef __KERNEL__ typedef __kernel_uid32_t uid_t; diff --git a/ipc/Makefile b/ipc/Makefile index 913790207d85..0a6d626cd794 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o -obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o +obj_mq-$(CONFIG_COMPAT) += compat_mq.o +obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c new file mode 100644 index 000000000000..1520df89c424 --- /dev/null +++ b/ipc/compat_mq.c @@ -0,0 +1,196 @@ +/* + * ipc/compat_mq.c + * 32 bit emulation for POSIX message queue system calls + * + * Copyright (C) 2004 IBM Deutschland Entwicklung GmbH, IBM Corporation + * Author: Arnd Bergmann + */ + +#include +#include +#include +#include +#include + +#include + +struct compat_mq_attr { + compat_long_t mq_flags; /* message queue flags */ + compat_long_t mq_maxmsg; /* maximum number of messages */ + compat_long_t 
mq_msgsize; /* maximum message size */ + compat_long_t mq_curmsgs; /* number of messages currently queued */ + compat_long_t __reserved[4]; /* ignored for input, zeroed for output */ +}; + +static inline int get_compat_mq_attr(struct mq_attr *attr, + const struct compat_mq_attr __user *uattr) +{ + if (verify_area(VERIFY_READ, uattr, sizeof *uattr)) + return -EFAULT; + + return __get_user(attr->mq_flags, &uattr->mq_flags) + | __get_user(attr->mq_maxmsg, &uattr->mq_maxmsg) + | __get_user(attr->mq_msgsize, &uattr->mq_msgsize) + | __get_user(attr->mq_curmsgs, &uattr->mq_curmsgs); +} + +static inline int put_compat_mq_attr(const struct mq_attr *attr, + struct compat_mq_attr __user *uattr) +{ + if (clear_user(uattr, sizeof *uattr)) + return -EFAULT; + + return __put_user(attr->mq_flags, &uattr->mq_flags) + | __put_user(attr->mq_maxmsg, &uattr->mq_maxmsg) + | __put_user(attr->mq_msgsize, &uattr->mq_msgsize) + | __put_user(attr->mq_curmsgs, &uattr->mq_curmsgs); +} + +asmlinkage long compat_sys_mq_open(const char __user *u_name, + int oflag, compat_mode_t mode, + struct compat_mq_attr __user *u_attr) +{ + struct mq_attr attr; + mm_segment_t oldfs; + char *name; + long ret; + + if ((oflag & O_CREAT) == 0 || !u_attr) + return sys_mq_open(u_name, oflag, mode, 0); + + if (get_compat_mq_attr(&attr, u_attr)) + return -EFAULT; + + name = getname(u_name); + if (IS_ERR(name)) + return PTR_ERR(name); + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_mq_open(name, oflag, mode, &attr); + set_fs(oldfs); + + putname(name); + return ret; +} + +static struct timespec __user *compat_prepare_timeout( + const struct compat_timespec __user *u_abs_timeout) +{ + struct timespec ts, __user *u_ts; + + if (!u_abs_timeout) + return 0; + + u_ts = compat_alloc_user_space(sizeof(*u_ts)); + if (get_compat_timespec(&ts, u_abs_timeout) + || copy_to_user(u_ts, &ts, sizeof(*u_ts))) + return ERR_PTR(-EFAULT); + + return u_ts; +} + +asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, + const char __user 
*u_msg_ptr, + size_t msg_len, unsigned int msg_prio, + const struct compat_timespec __user *u_abs_timeout) +{ + struct timespec __user *u_ts; + + u_ts = compat_prepare_timeout(u_abs_timeout); + if (IS_ERR(u_ts)) + return -EFAULT; + + return sys_mq_timedsend(mqdes, u_msg_ptr, msg_len, + msg_prio, u_ts); +} + +asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes, + char __user *u_msg_ptr, + size_t msg_len, unsigned int __user *u_msg_prio, + const struct compat_timespec __user *u_abs_timeout) +{ + struct timespec *u_ts; + + u_ts = compat_prepare_timeout(u_abs_timeout); + if (IS_ERR(u_ts)) + return -EFAULT; + + return sys_mq_timedreceive(mqdes, u_msg_ptr, msg_len, + u_msg_prio, u_ts); +} + +static int get_compat_sigevent(struct sigevent *event, + const struct compat_sigevent __user *u_event) +{ + if (verify_area(VERIFY_READ, u_event, sizeof(*u_event))) + return -EFAULT; + + return __get_user(event->sigev_value.sival_int, + &u_event->sigev_value.sival_int) + | __get_user(event->sigev_signo, &u_event->sigev_signo) + | __get_user(event->sigev_notify, &u_event->sigev_notify) + | __get_user(event->sigev_notify_thread_id, + &u_event->sigev_notify_thread_id); +} + +asmlinkage long compat_sys_mq_notify(mqd_t mqdes, + const struct compat_sigevent __user *u_notification) +{ + mm_segment_t oldfs; + struct sigevent notification; + char cookie[NOTIFY_COOKIE_LEN]; + compat_uptr_t u_cookie; + long ret; + + if (!u_notification) + return sys_mq_notify(mqdes, 0); + + if (get_compat_sigevent(&notification, u_notification)) + return -EFAULT; + + if (notification.sigev_notify == SIGEV_THREAD) { + u_cookie = (compat_uptr_t)notification.sigev_value.sival_int; + if (copy_from_user(cookie, compat_ptr(u_cookie), + NOTIFY_COOKIE_LEN)) { + return -EFAULT; + } + notification.sigev_value.sival_ptr = cookie; + } + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_mq_notify(mqdes, &notification); + set_fs(oldfs); + + return ret; +} + +asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, + 
const 
struct compat_mq_attr __user *u_mqstat, + struct compat_mq_attr __user *u_omqstat) +{ + struct mq_attr mqstat, omqstat; + struct mq_attr *p_mqstat = 0, *p_omqstat = 0; + mm_segment_t oldfs; + long ret; + + if (u_mqstat) { + p_mqstat = &mqstat; + if (get_compat_mq_attr(p_mqstat, u_mqstat)) + return -EFAULT; + } + + if (u_omqstat) + p_omqstat = &omqstat; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_mq_getsetattr(mqdes, p_mqstat, p_omqstat); + set_fs(oldfs); + + if (ret) + return ret; + + return (u_omqstat) ? put_compat_mq_attr(&omqstat, u_omqstat) : 0; +} diff --git a/kernel/sys.c b/kernel/sys.c index 7d1bf5c57aca..81f9e02f2071 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -266,6 +266,11 @@ cond_syscall(sys_mq_timedsend) cond_syscall(sys_mq_timedreceive) cond_syscall(sys_mq_notify) cond_syscall(sys_mq_getsetattr) +cond_syscall(compat_sys_mq_open) +cond_syscall(compat_sys_mq_timedsend) +cond_syscall(compat_sys_mq_timedreceive) +cond_syscall(compat_sys_mq_notify) +cond_syscall(compat_sys_mq_getsetattr) /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read) -- cgit v1.2.3 From 7860b37198b0650f51bfafebac820386b552a071 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:56:46 -0700 Subject: [PATCH] move job control fields from task_struct to signal_struct From: Roland McGrath This patch moves all the fields relating to job control from task_struct to signal_struct, so that all this info is properly per-process rather than being per-thread. 
--- arch/ia64/kernel/unaligned.c | 2 +- arch/sparc64/solaris/misc.c | 2 +- drivers/char/n_tty.c | 3 +- drivers/char/rocket.c | 2 +- drivers/char/sx.c | 2 +- drivers/char/tty_io.c | 116 +++++++++++++++++----------------------- drivers/char/vt.c | 2 +- drivers/char/vt_ioctl.c | 3 +- drivers/net/slip.c | 2 +- drivers/s390/char/keyboard.c | 2 +- fs/binfmt_elf.c | 4 +- fs/compat_ioctl.c | 2 +- fs/dquot.c | 10 ++-- fs/exec.c | 5 ++ fs/open.c | 2 +- fs/proc/array.c | 22 ++++---- include/linux/sched.h | 17 +++--- kernel/acct.c | 2 +- kernel/exit.c | 22 ++++---- kernel/fork.c | 10 ++-- kernel/pid.c | 8 +-- kernel/signal.c | 5 +- kernel/sys.c | 18 +++---- net/bridge/netfilter/ebtables.c | 2 +- net/ipv4/netfilter/ipt_owner.c | 2 +- net/ipv6/netfilter/ip6t_owner.c | 2 +- 26 files changed, 133 insertions(+), 136 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c index 2247254be7ac..b1a68e4367bc 100644 --- a/arch/ia64/kernel/unaligned.c +++ b/arch/ia64/kernel/unaligned.c @@ -1337,7 +1337,7 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) * be holding locks... */ if (user_mode(regs)) - tty_write_message(current->tty, buf); + tty_write_message(current->signal->tty, buf); buf[len-1] = '\0'; /* drop '\r' */ printk(KERN_WARNING "%s", buf); /* watch for command names containing %s */ } diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c index ea7b2c439653..cea38c0cbb5c 100644 --- a/arch/sparc64/solaris/misc.c +++ b/arch/sparc64/solaris/misc.c @@ -402,7 +402,7 @@ asmlinkage int solaris_procids(int cmd, s32 pid, s32 pgid) Solaris setpgrp and setsid? 
*/ ret = sys_setpgid(0, 0); if (ret) return ret; - current->tty = NULL; + current->signal->tty = NULL; return process_group(current); } case 2: /* getsid */ diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c index 0c02e2debbb1..08f46259e183 100644 --- a/drivers/char/n_tty.c +++ b/drivers/char/n_tty.c @@ -999,7 +999,8 @@ do_it_again: /* NOTE: not yet done after every sleep pending a thorough check of the logic of this change. -- jlc */ /* don't stop on /dev/console */ - if (file->f_op->write != redirected_tty_write && current->tty == tty) { + if (file->f_op->write != redirected_tty_write && + current->signal->tty == tty) { if (tty->pgrp <= 0) printk("read_chan: tty->pgrp <= 0!\n"); else if (process_group(current) != tty->pgrp) { diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c index 38544de9fbd9..b0da37eab8e7 100644 --- a/drivers/char/rocket.c +++ b/drivers/char/rocket.c @@ -953,7 +953,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp) /* * Info->count is now 1; so it's safe to sleep now. */ - info->session = current->session; + info->session = current->signal->session; info->pgrp = process_group(current); if ((info->flags & ROCKET_INITIALIZED) == 0) { diff --git a/drivers/char/sx.c b/drivers/char/sx.c index 25c95fbc65d3..643163b08a8f 100644 --- a/drivers/char/sx.c +++ b/drivers/char/sx.c @@ -1420,7 +1420,7 @@ static int sx_open (struct tty_struct * tty, struct file * filp) line = tty->index; sx_dprintk (SX_DEBUG_OPEN, "%d: opening line %d. 
tty=%p ctty=%p, np=%d)\n", - current->pid, line, tty, current->tty, sx_nports); + current->pid, line, tty, current->signal->tty, sx_nports); if ((line < 0) || (line >= SX_NPORTS) || (line >= sx_nports)) return -ENODEV; diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 0ba52078f637..e4607d86a755 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -321,7 +321,7 @@ struct tty_driver *get_tty_driver(dev_t device, int *index) */ int tty_check_change(struct tty_struct * tty) { - if (current->tty != tty) + if (current->signal->tty != tty) return 0; if (tty->pgrp <= 0) { printk(KERN_WARNING "tty_check_change: tty->pgrp <= 0!\n"); @@ -486,17 +486,14 @@ void do_tty_hangup(void *data) if (tty->session > 0) { struct list_head *l; for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) { - task_t *task = p; - do { - if (task->tty == tty) - task->tty = NULL; - if (task->leader) { - send_group_sig_info(SIGHUP, SEND_SIG_PRIV, task); - send_group_sig_info(SIGCONT, SEND_SIG_PRIV, task); - } - } while_each_thread(p, task); + if (p->signal->tty == tty) + p->signal->tty = NULL; + if (!p->signal->leader) + continue; + send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p); + send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p); if (tty->pgrp > 0) - p->tty_old_pgrp = tty->pgrp; + p->signal->tty_old_pgrp = tty->pgrp; } } read_unlock(&tasklist_lock); @@ -575,15 +572,15 @@ void disassociate_ctty(int on_exit) lock_kernel(); - tty = current->tty; + tty = current->signal->tty; if (tty) { tty_pgrp = tty->pgrp; if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY) tty_vhangup(tty); } else { - if (current->tty_old_pgrp) { - kill_pg(current->tty_old_pgrp, SIGHUP, on_exit); - kill_pg(current->tty_old_pgrp, SIGCONT, on_exit); + if (current->signal->tty_old_pgrp) { + kill_pg(current->signal->tty_old_pgrp, SIGHUP, on_exit); + kill_pg(current->signal->tty_old_pgrp, SIGCONT, on_exit); } unlock_kernel(); return; @@ -594,17 +591,13 @@ void disassociate_ctty(int on_exit) kill_pg(tty_pgrp, 
SIGCONT, on_exit); } - current->tty_old_pgrp = 0; + current->signal->tty_old_pgrp = 0; tty->session = 0; tty->pgrp = -1; read_lock(&tasklist_lock); - for_each_task_pid(current->session, PIDTYPE_SID, p, l, pid) { - task_t *task = p; - do { - task->tty = NULL; - } while_each_thread(p, task); - } + for_each_task_pid(current->signal->session, PIDTYPE_SID, p, l, pid) + p->signal->tty = NULL; read_unlock(&tasklist_lock); unlock_kernel(); } @@ -1257,20 +1250,11 @@ static void release_dev(struct file * filp) struct pid *pid; read_lock(&tasklist_lock); - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) { - task_t *task = p; - do { - task->tty = NULL; - } while_each_thread(p, task); - } - if (o_tty) { - for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid) { - task_t *task = p; - do { - task->tty = NULL; - } while_each_thread(p, task); - } - } + for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + p->signal->tty = NULL; + if (o_tty) + for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid) + p->signal->tty = NULL; read_unlock(&tasklist_lock); } @@ -1341,10 +1325,10 @@ static int tty_open(struct inode * inode, struct file * filp) retry_open: noctty = filp->f_flags & O_NOCTTY; if (device == MKDEV(TTYAUX_MAJOR,0)) { - if (!current->tty) + if (!current->signal->tty) return -ENXIO; - driver = current->tty->driver; - index = current->tty->index; + driver = current->signal->tty->driver; + index = current->signal->tty->index; filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ /* noctty = 1; */ goto got_driver; @@ -1445,14 +1429,14 @@ got_driver: goto retry_open; } if (!noctty && - current->leader && - !current->tty && + current->signal->leader && + !current->signal->tty && tty->session == 0) { task_lock(current); - current->tty = tty; + current->signal->tty = tty; task_unlock(current); - current->tty_old_pgrp = 0; - tty->session = current->session; + current->signal->tty_old_pgrp = 0; + tty->session = current->signal->session; tty->pgrp = 
process_group(current); } return 0; @@ -1510,7 +1494,7 @@ static int tiocsti(struct tty_struct *tty, char * arg) { char ch, mbz = 0; - if ((current->tty != tty) && !capable(CAP_SYS_ADMIN)) + if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ch, arg)) return -EFAULT; @@ -1601,14 +1585,14 @@ static int tiocsctty(struct tty_struct *tty, int arg) struct pid *pid; task_t *p; - if (current->leader && - (current->session == tty->session)) + if (current->signal->leader && + (current->signal->session == tty->session)) return 0; /* * The process must be a session leader and * not have a controlling tty already. */ - if (!current->leader || current->tty) + if (!current->signal->leader || current->signal->tty) return -EPERM; if (tty->session > 0) { /* @@ -1621,21 +1605,17 @@ static int tiocsctty(struct tty_struct *tty, int arg) */ read_lock(&tasklist_lock); - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) { - task_t *task = p; - do { - task->tty = NULL; - } while_each_thread(p, task); - } + for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + p->signal->tty = NULL; read_unlock(&tasklist_lock); } else return -EPERM; } task_lock(current); - current->tty = tty; + current->signal->tty = tty; task_unlock(current); - current->tty_old_pgrp = 0; - tty->session = current->session; + current->signal->tty_old_pgrp = 0; + tty->session = current->signal->session; tty->pgrp = process_group(current); return 0; } @@ -1646,7 +1626,7 @@ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t * (tty == real_tty) is a cheap way of * testing if the tty is NOT a master pty. 
*/ - if (tty == real_tty && current->tty != real_tty) + if (tty == real_tty && current->signal->tty != real_tty) return -ENOTTY; return put_user(real_tty->pgrp, arg); } @@ -1660,15 +1640,15 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t return -ENOTTY; if (retval) return retval; - if (!current->tty || - (current->tty != real_tty) || - (real_tty->session != current->session)) + if (!current->signal->tty || + (current->signal->tty != real_tty) || + (real_tty->session != current->signal->session)) return -ENOTTY; if (get_user(pgrp, (pid_t *) arg)) return -EFAULT; if (pgrp < 0) return -EINVAL; - if (session_of_pgrp(pgrp) != current->session) + if (session_of_pgrp(pgrp) != current->signal->session) return -EPERM; real_tty->pgrp = pgrp; return 0; @@ -1680,7 +1660,7 @@ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t * * (tty == real_tty) is a cheap way of * testing if the tty is NOT a master pty. */ - if (tty == real_tty && current->tty != real_tty) + if (tty == real_tty && current->signal->tty != real_tty) return -ENOTTY; if (real_tty->session <= 0) return -ENOTTY; @@ -1838,12 +1818,12 @@ int tty_ioctl(struct inode * inode, struct file * file, clear_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCNOTTY: - if (current->tty != tty) + if (current->signal->tty != tty) return -ENOTTY; - if (current->leader) + if (current->signal->leader) disassociate_ctty(0); task_lock(current); - current->tty = NULL; + current->signal->tty = NULL; task_unlock(current); return 0; case TIOCSCTTY: @@ -1947,9 +1927,9 @@ static void __do_SAK(void *arg) tty->driver->flush_buffer(tty); read_lock(&tasklist_lock); for_each_task_pid(session, PIDTYPE_SID, p, l, pid) { - if (p->tty == tty || session > 0) { + if (p->signal->tty == tty || session > 0) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): p->session==tty->session\n", + " (%s): p->signal->session==tty->session\n", p->pid, p->comm); send_sig(SIGKILL, p, 1); continue; 
diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 2febed52e19f..a1a59abc915c 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -2278,7 +2278,7 @@ int tioclinux(struct tty_struct *tty, unsigned long arg) if (tty->driver->type != TTY_DRIVER_TYPE_CONSOLE) return -EINVAL; - if (current->tty != tty && !capable(CAP_SYS_ADMIN)) + if (current->signal->tty != tty && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(type, (char *)arg)) return -EFAULT; diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c index d8c6acc8e62c..0685fe7be2d1 100644 --- a/drivers/char/vt_ioctl.c +++ b/drivers/char/vt_ioctl.c @@ -382,7 +382,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG. */ perm = 0; - if (current->tty == tty || capable(CAP_SYS_TTY_CONFIG)) + if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG)) perm = 1; kbd = kbd_table + console; @@ -1221,4 +1221,3 @@ void change_console(unsigned int new_console) complete_change_console(new_console); } - diff --git a/drivers/net/slip.c b/drivers/net/slip.c index 601df52ebb29..e783ac0fa71e 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -1307,7 +1307,7 @@ static int sl_ioctl(struct net_device *dev,struct ifreq *rq,int cmd) /* Resolve race condition, when ioctl'ing hanged up and opened by another process device. */ - if (sl->tty != current->tty && sl->pid != current->pid) { + if (sl->tty != current->signal->tty && sl->pid != current->pid) { spin_unlock_bh(&sl->lock); return -EPERM; } diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c index 892ebc7739b0..b124ebb7fc9b 100644 --- a/drivers/s390/char/keyboard.c +++ b/drivers/s390/char/keyboard.c @@ -471,7 +471,7 @@ kbd_ioctl(struct kbd_data *kbd, struct file *file, * To have permissions to do most of the vt ioctls, we either have * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG. 
*/ - perm = current->tty == kbd->tty || capable(CAP_SYS_TTY_CONFIG); + perm = current->signal->tty == kbd->tty || capable(CAP_SYS_TTY_CONFIG); switch (cmd) { case KDGKBTYPE: return put_user(KB_101, (char*) arg); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9cc7cc648b42..e5b79a294c80 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1129,7 +1129,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pid = p->pid; prstatus->pr_ppid = p->parent->pid; prstatus->pr_pgrp = process_group(p); - prstatus->pr_sid = p->session; + prstatus->pr_sid = p->signal->session; jiffies_to_timeval(p->utime, &prstatus->pr_utime); jiffies_to_timeval(p->stime, &prstatus->pr_stime); jiffies_to_timeval(p->cutime, &prstatus->pr_cutime); @@ -1157,7 +1157,7 @@ static void fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pid = p->pid; psinfo->pr_ppid = p->parent->pid; psinfo->pr_pgrp = process_group(p); - psinfo->pr_sid = p->session; + psinfo->pr_sid = p->signal->session; i = p->state ? ffz(~p->state) + 1 : 0; psinfo->pr_state = i; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 23baed6180ff..de45d833d0f4 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1604,7 +1604,7 @@ static int vt_check(struct file *file) * To have permissions to do most of the vt ioctls, we either have * to be the owner of the tty, or super-user. 
*/ - if (current->tty == tty || capable(CAP_SYS_ADMIN)) + if (current->signal->tty == tty || capable(CAP_SYS_ADMIN)) return 1; return 0; } diff --git a/fs/dquot.c b/fs/dquot.c index e6b39e66207a..5749044d028e 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -689,12 +689,12 @@ static void print_warning(struct dquot *dquot, const char warntype) if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags))) return; - tty_write_message(current->tty, dquot->dq_sb->s_id); + tty_write_message(current->signal->tty, dquot->dq_sb->s_id); if (warntype == ISOFTWARN || warntype == BSOFTWARN) - tty_write_message(current->tty, ": warning, "); + tty_write_message(current->signal->tty, ": warning, "); else - tty_write_message(current->tty, ": write failed, "); - tty_write_message(current->tty, quotatypes[dquot->dq_type]); + tty_write_message(current->signal->tty, ": write failed, "); + tty_write_message(current->signal->tty, quotatypes[dquot->dq_type]); switch (warntype) { case IHARDWARN: msg = " file limit reached.\n"; @@ -715,7 +715,7 @@ static void print_warning(struct dquot *dquot, const char warntype) msg = " block quota exceeded.\n"; break; } - tty_write_message(current->tty, msg); + tty_write_message(current->signal->tty, msg); } static inline void flush_warnings(struct dquot **dquots, char *warntype) diff --git a/fs/exec.c b/fs/exec.c index 225afb0d94e5..62bf2c537abd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -601,6 +601,11 @@ static inline int de_thread(struct task_struct *tsk) newsig->group_stop_count = 0; newsig->curr_target = NULL; init_sigpending(&newsig->shared_pending); + + newsig->pgrp = oldsig->pgrp; + newsig->session = oldsig->session; + newsig->leader = oldsig->leader; + newsig->tty_old_pgrp = oldsig->tty_old_pgrp; } if (thread_group_empty(current)) diff --git a/fs/open.c b/fs/open.c index 9a9ce5be4dbc..ce11096afcad 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1037,7 +1037,7 @@ EXPORT_SYMBOL(sys_close); asmlinkage long sys_vhangup(void) { if 
(capable(CAP_SYS_TTY_CONFIG)) { - tty_vhangup(current->tty); + tty_vhangup(current->signal->tty); return 0; } return -EPERM; diff --git a/fs/proc/array.c b/fs/proc/array.c index 7af62577287e..ac9ccac5d1ee 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -168,7 +168,7 @@ static inline char * task_state(struct task_struct *p, char *buffer) p->pid && p->ptrace ? p->parent->pid : 0, p->uid, p->euid, p->suid, p->fsuid, p->gid, p->egid, p->sgid, p->fsgid); - read_unlock(&tasklist_lock); + read_unlock(&tasklist_lock); task_lock(p); buffer += sprintf(buffer, "FDSize:\t%d\n" @@ -301,7 +301,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer) sigset_t sigign, sigcatch; char state; int res; - pid_t ppid; + pid_t ppid, pgid = -1, sid = -1; int num_threads = 0; struct mm_struct *mm; @@ -311,10 +311,6 @@ int proc_pid_stat(struct task_struct *task, char * buffer) mm = task->mm; if(mm) mm = mmgrab(mm); - if (task->tty) { - tty_pgrp = task->tty->pgrp; - tty_nr = new_encode_dev(tty_devnum(task->tty)); - } task_unlock(task); if (mm) { down_read(&mm->mmap_sem); @@ -335,7 +331,15 @@ int proc_pid_stat(struct task_struct *task, char * buffer) collect_sigign_sigcatch(task, &sigign, &sigcatch); spin_unlock_irq(&task->sighand->siglock); } - read_unlock(&tasklist_lock); + if (task->signal) { + if (task->signal->tty) { + tty_pgrp = task->signal->tty->pgrp; + tty_nr = new_encode_dev(tty_devnum(task->signal->tty)); + } + pgid = process_group(task); + sid = task->signal->session; + } + read_unlock(&tasklist_lock); /* scale priority and nice values from timeslices to -20..20 */ /* to make it look like a "normal" Unix priority/nice value */ @@ -352,8 +356,8 @@ int proc_pid_stat(struct task_struct *task, char * buffer) task->comm, state, ppid, - process_group(task), - task->session, + pgid, + sid, tty_nr, tty_pgrp, task->flags, diff --git a/include/linux/sched.h b/include/linux/sched.h index 054b3c0d5962..5a1229121123 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ 
-269,6 +269,15 @@ struct signal_struct { /* thread group stop support, overloads group_exit_code too */ int group_stop_count; + + /* job control IDs */ + pid_t pgrp; + pid_t tty_old_pgrp; + pid_t session; + /* boolean value for session group leader */ + int leader; + + struct tty_struct *tty; /* NULL if no tty */ }; /* @@ -398,12 +407,7 @@ struct task_struct { unsigned long personality; int did_exec:1; pid_t pid; - pid_t __pgrp; /* Accessed via process_group() */ - pid_t tty_old_pgrp; - pid_t session; pid_t tgid; - /* boolean value for session group leader */ - int leader; /* * pointers to (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with @@ -446,7 +450,6 @@ struct task_struct { char comm[16]; /* file system info */ int link_count, total_link_count; - struct tty_struct *tty; /* NULL if no tty */ /* ipc stuff */ struct sysv_sem sysvsem; /* CPU-specific state of this task */ @@ -499,7 +502,7 @@ struct task_struct { static inline pid_t process_group(struct task_struct *tsk) { - return tsk->group_leader->__pgrp; + return tsk->signal->pgrp; } extern void __put_task_struct(struct task_struct *tsk); diff --git a/kernel/acct.c b/kernel/acct.c index 9dbab88b2d31..b417066778a7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -347,7 +347,7 @@ static void do_acct_process(long exitcode, struct file *file) /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; - ac.ac_tty = current->tty ? old_encode_dev(tty_devnum(current->tty)) : 0; + ac.ac_tty = current->signal->tty ? 
old_encode_dev(tty_devnum(current->signal->tty)) : 0; ac.ac_flag = 0; if (current->flags & PF_FORKNOEXEC) diff --git a/kernel/exit.c b/kernel/exit.c index 308f6959add6..810eebd77559 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -136,13 +136,13 @@ int session_of_pgrp(int pgrp) read_lock(&tasklist_lock); for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) - if (p->session > 0) { - sid = p->session; + if (p->signal->session > 0) { + sid = p->signal->session; goto out; } p = find_task_by_pid(pgrp); if (p) - sid = p->session; + sid = p->signal->session; out: read_unlock(&tasklist_lock); @@ -170,7 +170,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) || p->real_parent->pid == 1) continue; if (process_group(p->real_parent) != pgrp - && p->real_parent->session == p->session) { + && p->real_parent->signal->session == p->signal->session) { ret = 0; break; } @@ -259,14 +259,14 @@ void __set_special_pids(pid_t session, pid_t pgrp) { struct task_struct *curr = current; - if (curr->session != session) { + if (curr->signal->session != session) { detach_pid(curr, PIDTYPE_SID); - curr->session = session; + curr->signal->session = session; attach_pid(curr, PIDTYPE_SID, session); } if (process_group(curr) != pgrp) { detach_pid(curr, PIDTYPE_PGID); - curr->group_leader->__pgrp = pgrp; + curr->signal->pgrp = pgrp; attach_pid(curr, PIDTYPE_PGID, pgrp); } } @@ -341,7 +341,7 @@ void daemonize(const char *name, ...) exit_mm(current); set_special_pids(1, 1); - current->tty = NULL; + current->signal->tty = NULL; /* Block and flush all signals */ sigfillset(&blocked); @@ -564,7 +564,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) * outside, so the child pgrp is now orphaned. 
*/ if ((process_group(p) != process_group(father)) && - (p->session == father->session)) { + (p->signal->session == father->signal->session)) { int pgrp = process_group(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { @@ -675,7 +675,7 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; if ((process_group(t) != process_group(tsk)) && - (t->session == tsk->session) && + (t->signal->session == tsk->signal->session) && will_become_orphaned_pgrp(process_group(tsk), tsk) && has_stopped_jobs(process_group(tsk))) { __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); @@ -780,7 +780,7 @@ asmlinkage NORET_TYPE void do_exit(long code) exit_itimers(tsk); exit_thread(); - if (tsk->leader) + if (tsk->signal->leader) disassociate_ctty(1); module_put(tsk->thread_info->exec_domain->module); diff --git a/kernel/fork.c b/kernel/fork.c index a1f20cabbdd3..d2dd97e866bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -811,6 +811,12 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->curr_target = NULL; init_sigpending(&sig->shared_pending); + sig->tty = current->signal->tty; + sig->pgrp = process_group(current); + sig->session = current->signal->session; + sig->leader = 0; /* session leadership doesn't inherit */ + sig->tty_old_pgrp = 0; + return 0; } @@ -935,8 +941,6 @@ struct task_struct *copy_process(unsigned long clone_flags, init_timer(&p->real_timer); p->real_timer.data = (unsigned long) p; - p->leader = 0; /* session leadership doesn't inherit */ - p->tty_old_pgrp = 0; p->utime = p->stime = 0; p->cutime = p->cstime = 0; p->lock_depth = -1; /* -1 = no lock */ @@ -1055,7 +1059,7 @@ struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { attach_pid(p, PIDTYPE_TGID, p->tgid); attach_pid(p, PIDTYPE_PGID, process_group(p)); - attach_pid(p, PIDTYPE_SID, p->session); + attach_pid(p, PIDTYPE_SID, p->signal->session); if (p->pid) __get_cpu_var(process_counts)++; } else 
diff --git a/kernel/pid.c b/kernel/pid.c index 4c85144759c5..6ed44f56ca45 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -253,14 +253,14 @@ void switch_exec_pids(task_t *leader, task_t *thread) attach_pid(thread, PIDTYPE_PID, thread->pid); attach_pid(thread, PIDTYPE_TGID, thread->tgid); - attach_pid(thread, PIDTYPE_PGID, leader->__pgrp); - attach_pid(thread, PIDTYPE_SID, thread->session); + attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp); + attach_pid(thread, PIDTYPE_SID, thread->signal->session); list_add_tail(&thread->tasks, &init_task.tasks); attach_pid(leader, PIDTYPE_PID, leader->pid); attach_pid(leader, PIDTYPE_TGID, leader->tgid); - attach_pid(leader, PIDTYPE_PGID, leader->__pgrp); - attach_pid(leader, PIDTYPE_SID, leader->session); + attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp); + attach_pid(leader, PIDTYPE_SID, leader->signal->session); } /* diff --git a/kernel/signal.c b/kernel/signal.c index e6b7904df68f..7a4b479a6f45 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -588,7 +588,8 @@ static int check_kill_permission(int sig, struct siginfo *info, error = -EPERM; if ((!info || ((unsigned long)info != 1 && (unsigned long)info != 2 && SI_FROMUSER(info))) - && ((sig != SIGCONT) || (current->session != t->session)) + && ((sig != SIGCONT) || + (current->signal->session != t->signal->session)) && (current->euid ^ t->suid) && (current->euid ^ t->uid) && (current->uid ^ t->suid) && (current->uid ^ t->uid) && !capable(CAP_KILL)) @@ -1103,7 +1104,7 @@ kill_sl_info(int sig, struct siginfo *info, pid_t sid) retval = -ESRCH; read_lock(&tasklist_lock); for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) { - if (!p->leader) + if (!p->signal->leader) continue; err = group_send_sig_info(sig, info, p); if (retval) diff --git a/kernel/sys.c b/kernel/sys.c index 81f9e02f2071..9d57482758f3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -990,7 +990,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (p->parent == current || p->real_parent == current) { 
err = -EPERM; - if (p->session != current->session) + if (p->signal->session != current->signal->session) goto out; err = -EACCES; if (p->did_exec) @@ -1002,7 +1002,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) } err = -EPERM; - if (p->leader) + if (p->signal->leader) goto out; if (pgid != pid) { @@ -1011,7 +1011,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) struct list_head *l; for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid) - if (p->session == current->session) + if (p->signal->session == current->signal->session) goto ok_pgid; goto out; } @@ -1023,7 +1023,7 @@ ok_pgid: if (process_group(p) != pgid) { detach_pid(p, PIDTYPE_PGID); - p->group_leader->__pgrp = pgid; + p->signal->pgrp = pgid; attach_pid(p, PIDTYPE_PGID, pgid); } @@ -1065,7 +1065,7 @@ asmlinkage long sys_getpgrp(void) asmlinkage long sys_getsid(pid_t pid) { if (!pid) { - return current->session; + return current->signal->session; } else { int retval; struct task_struct *p; @@ -1077,7 +1077,7 @@ asmlinkage long sys_getsid(pid_t pid) if(p) { retval = security_task_getsid(p); if (!retval) - retval = p->session; + retval = p->signal->session; } read_unlock(&tasklist_lock); return retval; @@ -1098,10 +1098,10 @@ asmlinkage long sys_setsid(void) if (pid) goto out; - current->leader = 1; + current->signal->leader = 1; __set_special_pids(current->pid, current->pid); - current->tty = NULL; - current->tty_old_pgrp = 0; + current->signal->tty = NULL; + current->signal->tty_old_pgrp = 0; err = process_group(current); out: write_unlock_irq(&tasklist_lock); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 33b687d60efe..f76563312ee4 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -46,7 +46,7 @@ static void print_string(char *str) struct tty_struct *my_tty; /* The tty for the current task */ - my_tty = current->tty; + my_tty = current->signal->tty; if (my_tty != NULL) { my_tty->driver->write(my_tty, 0, str, strlen(str)); 
my_tty->driver->write(my_tty, 0, "\015\012", 2); diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index a1529896ec1b..91c3fd3f1f8f 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c @@ -95,7 +95,7 @@ match_sid(const struct sk_buff *skb, pid_t sid) read_lock(&tasklist_lock); do_each_thread(g, p) { struct files_struct *files; - if (p->session != sid) + if (p->signal->session != sid) continue; task_lock(p); diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c index 02e5ee4e7418..0bb9c661b73c 100644 --- a/net/ipv6/netfilter/ip6t_owner.c +++ b/net/ipv6/netfilter/ip6t_owner.c @@ -61,7 +61,7 @@ match_sid(const struct sk_buff *skb, pid_t sid) read_lock(&tasklist_lock); do_each_thread(g, p) { struct files_struct *files; - if (p->session != sid) + if (p->signal->session != sid) continue; task_lock(p); -- cgit v1.2.3 From af70f7673155616ffd004d551e1b612002a58bf0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:56:59 -0700 Subject: [PATCH] Fix page allocator lower zone protection for NUMA From: Martin Hicks This changes __alloc_pages() so it uses precalculated values for the "min". This should prevent the problem of min incrementing from zone to zone across many nodes on a NUMA machine. The result of falling back to other nodes with the old incremental min calculations was that the min value became very large. --- include/linux/mmzone.h | 39 ++++++++++--- kernel/sysctl.c | 2 +- mm/page_alloc.c | 150 +++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 159 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b5398fa7be88..51b8f3f67741 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -54,6 +54,15 @@ struct per_cpu_pageset { struct per_cpu_pages pcp[2]; /* 0: hot. 
1: cold */ } ____cacheline_aligned_in_smp; +#define ZONE_DMA 0 +#define ZONE_NORMAL 1 +#define ZONE_HIGHMEM 2 + +#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */ +#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */ + +#define GFP_ZONEMASK 0x03 + /* * On machines where it is needed (eg PCs) we divide physical memory * into multiple physical zones. On a PC we have 3 zones: @@ -70,6 +79,19 @@ struct zone { spinlock_t lock; unsigned long free_pages; unsigned long pages_min, pages_low, pages_high; + /* + * protection[] is a pre-calculated number of extra pages that must be + * available in a zone in order for __alloc_pages() to allocate memory + * from the zone. i.e., for a GFP_KERNEL alloc of "order" there must + * be "(1<zone_pgdat->node_zones) + /** * for_each_pgdat - helper macro to iterate over all nodes * @pgdat - pointer to a pg_data_t variable @@ -299,7 +318,9 @@ static inline int is_normal(struct zone *zone) struct ctl_table; struct file; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, - void __user *, size_t *); + void __user *, size_t *); +int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *); #include /* Returns the number of the current Node. 
*/ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f5f3123b0522..f2c8c8ce4926 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -722,7 +722,7 @@ static ctl_table vm_table[] = { .data = &sysctl_lower_zone_protection, .maxlen = sizeof(sysctl_lower_zone_protection), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &lower_zone_protection_sysctl_handler, .strategy = &sysctl_intvec, .extra1 = &zero, }, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9764a4e78e45..c87ca3dd2f11 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -552,6 +552,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, struct task_struct *p = current; int i; int cold; + int alloc_type; int do_retry; might_sleep_if(wait); @@ -564,28 +565,27 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, if (zones[0] == NULL) /* no zones in the zonelist */ return NULL; + alloc_type = zone_idx(zones[0]); + /* Go through the zonelist once, looking for a zone with enough free */ - min = 1UL << order; for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; - unsigned long local_low; + + min = (1<<order) + z->protection[alloc_type]; /* - * This is the fabled 'incremental min'. We let real-time tasks - * dip their real-time paws a little deeper into reserves. + * We let real-time tasks dip their real-time paws a little + * deeper into reserves.
*/ - local_low = z->pages_low; if (rt_task(p)) - local_low >>= 1; - min += local_low; + min -= z->pages_low >> 1; if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, cold); if (page) - goto got_pg; + goto got_pg; } - min += z->pages_low * sysctl_lower_zone_protection; } /* we're somewhat low on memory, failed to find what we needed */ @@ -593,24 +593,22 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, wakeup_kswapd(zones[i]); /* Go through the zonelist again, taking __GFP_HIGH into account */ - min = 1UL << order; for (i = 0; zones[i] != NULL; i++) { - unsigned long local_min; struct zone *z = zones[i]; - local_min = z->pages_min; + min = (1<<order) + z->protection[alloc_type]; + if (gfp_mask & __GFP_HIGH) - local_min >>= 2; + min -= z->pages_low >> 2; if (rt_task(p)) - local_min >>= 1; - min += local_min; + min -= z->pages_low >> 1; + if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, cold); if (page) goto got_pg; } - min += local_min * sysctl_lower_zone_protection; } /* here we're in the low on memory slow path */ @@ -642,18 +640,17 @@ rebalance: p->flags &= ~PF_MEMALLOC; /* go through the zonelist yet one more time */ - min = 1UL << order; for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; - min += z->pages_min; + min = (1UL << order) + z->protection[alloc_type]; + if (z->free_pages >= min || (!wait && z->free_pages >= z->pages_high)) { page = buffered_rmqueue(z, order, cold); if (page) goto got_pg; } - min += z->pages_low * sysctl_lower_zone_protection; } /* @@ -1056,6 +1053,8 @@ void show_free_areas(void) ps.nr_page_table_pages); for_each_zone(zone) { + int i; + show_node(zone); printk("%s" " free:%lukB" @@ -1075,6 +1074,10 @@ void show_free_areas(void) K(zone->nr_inactive), K(zone->present_pages) ); + printk("protections[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(" %lu", zone->protection[i]); + printk("\n"); } for_each_zone(zone) 
{ @@ -1272,7 +1275,7 @@ static void __init build_zonelists(pg_data_t *pgdat) j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); zonelist->zones[j++] = NULL; - } + } } #endif /* CONFIG_NUMA */ @@ -1744,6 +1747,93 @@ void __init page_alloc_init(void) hotcpu_notifier(page_alloc_cpu_notify, 0); } +static unsigned long higherzone_val(struct zone *z, int max_zone, + int alloc_type) +{ + int z_idx = zone_idx(z); + struct zone *higherzone; + unsigned long pages; + + /* there is no higher zone to get a contribution from */ + if (z_idx == MAX_NR_ZONES-1) + return 0; + + higherzone = &z->zone_pgdat->node_zones[z_idx+1]; + + /* We always start with the higher zone's protection value */ + pages = higherzone->protection[alloc_type]; + + /* + * We get a lower-zone-protection contribution only if there are + * pages in the higher zone and if we're not the highest zone + * in the current zonelist. e.g., never happens for GFP_DMA. Happens + * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA + * and ZONE_NORMAL for a GFP_HIGHMEM allocation. + */ + if (higherzone->present_pages && z_idx < alloc_type) + pages += higherzone->pages_low * sysctl_lower_zone_protection; + + return pages; +} + +/* + * setup_per_zone_protection - called whenver min_free_kbytes or + * sysctl_lower_zone_protection changes. Ensures that each zone + * has a correct pages_protected value, so an adequate number of + * pages are left in the zone after a successful __alloc_pages(). + * + * This algorithm is way confusing. I tries to keep the same behavior + * as we had with the incremental min iterative algorithm. 
+ */ +static void setup_per_zone_protection(void) +{ + struct pglist_data *pgdat; + struct zone *zones, *zone; + int max_zone; + int i, j; + + for_each_pgdat(pgdat) { + zones = pgdat->node_zones; + + for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++) + if (zones[i].present_pages) + max_zone = i; + + /* + * For each of the different allocation types: + * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM + */ + for (i = 0; i < MAX_NR_ZONES; i++) { + /* + * For each of the zones: + * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA + */ + for (j = MAX_NR_ZONES-1; j >= 0; j--) { + zone = &zones[j]; + + /* + * We never protect zones that don't have memory + * in them (j>max_zone) or zones that aren't in + * the zonelists for a certain type of + * allocation (j>i). We have to assign these to + * zero because the lower zones take + * contributions from the higher zones. + */ + if (j > max_zone || j > i) { + zone->protection[i] = 0; + continue; + } + /* + * The contribution of the next higher zone + */ + zone->protection[i] = higherzone_val(zone, + max_zone, i); + zone->protection[i] += zone->pages_low; + } + } + } +} + /* * setup_per_zone_pages_min - called when min_free_kbytes changes. 
Ensures * that the pages_{min,low,high} values for each zone are set correctly @@ -1757,9 +1847,10 @@ static void setup_per_zone_pages_min(void) unsigned long flags; /* Calculate total number of !ZONE_HIGHMEM pages */ - for_each_zone(zone) + for_each_zone(zone) { if (!is_highmem(zone)) lowmem_pages += zone->present_pages; + } for_each_zone(zone) { spin_lock_irqsave(&zone->lru_lock, flags); @@ -1827,13 +1918,14 @@ static int __init init_per_zone_pages_min(void) if (min_free_kbytes > 16384) min_free_kbytes = 16384; setup_per_zone_pages_min(); + setup_per_zone_protection(); return 0; } module_init(init_per_zone_pages_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so - * that we can call setup_per_zone_pages_min() whenever min_free_kbytes + * that we can call two helper functions whenever min_free_kbytes * changes. */ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, @@ -1841,5 +1933,19 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, { proc_dointvec(table, write, file, buffer, length); setup_per_zone_pages_min(); + setup_per_zone_protection(); + return 0; +} + +/* + * lower_zone_protection_sysctl_handler - just a wrapper around + * proc_dointvec() so that we can call setup_per_zone_protection() + * whenever sysctl_lower_zone_protection changes. + */ +int lower_zone_protection_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length) +{ + proc_dointvec_minmax(table, write, file, buffer, length); + setup_per_zone_protection(); return 0; } -- cgit v1.2.3 From b9e55f3d300af426885d7b0a13e45cd2841118a2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:58:40 -0700 Subject: [PATCH] slab: updates for per-arch alignments From: Manfred Spraul Description: Right now kmem_cache_create automatically decides about the alignment of allocated objects. 
The automatic decisions are sometimes wrong: - for some objects, it's better to keep them as small as possible to reduce the memory usage. Ingo already added a parameter to kmem_cache_create for the sigqueue cache, but it wasn't implemented. - for s390, normal kmalloc must be 8-byte aligned. With debugging enabled, the default alignment was 4 bytes. This means that s390 cannot enable slab debugging. - arm26 needs 1 kB aligned objects. Previously this was impossible to generate, therefore arm has its own allocator in arm26/machine/small_page.c. - most objects should be cache line aligned, to avoid false sharing. But the cache line size was set at compile time, often to 128 bytes for generic kernels. This wastes memory. The new code uses the runtime determined cache line size instead. - some caches want an explicit alignment. One example is the pte_chain objects: they must find the start of the object with addr&mask. Right now pte_chain objects are scaled to the cache line size, because that was the only alignment that could be generated reliably. The implementation reuses the "offset" parameter of kmem_cache_create and now uses it to pass in the requested alignment. offset was ignored by the current implementation, and the only user I found is sigqueue, which intended to set the alignment. In the long run, it might be interesting for the main tree: due to the 128 byte alignment, only 7 inodes fit into one page, with 64-byte alignment, 9 inodes - 20% memory recovered for Athlon systems. For generic kernels running on P6 cpus (i.e. 
32 byte cachelines), it means Number of objects per page: ext2_inode_cache: 8 instead of 7 ext3_inode_cache: 8 instead of 7 fat_inode_cache: 9 instead of 7 rpc_tasks: 24 instead of 15 tcp_tw_bucket: 40 instead of 30 arp_cache: 40 instead of 30 nfs_write_data: 9 instead of 7 --- arch/i386/mm/init.c | 4 +- include/asm-i386/processor.h | 2 + kernel/fork.c | 7 ++- mm/rmap.c | 2 +- mm/slab.c | 135 +++++++++++++++++++++++++------------------ 5 files changed, 89 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index a9923661f317..040862e6c6a0 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -530,18 +530,18 @@ void __init pgtable_cache_init(void) { if (PTRS_PER_PMD > 1) { pmd_cache = kmem_cache_create("pmd", + PTRS_PER_PMD*sizeof(pmd_t), PTRS_PER_PMD*sizeof(pmd_t), 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, pmd_ctor, NULL); if (!pmd_cache) panic("pgtable_cache_init(): cannot create pmd cache"); } pgd_cache = kmem_cache_create("pgd", + PTRS_PER_PGD*sizeof(pgd_t), PTRS_PER_PGD*sizeof(pgd_t), 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, pgd_ctor, PTRS_PER_PMD == 1 ? pgd_dtor : NULL); if (!pgd_cache) diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 3a5e0ff2a20c..0ebe1aa1afb0 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -403,6 +403,8 @@ struct tss_struct { unsigned long stack[64]; } __attribute__((packed)); +#define ARCH_MIN_TASKALIGN 16 + struct thread_struct { /* cached TLS descriptors. 
*/ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; diff --git a/kernel/fork.c b/kernel/fork.c index d2dd97e866bb..315a06125e65 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -207,11 +207,14 @@ EXPORT_SYMBOL(autoremove_wake_function); void __init fork_init(unsigned long mempages) { #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +#ifndef ARCH_MIN_TASKALIGN +#define ARCH_MIN_TASKALIGN 0 +#endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", - sizeof(struct task_struct),0, - SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); + sizeof(struct task_struct),ARCH_MIN_TASKALIGN, + 0, NULL, NULL); if (!task_struct_cachep) panic("fork_init(): cannot create task_struct SLAB cache"); #endif diff --git a/mm/rmap.c b/mm/rmap.c index 7af41a9b9a4e..c1c7325996a3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -522,9 +522,9 @@ struct pte_chain *pte_chain_alloc(int gfp_flags) void __init pte_chain_init(void) { pte_chain_cache = kmem_cache_create( "pte_chain", + sizeof(struct pte_chain), sizeof(struct pte_chain), 0, - SLAB_MUST_HWCACHE_ALIGN, pte_chain_ctor, NULL); diff --git a/mm/slab.c b/mm/slab.c index d54728b6af32..b1c015cb0a02 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -121,6 +121,14 @@ /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) +#ifndef cache_line_size +#define cache_line_size() L1_CACHE_BYTES +#endif + +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN 0 +#endif + /* Legal flag mask for kmem_cache_create(). 
*/ #if DEBUG # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ @@ -268,6 +276,7 @@ struct kmem_cache_s { unsigned int colour_off; /* colour offset */ unsigned int colour_next; /* cache colouring */ kmem_cache_t *slabp_cache; + unsigned int slab_size; unsigned int dflags; /* dynamic flags */ /* constructor func */ @@ -490,8 +499,10 @@ static kmem_cache_t cache_cache = { .objsize = sizeof(kmem_cache_t), .flags = SLAB_NO_REAP, .spinlock = SPIN_LOCK_UNLOCKED, - .colour_off = L1_CACHE_BYTES, .name = "kmem_cache", +#if DEBUG + .reallen = sizeof(kmem_cache_t), +#endif }; /* Guard access to the cache-chain. */ @@ -535,7 +546,7 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep) } /* Cal the num objs, wastage, and bytes left over for a given slab size. */ -static void cache_estimate (unsigned long gfporder, size_t size, +static void cache_estimate (unsigned long gfporder, size_t size, size_t align, int flags, size_t *left_over, unsigned int *num) { int i; @@ -548,7 +559,7 @@ static void cache_estimate (unsigned long gfporder, size_t size, extra = sizeof(kmem_bufctl_t); } i = 0; - while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage) + while (i*size + ALIGN(base+i*extra, align) <= wastage) i++; if (i > 0) i--; @@ -558,7 +569,7 @@ static void cache_estimate (unsigned long gfporder, size_t size, *num = i; wastage -= i*size; - wastage -= L1_CACHE_ALIGN(base+i*extra); + wastage -= ALIGN(base+i*extra, align); *left_over = wastage; } @@ -705,16 +716,20 @@ void __init kmem_cache_init(void) init_MUTEX(&cache_chain_sem); INIT_LIST_HEAD(&cache_chain); list_add(&cache_cache.next, &cache_chain); + cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; - cache_estimate(0, cache_cache.objsize, 0, - &left_over, &cache_cache.num); + cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); + + cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, + &left_over, &cache_cache.num); if 
(!cache_cache.num) BUG(); cache_cache.colour = left_over/cache_cache.colour_off; cache_cache.colour_next = 0; - + cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + + sizeof(struct slab), cache_line_size()); /* 2+3) create the kmalloc caches */ sizes = malloc_sizes; @@ -728,7 +743,7 @@ void __init kmem_cache_init(void) * allow tighter packing of the smaller caches. */ sizes->cs_cachep = kmem_cache_create( names->name, sizes->cs_size, - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + ARCH_KMALLOC_MINALIGN, 0, NULL, NULL); if (!sizes->cs_cachep) BUG(); @@ -740,7 +755,7 @@ void __init kmem_cache_init(void) sizes->cs_dmacachep = kmem_cache_create( names->name_dma, sizes->cs_size, - 0, SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN, NULL, NULL); + ARCH_KMALLOC_MINALIGN, SLAB_CACHE_DMA, NULL, NULL); if (!sizes->cs_dmacachep) BUG(); @@ -1056,7 +1071,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. - * @offset: The offset to use within the page. + * @align: The required alignment for the objects. * @flags: SLAB flags * @ctor: A constructor for the objects. * @dtor: A destructor for the objects. @@ -1081,16 +1096,15 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) * %SLAB_NO_REAP - Don't automatically reap this cache when we're under * memory pressure. * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. + * %SLAB_HWCACHE_ALIGN - This flag has no effect and will be removed soon. 
+ * */ kmem_cache_t * -kmem_cache_create (const char *name, size_t size, size_t offset, +kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), void (*dtor)(void*, kmem_cache_t *, unsigned long)) { - size_t left_over, align, slab_size; + size_t left_over, slab_size; kmem_cache_t *cachep = NULL; /* @@ -1101,7 +1115,7 @@ kmem_cache_create (const char *name, size_t size, size_t offset, (size < BYTES_PER_WORD) || (size > (1< size)) { + (align < 0)) { printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, name); BUG(); @@ -1118,22 +1132,16 @@ kmem_cache_create (const char *name, size_t size, size_t offset, #if FORCED_DEBUG /* - * Enable redzoning and last user accounting, except - * - for caches with forced alignment: redzoning would violate the - * alignment - * - for caches with large objects, if the increased size would - * increase the object size above the next power of two: caches - * with object sizes just above a power of two have a significant - * amount of internal fragmentation + * Enable redzoning and last user accounting, except for caches with + * large objects, if the increased size would increase the object size + * above the next power of two: caches with object sizes just above a + * power of two have a significant amount of internal fragmentation. */ - if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)) - && !(flags & SLAB_MUST_HWCACHE_ALIGN)) { + if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) flags |= SLAB_RED_ZONE|SLAB_STORE_USER; - } flags |= SLAB_POISON; #endif #endif - /* * Always checks flags, a caller might be expecting debug * support which isn't available. 
@@ -1141,15 +1149,23 @@ kmem_cache_create (const char *name, size_t size, size_t offset, if (flags & ~CREATE_MASK) BUG(); + if (align) { + /* minimum supported alignment: */ + if (align < BYTES_PER_WORD) + align = BYTES_PER_WORD; + + /* combinations of forced alignment and advanced debugging is + * not yet implemented. + */ + flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); + } + /* Get cache's description obj. */ cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); if (!cachep) goto opps; memset(cachep, 0, sizeof(kmem_cache_t)); -#if DEBUG - cachep->reallen = size; -#endif /* Check that size is in terms of words. This is needed to avoid * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. @@ -1160,30 +1176,31 @@ kmem_cache_create (const char *name, size_t size, size_t offset, } #if DEBUG + cachep->reallen = size; + if (flags & SLAB_RED_ZONE) { - /* - * There is no point trying to honour cache alignment - * when redzoning. - */ - flags &= ~SLAB_HWCACHE_ALIGN; + /* redzoning only works with word aligned caches */ + align = BYTES_PER_WORD; + /* add space for red zone words */ cachep->dbghead += BYTES_PER_WORD; size += 2*BYTES_PER_WORD; } if (flags & SLAB_STORE_USER) { - flags &= ~SLAB_HWCACHE_ALIGN; - size += BYTES_PER_WORD; /* add space */ + /* user store requires word alignment and + * one word storage behind the end of the real + * object. + */ + align = BYTES_PER_WORD; + size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - if (size > 128 && cachep->reallen > L1_CACHE_BYTES && size < PAGE_SIZE) { + if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { cachep->dbghead += PAGE_SIZE - size; size = PAGE_SIZE; } #endif #endif - align = BYTES_PER_WORD; - if (flags & SLAB_HWCACHE_ALIGN) - align = L1_CACHE_BYTES; /* Determine if the slab management is 'on' or 'off' slab. 
*/ if (size >= (PAGE_SIZE>>3)) @@ -1193,13 +1210,16 @@ kmem_cache_create (const char *name, size_t size, size_t offset, */ flags |= CFLGS_OFF_SLAB; - if (flags & SLAB_HWCACHE_ALIGN) { - /* Need to adjust size so that objs are cache aligned. */ - /* Small obj size, can get at least two per cache line. */ + if (!align) { + /* Default alignment: compile time specified l1 cache size. + * Except if an object is really small, then squeeze multiple + * into one cacheline. + */ + align = cache_line_size(); while (size <= align/2) align /= 2; - size = (size+align-1)&(~(align-1)); } + size = ALIGN(size, align); /* Cal size (in pages) of slabs, and the num of objs per slab. * This could be made much more intelligent. For now, try to avoid @@ -1209,7 +1229,7 @@ kmem_cache_create (const char *name, size_t size, size_t offset, do { unsigned int break_flag = 0; cal_wastage: - cache_estimate(cachep->gfporder, size, flags, + cache_estimate(cachep->gfporder, size, align, flags, &left_over, &cachep->num); if (break_flag) break; @@ -1243,7 +1263,8 @@ next: cachep = NULL; goto opps; } - slab_size = L1_CACHE_ALIGN(cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab)); + slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) + + sizeof(struct slab), align); /* * If the slab has been placed off-slab, and we have enough space then @@ -1254,14 +1275,17 @@ next: left_over -= slab_size; } - /* Offset must be a multiple of the alignment. */ - offset += (align-1); - offset &= ~(align-1); - if (!offset) - offset = L1_CACHE_BYTES; - cachep->colour_off = offset; - cachep->colour = left_over/offset; + if (flags & CFLGS_OFF_SLAB) { + /* really off slab. No need for manual alignment */ + slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); + } + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. 
*/ + if (cachep->colour_off < align) + cachep->colour_off = align; + cachep->colour = left_over/cachep->colour_off; + cachep->slab_size = slab_size; cachep->flags = flags; cachep->gfpflags = 0; if (flags & SLAB_CACHE_DMA) @@ -1543,8 +1567,7 @@ static inline struct slab* alloc_slabmgmt (kmem_cache_t *cachep, return NULL; } else { slabp = objp+colour_off; - colour_off += L1_CACHE_ALIGN(cachep->num * - sizeof(kmem_bufctl_t) + sizeof(struct slab)); + colour_off += cachep->slab_size; } slabp->inuse = 0; slabp->colouroff = colour_off; -- cgit v1.2.3 From 07ebe427f0289a322c96c38906bb3bb7aacf15b6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:58:52 -0700 Subject: [PATCH] set mod->waiter before calling stop_machine From: Rusty Russell mod->waiter needs to be set before we try to stop the module: setting it in __try_stop_module means it gets set to the kthread, not rmmod. --- kernel/module.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 16587e133b1b..a472deef9bdf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -493,7 +493,6 @@ static inline int __try_stop_module(void *_sref) } /* Mark it as dying. */ - sref->mod->waiter = current; sref->mod->state = MODULE_STATE_GOING; return 0; } @@ -588,6 +587,9 @@ sys_delete_module(const char __user *name_user, unsigned int flags) } } + /* Set this up before setting mod->state */ + mod->waiter = current; + /* Stop the machine so refcounts can't move and disable module. */ ret = try_stop_module(mod, flags, &forced); -- cgit v1.2.3 From 3f66b056e1b56427eec0b26e0a20ac08fb8a6dc9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:59:32 -0700 Subject: [PATCH] pmdisk: fix strcmp in sysfs store From: Herbert Xu This patch fixes the sysfs store functions for pmdisk when the input contains a trailing newline. 
--- kernel/power/disk.c | 7 ++++++- kernel/power/main.c | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 2d4cf319b8e1..7e035a9b42d1 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -285,11 +285,16 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) { int error = 0; int i; + int len; + char *p; u32 mode = 0; + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + down(&pm_sem); for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { - if (!strcmp(buf,pm_disk_modes[i])) { + if (!strncmp(buf, pm_disk_modes[i], len)) { mode = i; break; } diff --git a/kernel/power/main.c b/kernel/power/main.c index fd212e7ecd9f..d582906fecc6 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -218,10 +218,15 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n { u32 state = PM_SUSPEND_STANDBY; char ** s; + char *p; int error; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; for (s = &pm_states[state]; *s; s++, state++) { - if (!strcmp(buf,*s)) + if (!strncmp(buf, *s, len)) break; } if (*s) -- cgit v1.2.3 From 5362a3548872e1d7c7b3926f1519999b315e9825 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:59:58 -0700 Subject: [PATCH] Fix sys_time() to get subtick correction from the new xtime From: "La Monte H.P. Yarroll" This is a Scott Wood patch against 2.6.3. Use gettimeofday() rather than xtime.tv_sec in sys_time(), since sys_stime() uses settimeofday() and thus subtracts the subtick correction from the new xtime. stime() used settimeofday(), but time() did not use gettimeofday(). Since settimeofday() subtracts out the current intra-tick correction, and nsec was 0 (since stime() only allows seconds), this resulted in xtime being slightly earlier than the time that was set. If time() had used gettimeofday(), the correction would have been applied, and everything would be fine. 
However, instead time just reads the current xtime.tv_sec, so if time() is called immediately after stime(), you'll usually get a value one second earlier. --- arch/ia64/ia32/sys_ia32.c | 7 ++++--- arch/parisc/kernel/sys_parisc32.c | 14 ++++++++------ arch/x86_64/ia32/sys_ia32.c | 7 ++++--- kernel/time.c | 7 ++++--- 4 files changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c index 38aaf7b08bbd..a1d008067121 100644 --- a/arch/ia64/ia32/sys_ia32.c +++ b/arch/ia64/ia32/sys_ia32.c @@ -1090,10 +1090,11 @@ asmlinkage long sys32_time (int *tloc) { int i; + struct timeval tv; + + do_gettimeofday(&tv); + i = tv.tv_sec; - /* SMP: This is fairly trivial. We grab CURRENT_TIME and - stuff it to user space. No side effects */ - i = get_seconds(); if (tloc) { if (put_user(i, tloc)) i = -EFAULT; diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index d59c93baf0c2..7159953b2c44 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -388,14 +388,16 @@ static inline long get_ts32(struct timespec *o, struct compat_timeval *i) asmlinkage long sys32_time(compat_time_t *tloc) { - time_t now = get_seconds(); - compat_time_t now32 = now; + struct timeval tv; - if (tloc) - if (put_user(now32, tloc)) - now32 = -EFAULT; + do_gettimeofday(&tv); + compat_time_t now32 = tv.tv_sec; - return now32; + if (tloc) + if (put_user(now32, tloc)) + now32 = -EFAULT; + + return now32; } asmlinkage int diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 4d19333ead18..47c23419f55c 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -832,10 +832,11 @@ sys32_writev(int fd, struct compat_iovec *vector, u32 count) asmlinkage long sys32_time(int * tloc) { int i; + struct timeval tv; + + do_gettimeofday(&tv); + i = tv.tv_sec; - /* SMP: This is fairly trivial. We grab CURRENT_TIME and - stuff it to user space. 
No side effects */ - i = get_seconds(); if (tloc) { if (put_user(i,tloc)) i = -EFAULT; diff --git a/kernel/time.c b/kernel/time.c index 33a6fe086304..142a4bd5771e 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -51,10 +51,11 @@ EXPORT_SYMBOL(sys_tz); asmlinkage long sys_time(int * tloc) { int i; + struct timeval tv; + + do_gettimeofday(&tv); + i = tv.tv_sec; - /* SMP: This is fairly trivial. We grab CURRENT_TIME and - stuff it to user space. No side effects */ - i = get_seconds(); if (tloc) { if (put_user(i,tloc)) i = -EFAULT; -- cgit v1.2.3 From ce334bb8f0f084112dcfe96214cacfa0afba7e10 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 23:04:10 -0700 Subject: [PATCH] export complete_all() From: Mike Waychison Export complete_all for module use. --- kernel/sched.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b42029abe679..c2d1c0317130 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1847,7 +1847,6 @@ void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exc __wake_up_common(q, mode, nr_exclusive, 0); spin_unlock_irqrestore(&q->lock, flags); } - EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ void fastcall complete(struct completion *x) @@ -1860,7 +1859,6 @@ void fastcall complete(struct completion *x) 1, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } - EXPORT_SYMBOL(complete); void fastcall complete_all(struct completion *x) @@ -1873,6 +1871,7 @@ void fastcall complete_all(struct completion *x) 0, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } +EXPORT_SYMBOL(complete_all); void fastcall __sched wait_for_completion(struct completion *x) { @@ -1894,7 +1893,6 @@ void fastcall __sched wait_for_completion(struct completion *x) x->done--; spin_unlock_irq(&x->wait.lock); } - EXPORT_SYMBOL(wait_for_completion); #define SLEEP_ON_VAR \ -- cgit v1.2.3 From 8e1aabbc236128b9e696ae61235b17165bb73ada Mon Sep 17 00:00:00 2001 
From: Andrew Morton Date: Sun, 11 Apr 2004 23:06:08 -0700 Subject: [PATCH] Strip quotes from kernel parameters From: Rusty Russell Agustin Martin pointed out that this doesn't work: options ide-mod options="ide=nodma hdc=cdrom" The quotes are understood by kernel/params.c (ie. it skips over spaces inside them), but are not stripped before handing to the underlying function. They should be. --- kernel/params.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 4d9a71b743c5..59667bce9ce0 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -96,6 +96,13 @@ static char *next_arg(char *args, char **param, char **val) else { args[equals] = '\0'; *val = args + equals + 1; + + /* Don't include quotes in value. */ + if (**val == '"') { + (*val)++; + if (args[i-1] == '"') + args[i-1] = '\0'; + } } if (args[i]) { -- cgit v1.2.3 From 8398bcc6b3eb950a1242f6dc4cfb151b6b9238c3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 23:08:32 -0700 Subject: [PATCH] eliminate nswap and cnswap From: Matt Mackall The nswap and cnswap counters have never been incremented as Linux doesn't do task swapping.
--- arch/alpha/kernel/osf_sys.c | 3 --- fs/proc/array.c | 4 ++-- include/linux/sched.h | 2 +- kernel/acct.c | 2 +- kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/sys.c | 3 --- 7 files changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 34adfc76dd92..f725059fe47f 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1095,14 +1095,12 @@ osf_getrusage(int who, struct rusage32 *ru) jiffies_to_timeval32(current->stime, &r.ru_stime); r.ru_minflt = current->min_flt; r.ru_majflt = current->maj_flt; - r.ru_nswap = current->nswap; break; case RUSAGE_CHILDREN: jiffies_to_timeval32(current->cutime, &r.ru_utime); jiffies_to_timeval32(current->cstime, &r.ru_stime); r.ru_minflt = current->cmin_flt; r.ru_majflt = current->cmaj_flt; - r.ru_nswap = current->cnswap; break; default: jiffies_to_timeval32(current->utime + current->cutime, @@ -1111,7 +1109,6 @@ osf_getrusage(int who, struct rusage32 *ru) &r.ru_stime); r.ru_minflt = current->min_flt + current->cmin_flt; r.ru_majflt = current->maj_flt + current->cmaj_flt; - r.ru_nswap = current->nswap + current->cnswap; break; } diff --git a/fs/proc/array.c b/fs/proc/array.c index ac9ccac5d1ee..ae90151e45ae 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -392,8 +392,8 @@ int proc_pid_stat(struct task_struct *task, char * buffer) sigign .sig[0] & 0x7fffffffUL, sigcatch .sig[0] & 0x7fffffffUL, wchan, - task->nswap, - task->cnswap, + 0UL, + 0UL, task->exit_signal, task_cpu(task), task->rt_priority, diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a1229121123..22080f919266 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -436,7 +436,7 @@ struct task_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */ u64 start_time; /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ - unsigned long min_flt, maj_flt, nswap, cmin_flt, 
cmaj_flt, cnswap; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; diff --git a/kernel/acct.c b/kernel/acct.c index b417066778a7..8e32413c41f3 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -376,7 +376,7 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_minflt = encode_comp_t(current->min_flt); ac.ac_majflt = encode_comp_t(current->maj_flt); - ac.ac_swaps = encode_comp_t(current->nswap); + ac.ac_swaps = encode_comp_t(0); ac.ac_exitcode = exitcode; /* diff --git a/kernel/exit.c b/kernel/exit.c index 810eebd77559..8157dbc037d6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -92,7 +92,6 @@ repeat: p->parent->cstime += p->stime + p->cstime; p->parent->cmin_flt += p->min_flt + p->cmin_flt; p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt; - p->parent->cnswap += p->nswap + p->cnswap; p->parent->cnvcsw += p->nvcsw + p->cnvcsw; p->parent->cnivcsw += p->nivcsw + p->cnivcsw; sched_exit(p); diff --git a/kernel/fork.c b/kernel/fork.c index 315a06125e65..da5213611496 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -513,7 +513,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) tsk->min_flt = tsk->maj_flt = 0; tsk->cmin_flt = tsk->cmaj_flt = 0; - tsk->nswap = tsk->cnswap = 0; tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0; tsk->mm = NULL; diff --git a/kernel/sys.c b/kernel/sys.c index 9d57482758f3..4d414d925889 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1532,7 +1532,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) r.ru_nivcsw = p->nivcsw; r.ru_minflt = p->min_flt; r.ru_majflt = p->maj_flt; - r.ru_nswap = p->nswap; break; case RUSAGE_CHILDREN: jiffies_to_timeval(p->cutime, &r.ru_utime); @@ -1541,7 +1540,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) r.ru_nivcsw = p->cnivcsw; r.ru_minflt = p->cmin_flt; r.ru_majflt = p->cmaj_flt; - 
r.ru_nswap = p->cnswap; break; default: jiffies_to_timeval(p->utime + p->cutime, &r.ru_utime); @@ -1550,7 +1548,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) r.ru_nivcsw = p->nivcsw + p->cnivcsw; r.ru_minflt = p->min_flt + p->cmin_flt; r.ru_majflt = p->maj_flt + p->cmaj_flt; - r.ru_nswap = p->nswap + p->cnswap; break; } return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; -- cgit v1.2.3 From 424e44d11e16e6a64c307f9a29e6b2716319b202 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 23:15:59 -0700 Subject: [PATCH] fork vma ordering during fork From: Hugh Dickins First of six patches against 2.6.5-rc3, cleaning up mremap's move_vma, and fixing truncation orphan issues raised by Rajesh Venkatasubramanian. Originally done as part of the anonymous objrmap work on mremap move, but useful fixes now extracted for mainline. The mremap changes need some exposure in the -mm tree first, but the first (fork one-liner) is safe enough to go straight into 2.6.5. From: Rajesh Venkatasubramanian. Despite the comment that child vma should be inserted just after parent vma, 2.5.6 did exactly the reverse: thus a racing vmtruncate may free the child's ptes, then advance to the parent, and meanwhile copy_page_range has propagated more ptes from the parent to the child, leaving file pages still mapped after truncation. 
--- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index da5213611496..fc25a3a15d0e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -323,7 +323,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) /* insert tmp into the share list, just after mpnt */ down(&file->f_mapping->i_shared_sem); - list_add_tail(&tmp->shared, &mpnt->shared); + list_add(&tmp->shared, &mpnt->shared); up(&file->f_mapping->i_shared_sem); } -- cgit v1.2.3 From 93d33a4885a483c708ccb7d24b56e0d5fef7bcab Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 23:17:38 -0700 Subject: [PATCH] laptop mode From: Bart Samwel Adds /proc/sys/vm/laptop_mode: a special knob which says "this is a laptop". In this mode the kernel will attempt to avoid spinning disks up. Algorithm: the idea is to hold dirty data in memory for a long time, but to flush everything which has been accumulated if the disk happens to spin up for other reasons. - Whenever a disk request completes (read or write), schedule a timer a few seconds hence. If the timer was already pending, reset it to a few seconds hence. - When the timer expires, write back the whole world. We use sync_filesystems() for this because it will force ext3 journal commits as well. - In balance_dirty_pages(), kick off background writeback when we hit the high threshold (dirty_ratio), not when we hit the low threshold. This has the effect of causing "lumpy" writeback which is something I spent a year fixing, but in laptop mode, it is desirable. - In try_to_free_pages(), only kick pdflush if the VM is getting into distress: we want to keep scanning for clean pages, deferring writeback. - In page reclaim, avoid writing back the odd random dirty page off the LRU: only start I/O if the scanning is working harder. The effect is to perform a sync() a few seconds after all I/O has ceased.
The value which was written into /proc/sys/vm/laptop_mode determines, in seconds, the delay between the final I/O and the flush. Additionally, the patch adds tools which help answer the question "why the heck does my disk spin up all the time?". The user may set /proc/sys/vm/block_dump to a non-zero value and the kernel will print out information which will identify the process which is performing disk reads or which is dirtying pagecache. The user should probably disable syslogd before setting block_dump. --- Documentation/laptop-mode.txt | 665 ++++++++++++++++++++++++++++++++++++++++++ drivers/block/ll_rw_blk.c | 14 + fs/buffer.c | 2 + fs/fs-writeback.c | 3 + include/linux/sysctl.h | 2 + include/linux/writeback.h | 6 +- kernel/sysctl.c | 20 ++ mm/page-writeback.c | 69 ++++- mm/vmscan.c | 61 ++-- 9 files changed, 815 insertions(+), 27 deletions(-) create mode 100644 Documentation/laptop-mode.txt (limited to 'kernel') diff --git a/Documentation/laptop-mode.txt b/Documentation/laptop-mode.txt new file mode 100644 index 000000000000..9df8d2677bef --- /dev/null +++ b/Documentation/laptop-mode.txt @@ -0,0 +1,665 @@ +How to conserve battery power using laptop-mode +----------------------------------------------- + +Document Author: Bart Samwel (bart@samwel.tk) +Date created: January 2, 2004 +Last modified: April 3, 2004 + +Introduction +------------ + +Laptopmode is used to minimize the time that the hard disk needs to be spun up, +to conserve battery power on laptops. It has been reported to cause significant +power savings.
+ +Contents +-------- + +* Introduction +* The short story +* Caveats +* The details +* Tips & Tricks +* Control script +* ACPI integration +* Monitoring tool + + +The short story +--------------- + +If you just want to use it, run the laptop_mode control script (which is included +at the end of this document) as follows: + +# laptop_mode start + +Then set your harddisk spindown time to a relatively low value with hdparm: + +hdparm -S 4 /dev/hda + +The value -S 4 means 20 seconds idle time before spindown. Your harddisk will +now only spin up when a disk cache miss occurs, or at least once every 10 +minutes to write back any pending changes. + +To stop laptop_mode, run "laptop_mode stop". + + +Caveats +------- + +* The downside of laptop mode is that you have a chance of losing up + to 10 minutes of work. If you cannot afford this, don't use it! It's + wise to turn OFF laptop mode when you're almost out of battery -- + although this will make the battery run out faster, at least you'll + lose less work when it actually runs out. I'm still looking for someone + to submit instructions on how to turn off laptop mode when battery is low, + e.g., using ACPI events. I don't have a laptop myself, so if you do and + you care to contribute such instructions, please do. + +* Most desktop hard drives have a very limited lifetime measured in spindown + cycles, typically about 50.000 times (it's usually listed on the spec sheet). + Check your drive's rating, and don't wear down your drive's lifetime if you + don't need to. + +* If you mount some of your ext3/reiserfs filesystems with the -n option, then + the control script will not be able to remount them correctly. You must set + DO_REMOUNTS=0 in the control script, otherwise it will remount them with the + wrong options -- or it will fail because it cannot write to /etc/mtab. 
+ +* If you have your filesystems listed as type "auto" in fstab, like I did, then + the control script will not recognize them as filesystems that need remounting. + +* If you have XFS, make SURE that you set the XFS_HZ value in the control script + correctly, to the value of HZ of your running kernel. Laptop mode will not + work correctly if it is set too low, and you may lose data if it is set too + high. The reason for this problem is that XFS does not export its sysctl + variables in centisecs (like most other subsystems do) but in "jiffies", + which is an internal kernel measure. Once this is fixed things will get better. + + +The details +----------- + +Laptop-mode is controlled by the flag /proc/sys/vm/laptop_mode. When this +flag is set, any physical disk read operation (that might have caused the +hard disk to spin up) causes Linux to flush all dirty blocks. The result +of this is that after a disk has spun down, it will not be spun up anymore +to write dirty blocks, because those blocks had already been written +immediately after the most recent read operation. + +To increase the effectiveness of the laptop_mode strategy, the laptop_mode +control script increases dirty_expire_centisecs and dirty_writeback_centisecs in +/proc/sys/vm to about 10 minutes (by default), which means that pages that are +dirtied are not forced to be written to disk as often. The control script also +changes the dirty background ratio, so that background writeback of dirty pages +is not done anymore. Combined with a higher commit value (also 10 minutes) for +ext3 or ReiserFS filesystems (also done automatically by the control script), +this results in concentration of disk activity in a small time interval which +occurs only once every 10 minutes, or whenever the disk is forced to spin up by +a cache miss. The disk can then be spun down in the periods of inactivity.
+ +If you want to find out which process caused the disk to spin up, you can +gather information by setting the flag /proc/sys/vm/block_dump. When this flag +is set, Linux reports all disk read and write operations that take place, and +all block dirtyings done to files. This makes it possible to debug why a disk +needs to spin up, and to increase battery life even more. The output of +block_dump is written to the kernel output, and it can be retrieved using +"dmesg". When you use block_dump, you may want to turn off klogd, otherwise +the output of block_dump will be logged, causing disk activity that is not +normally there. + +If 10 minutes is too much or too little downtime for you, you can configure +this downtime as follows. In the control script, set the MAX_AGE value to the +maximum number of seconds of disk downtime that you would like. You should +then set your filesystem's commit interval to the same value. The dirty ratio +is also configurable from the control script. + +If you don't like the idea of the control script remounting your filesystems +for you, you can change DO_REMOUNTS to 0 in the script. + +Thanks to Kiko Piris, the control script can be used to enable laptop mode on +both the Linux 2.4 and 2.6 series. + + +Tips & Tricks +------------- + +* Bartek Kania reports getting up to 50 minutes of extra battery life (on top + of his regular 3 to 3.5 hours) using very aggressive power management (hdparm + -B1) and a spindown time of 5 seconds (hdparm -S1). + +* You can spin down the disk while playing MP3, by setting the disk readahead + to 8MB (hdparm -a 16384). Effectively, the disk will read a complete MP3 at + once, and will then spin down while the MP3 is playing. (Thanks to Bartek + Kania.) + +* Drew Scott Daniels observed: "I don't know why, but when I decrease the number + of colours that my display uses it consumes less battery power. I've seen + this on powerbooks too. 
I hope that this is a piece of information that + might be useful to the Laptop Mode patch or it's users." + +* One thing which will cause disks to spin up is not-present application + and dynamic library text pages. The kernel will load program text off disk + on-demand, so each time you invoke an application feature for the first + time, the kernel needs to spin the disk up to go and fetch that part of the + application. + + So it is useful to increase the disk readahead parameter greatly, so that + the kernel will pull all of the executable's pages into memory on the first + pagefault. + + The supplied script does this. + +* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the + file after every logging. When you're using laptop-mode and your disk doesn't + spin down, this is a likely culprit. + +* Richard Atterer observed that laptop mode does not work well with noflushd + (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode + from doing its thing. + + +Control script +-------------- + +Please note that this control script works for the Linux 2.4 and 2.6 series. + +--------------------CONTROL SCRIPT BEGIN------------------------------------------ +#! /bin/sh + +# start or stop laptop_mode, best run by a power management daemon when +# ac gets connected/disconnected from a laptop +# +# install as /sbin/laptop_mode +# +# Contributors to this script: Kiko Piris +# Bart Samwel +# Micha Feigin +# Andrew Morton +# Dax Kelson +# +# Original Linux 2.4 version by: Jens Axboe + +# Remove an option (the first parameter) of the form option=NNN from +# a mount options string (the rest of the parameters). +parse_mount_opts () { + OPT="$1" + shift + echo "$*" | \ + sed 's/.*/,&,/' | \ + sed 's/,'"$OPT"'=[0-9]*,/,/g' | \ + sed 's/,,*/,/g' | \ + sed 's/^,//' | \ + sed 's/,$//' | \ + cat - +} + +# Remove an option (the first parameter) without any arguments from +# a mount option string (the rest of the parameters).
+parse_nonumber_mount_opts () { + OPT="$1" + shift + echo "$*" | \ + sed 's/.*/,&,/' | \ + sed 's/,'"$OPT"',/,/g' | \ + sed 's/,,*/,/g' | \ + sed 's/^,//' | \ + sed 's/,$//' | \ + cat - +} + +# Find out the state of a yes/no option (e.g. "atime"/"noatime") in +# fstab for a given filesystem, and use this state to replace the +# value of the option in another mount options string. The device +# is the first argument, the option name the second, and the default +# value the third. The remainder is the mount options string. +# +# Example: +# parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime +# +# If fstab contains, say, "rw" for this filesystem, then the result +# will be "defaults,atime". +parse_yesno_opts_wfstab () { + L_DEV=$1 + shift + OPT=$1 + shift + DEF_OPT=$1 + shift + L_OPTS="$*" + PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)" + PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)" + # Watch for a default atime in fstab + FSTAB_OPTS="$(cat /etc/fstab | sed 's/ / /g' | grep ^\ *"$L_DEV " | awk '{ print $4 }')" + if [ -z "$(echo "$FSTAB_OPTS" | grep "$OPT")" ] ; then + # option not specified in fstab -- choose the default. + echo "$PARSEDOPTS1,$DEF_OPT" + else + # option specified in fstab: extract the value and use it + if [ -z "$(echo "$FSTAB_OPTS" | grep "no$OPT")" ] ; then + # no$OPT not found -- so we must have $OPT. + echo "$PARSEDOPTS1,$OPT" + else + echo "$PARSEDOPTS1,no$OPT" + fi + fi +} + +# Find out the state of a numbered option (e.g. "commit=NNN") in +# fstab for a given filesystem, and use this state to replace the +# value of the option in another mount options string. The device +# is the first argument, and the option name the second. The +# remainder is the mount options string in which the replacement +# must be done. +# +# Example: +# parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7 +# +# If fstab contains, say, "commit=3,rw" for this filesystem, then the +# result will be "defaults,commit=3".
+parse_mount_opts_wfstab () { + L_DEV=$1 + shift + OPT=$1 + shift + L_OPTS="$*" + + PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)" + # Watch for a default commit in fstab + FSTAB_OPTS="$(cat /etc/fstab | sed 's/ / /g' | grep ^\ *"$L_DEV " | awk '{ print $4 }')" + if [ -z "$(echo "$FSTAB_OPTS" | grep "$OPT=")" ] ; then + # option not specified in fstab: set it to 0 + echo "$PARSEDOPTS1,$OPT=0" + else + # option specified in fstab: extract the value, and use it + echo -n "$PARSEDOPTS1,$OPT=" + echo "$FSTAB_OPTS" | \ + sed 's/.*/,&,/' | \ + sed 's/.*,'"$OPT"'=//' | \ + sed 's/,.*//' | \ + cat - + fi +} + +KLEVEL="$(uname -r | cut -c1-3)" +case "$KLEVEL" in + "2.4"|"2.6") + true + ;; + *) + echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" + exit 1 + ;; +esac + +# Shall we remount journaled fs. with appropriate commit interval? (1=yes) +DO_REMOUNTS=1 + +# age time, in seconds. should be put into a sysconfig file +MAX_AGE=600 + +# Dirty synchronous ratio. At this percentage of dirty pages the process which +# calls write() does its own writeback +DIRTY_RATIO=40 + +# +# Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been +# exceeded, the kernel will wake pdflush which will then reduce the amount +# of dirty memory to dirty_background_ratio. Set this nice and low, so once +# some writeout has commenced, we do a lot of it. +# +DIRTY_BACKGROUND_RATIO=5 + +READAHEAD=4096 # kilobytes + +# kernel default dirty buffer age +DEF_AGE=30 +DEF_UPDATE=5 +DEF_DIRTY_BACKGROUND_RATIO=10 +DEF_DIRTY_RATIO=40 +DEF_XFS_AGE_BUFFER=15 +DEF_XFS_SYNC_INTERVAL=30 + +# This must be adjusted manually to the value of HZ in the running kernel, +# until the XFS people change their external interfaces to work in centisecs +# like the rest of the external world. Unfortunately this cannot be automated. :( +XFS_HZ=1000 + +if [ ! -e /proc/sys/vm/laptop_mode ]; then + echo "Kernel is not patched with laptop_mode patch." + exit 1 +fi + +if [ !
-w /proc/sys/vm/laptop_mode ]; then + echo "You do not have enough privileges to enable laptop_mode." + exit 1 +fi + +case "$1" in + start) + AGE=$((100*$MAX_AGE)) + XFS_AGE=$(($XFS_HZ*$MAX_AGE)) + echo -n "Starting laptop_mode" + + if [ -d /proc/sys/vm/pagebuf ] ; then + # This only needs to be set, not reset -- it is only used when + # laptop mode is enabled. + echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age + echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval + elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then + # The same goes for these. + echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer + echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval + elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then + # But not for these -- they are also used in normal + # operation. + echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer + echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval + fi + + case "$KLEVEL" in + "2.4") + echo "1" > /proc/sys/vm/laptop_mode + echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush + ;; + "2.6") + echo "5" > /proc/sys/vm/laptop_mode + echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs + echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs + echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio + echo "$DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio + ;; + esac + if [ $DO_REMOUNTS -eq 1 ]; then + cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do + # For ext3/reiserfs, strip any existing commit=N so that commit=$MAX_AGE takes effect. + case "$FST" in + "ext3"|"reiserfs") + PARSEDOPTS="$(parse_mount_opts commit "$OPTS")" + mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE,noatime + ;; + "xfs") + mount $DEV -t $FST $MP -o remount,$OPTS,noatime + ;; + esac + if [ -b $DEV ] ; then + blockdev --setra $(($READAHEAD * 2)) $DEV + fi + done + fi + echo "." + ;; + stop) + U_AGE=$((100*$DEF_UPDATE)) + B_AGE=$((100*$DEF_AGE)) + echo -n "Stopping laptop_mode" + echo "0" > /proc/sys/vm/laptop_mode + if [ -f /proc/sys/fs/xfs/age_buffer ] && [ !
-f /proc/sys/fs/xfs/lm_age_buffer ] ; then + # Without the lm_* sysctls, "start" overwrote the regular age_buffer/sync_interval values, so restore the defaults. + echo "$(($XFS_HZ*$DEF_XFS_AGE_BUFFER))" > /proc/sys/fs/xfs/age_buffer + echo "$(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL))" > /proc/sys/fs/xfs/sync_interval + fi + case "$KLEVEL" in + "2.4") + echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush + ;; + "2.6") + echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs + echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs + echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio + echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio + ;; + esac + if [ $DO_REMOUNTS -eq 1 ]; then + cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do + # Reset commit and atime to what fstab specifies, falling back to commit=0 and atime. + case "$FST" in + "ext3"|"reiserfs") + PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)" + PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)" + mount $DEV -t $FST $MP -o remount,$PARSEDOPTS + ;; + "xfs") + PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)" + mount $DEV -t $FST $MP -o remount,$PARSEDOPTS + ;; + esac + if [ -b $DEV ] ; then + blockdev --setra 256 $DEV + fi + done + fi + echo "." + ;; + *) + echo "Usage: $0 {start|stop}" + ;; + +esac + +exit 0 + +--------------------CONTROL SCRIPT END-------------------------------------------- + + +ACPI integration +---------------- + +Dax Kelson submitted this so that the ACPI acpid daemon will +kick off the laptop_mode script and run hdparm.
+ +---------------------------/etc/acpi/events/ac_adapter BEGIN------------------------------------------- +event=ac_adapter +action=/etc/acpi/actions/battery.sh +---------------------------/etc/acpi/events/ac_adapter END------------------------------------------- + +---------------------------/etc/acpi/actions/battery.sh BEGIN------------------------------------------- +#!/bin/sh + +# cpu throttling +# cat /proc/acpi/processor/CPU0/throttling for more info +ACAD_THR=0 +BATT_THR=2 + +# spindown time for HD (man hdparm for valid values) +# I prefer 2 hours for acad and 20 seconds for batt +ACAD_HD=244 +BATT_HD=4 + +# ac/battery event handler + +status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/AC/state` + +case $status in + "on-line") + echo "Setting HD spindown to 2 hours" + /sbin/laptop_mode stop + /sbin/hdparm -S $ACAD_HD /dev/hda > /dev/null 2>&1 + /sbin/hdparm -B 255 /dev/hda > /dev/null 2>&1 + #echo -n $ACAD_CPU:$ACAD_THR > /proc/acpi/processor/CPU0/limit + exit 0 + ;; + "off-line") + echo "Setting HD spindown to 20 seconds" + /sbin/laptop_mode start + /sbin/hdparm -S $BATT_HD /dev/hda > /dev/null 2>&1 + /sbin/hdparm -B 1 /dev/hda > /dev/null 2>&1 + #echo -n $BATT_CPU:$BATT_THR > /proc/acpi/processor/CPU0/limit + exit 0 + ;; +esac +---------------------------/etc/acpi/actions/battery.sh END------------------------------------------- + +Monitoring tool +--------------- + +Bartek Kania submitted this, it can be used to measure how much time your disk +spends spun up/down. + +---------------------------dslm.c BEGIN------------------------------------------- +/* + * Simple Disk Sleep Monitor + * by Bartek Kania + * Licenced under the GPL + */ +#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <time.h> +#include <string.h> +#include <signal.h> +#include <sys/ioctl.h> +#include <linux/hdreg.h> + +#ifdef DEBUG +#define D(x) x +#else +#define D(x) +#endif + +int endit = 0; + +/* Check if the disk is in powersave-mode + * Most of the code is stolen from hdparm.
+ * 1 = active, 0 = standby/sleep, -1 = unknown */ +int check_powermode(int fd) +{ + unsigned char args[4] = {WIN_CHECKPOWERMODE1,0,0,0}; + int state; + + if (ioctl(fd, HDIO_DRIVE_CMD, &args) + && (args[0] = WIN_CHECKPOWERMODE2) /* try again with 0x98 */ + && ioctl(fd, HDIO_DRIVE_CMD, &args)) { + if (errno != EIO || args[0] != 0 || args[1] != 0) { + state = -1; /* "unknown"; */ + } else + state = 0; /* "sleeping"; */ + } else { + state = (args[2] == 255) ? 1 : 0; + } + D(printf(" drive state is: %d\n", state)); + + return state; +} + +char *state_name(int i) +{ + if (i == -1) return "unknown"; + if (i == 0) return "sleeping"; + if (i == 1) return "active"; + + return "internal error"; +} + +char *myctime(time_t time) +{ + char *ts = ctime(&time); + ts[strlen(ts) - 1] = 0; + + return ts; +} + +void measure(int fd) +{ + time_t start_time; + int last_state; + time_t last_time; + int curr_state; + time_t curr_time = 0; + time_t time_diff; + time_t active_time = 0; + time_t sleep_time = 0; + time_t unknown_time = 0; + time_t total_time = 0; + int changes = 0; + float tmp; + + printf("Starting measurements\n"); + + last_state = check_powermode(fd); + start_time = last_time = time(0); + printf(" System is in state %s\n\n", state_name(last_state)); + + while(!endit) { + sleep(1); + curr_state = check_powermode(fd); + + if (curr_state != last_state || endit) { + changes++; + curr_time = time(0); + time_diff = curr_time - last_time; + + if (last_state == 1) active_time += time_diff; + else if (last_state == 0) sleep_time += time_diff; + else unknown_time += time_diff; + + last_state = curr_state; + last_time = curr_time; + + printf("%s: State-change to %s\n", myctime(curr_time), + state_name(curr_state)); + } + } + changes--; /* Compensate for SIGINT */ + + total_time = time(0) - start_time; + printf("\nTotal running time: %lus\n", curr_time - start_time); + printf(" State changed %d times\n", changes); + + tmp = (float)sleep_time / (float)total_time * 100; + printf(" Time 
in sleep state: %lus (%.2f%%)\n", sleep_time, tmp); + tmp = (float)active_time / (float)total_time * 100; + printf(" Time in active state: %lus (%.2f%%)\n", active_time, tmp); + tmp = (float)unknown_time / (float)total_time * 100; + printf(" Time in unknown state: %lus (%.2f%%)\n", unknown_time, tmp); +} + +void ender(int s) +{ + endit = 1; +} + +void usage() +{ + puts("usage: dslm [-w