From 243c64b2cfea7e49e074c80db65fa7b90d765c6f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:39:51 -0700 Subject: [PATCH] feed devfs through Lindent Nobody seems to have any outstanding work against devfs, so... --- include/linux/devfs_fs.h | 32 +++++++++++++++----------------- include/linux/devfs_fs_kernel.h | 26 +++++++++++++------------- 2 files changed, 28 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/devfs_fs.h b/include/linux/devfs_fs.h index 48da59012021..de236f431877 100644 --- a/include/linux/devfs_fs.h +++ b/include/linux/devfs_fs.h @@ -22,22 +22,20 @@ #define DEVFSD_NOTIFY_CREATE 6 #define DEVFSD_NOTIFY_DELETE 7 -#define DEVFS_PATHLEN 1024 /* Never change this otherwise the - binary interface will change */ - -struct devfsd_notify_struct -{ /* Use native C types to ensure same types in kernel and user space */ - unsigned int type; /* DEVFSD_NOTIFY_* value */ - unsigned int mode; /* Mode of the inode or device entry */ - unsigned int major; /* Major number of device entry */ - unsigned int minor; /* Minor number of device entry */ - unsigned int uid; /* Uid of process, inode or device entry */ - unsigned int gid; /* Gid of process, inode or device entry */ - unsigned int overrun_count; /* Number of lost events */ - unsigned int namelen; /* Number of characters not including '\0' */ - /* The device name MUST come last */ - char devname[DEVFS_PATHLEN]; /* This will be '\0' terminated */ +#define DEVFS_PATHLEN 1024 /* Never change this otherwise the + binary interface will change */ + +struct devfsd_notify_struct { /* Use native C types to ensure same types in kernel and user space */ + unsigned int type; /* DEVFSD_NOTIFY_* value */ + unsigned int mode; /* Mode of the inode or device entry */ + unsigned int major; /* Major number of device entry */ + unsigned int minor; /* Minor number of device entry */ + unsigned int uid; /* Uid of process, inode or device entry */ + unsigned int gid; /* Gid 
of process, inode or device entry */ + unsigned int overrun_count; /* Number of lost events */ + unsigned int namelen; /* Number of characters not including '\0' */ + /* The device name MUST come last */ + char devname[DEVFS_PATHLEN]; /* This will be '\0' terminated */ }; - -#endif /* _LINUX_DEVFS_FS_H */ +#endif /* _LINUX_DEVFS_FS_H */ diff --git a/include/linux/devfs_fs_kernel.h b/include/linux/devfs_fs_kernel.h index 16c78f54f427..89810e73d256 100644 --- a/include/linux/devfs_fs_kernel.h +++ b/include/linux/devfs_fs_kernel.h @@ -12,18 +12,18 @@ #ifdef CONFIG_DEVFS_FS extern int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...) - __attribute__((format (printf, 3, 4))); + __attribute__ ((format(printf, 3, 4))); extern int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...) - __attribute__((format (printf, 3, 4))); + __attribute__ ((format(printf, 3, 4))); extern int devfs_mk_symlink(const char *name, const char *link); extern int devfs_mk_dir(const char *fmt, ...) - __attribute__((format (printf, 1, 2))); + __attribute__ ((format(printf, 1, 2))); extern void devfs_remove(const char *fmt, ...) - __attribute__((format (printf, 1, 2))); + __attribute__ ((format(printf, 1, 2))); extern int devfs_register_tape(const char *name); extern void devfs_unregister_tape(int num); extern void mount_devfs_fs(void); -#else /* CONFIG_DEVFS_FS */ +#else /* CONFIG_DEVFS_FS */ static inline int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...) { return 0; @@ -32,9 +32,9 @@ static inline int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...) { return 0; } -static inline int devfs_mk_symlink (const char *name, const char *link) +static inline int devfs_mk_symlink(const char *name, const char *link) { - return 0; + return 0; } static inline int devfs_mk_dir(const char *fmt, ...) { @@ -43,16 +43,16 @@ static inline int devfs_mk_dir(const char *fmt, ...) static inline void devfs_remove(const char *fmt, ...) 
{ } -static inline int devfs_register_tape (const char *name) +static inline int devfs_register_tape(const char *name) { - return -1; + return -1; } static inline void devfs_unregister_tape(int num) { } -static inline void mount_devfs_fs (void) +static inline void mount_devfs_fs(void) { - return; + return; } -#endif /* CONFIG_DEVFS_FS */ -#endif /* _LINUX_DEVFS_FS_KERNEL_H */ +#endif /* CONFIG_DEVFS_FS */ +#endif /* _LINUX_DEVFS_FS_KERNEL_H */ -- cgit v1.2.3 From 0eb217f9b539fccf5aafaba8c9a06e170825f68b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:40:05 -0700 Subject: [PATCH] generalise system_running From: Olof Johansson It's currently a boolean, but that means that system_running goes to zero again when shutting down. So we then use code (in the page allocator) which is only designed to be used during bootup - it is marked __init. So we need to be able to distinguish early boot state from late shutdown state. Rename system_running to system_state and give it the three appropriate states. --- arch/ppc/platforms/pmac_nvram.c | 8 ++++---- include/linux/kernel.h | 8 +++++++- init/main.c | 8 ++------ kernel/kmod.c | 2 +- kernel/printk.c | 3 ++- kernel/sched.c | 3 ++- kernel/sys.c | 8 ++++---- mm/page_alloc.c | 2 +- 8 files changed, 23 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/ppc/platforms/pmac_nvram.c b/arch/ppc/platforms/pmac_nvram.c index f381f3f745f9..3b3f984fb929 100644 --- a/arch/ppc/platforms/pmac_nvram.c +++ b/arch/ppc/platforms/pmac_nvram.c @@ -154,11 +154,11 @@ static unsigned char __pmac pmu_nvram_read_byte(int addr) struct adb_request req; DECLARE_COMPLETION(req_complete); - req.arg = system_running ? &req_complete : NULL; + req.arg = system_state == SYSTEM_RUNNING ? 
&req_complete : NULL; if (pmu_request(&req, pmu_nvram_complete, 3, PMU_READ_NVRAM, (addr >> 8) & 0xff, addr & 0xff)) return 0xff; - if (system_running) + if (system_state == SYSTEM_RUNNING) wait_for_completion(&req_complete); while (!req.complete) pmu_poll(); @@ -170,11 +170,11 @@ static void __pmac pmu_nvram_write_byte(int addr, unsigned char val) struct adb_request req; DECLARE_COMPLETION(req_complete); - req.arg = system_running ? &req_complete : NULL; + req.arg = system_state == SYSTEM_RUNNING ? &req_complete : NULL; if (pmu_request(&req, pmu_nvram_complete, 4, PMU_WRITE_NVRAM, (addr >> 8) & 0xff, addr & 0xff, val)) return; - if (system_running) + if (system_state == SYSTEM_RUNNING) wait_for_completion(&req_complete); while (!req.complete) pmu_poll(); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e11e79199357..c1171e77c76b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -109,9 +109,15 @@ static inline void console_verbose(void) extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_on_oops; -extern int system_running; +extern int system_state; /* See values below */ extern int tainted; extern const char *print_tainted(void); + +/* Values used for system_state */ +#define SYSTEM_BOOTING 0 +#define SYSTEM_RUNNING 1 +#define SYSTEM_SHUTDOWN 2 + #define TAINT_PROPRIETARY_MODULE (1<<0) #define TAINT_FORCED_MODULE (1<<1) #define TAINT_UNSAFE_SMP (1<<2) diff --git a/init/main.c b/init/main.c index 9d1ed1de14c5..348ce7db30f3 100644 --- a/init/main.c +++ b/init/main.c @@ -94,11 +94,7 @@ extern void driver_init(void); extern void tc_init(void); #endif -/* - * Are we up and running (ie do we have all the infrastructure - * set up) - */ -int system_running; +int system_state; /* SYSTEM_BOOTING/RUNNING/SHUTDOWN */ /* * Boot command-line arguments @@ -613,7 +609,7 @@ static int init(void * unused) */ free_initmem(); unlock_kernel(); - 
system_running = 1; + system_state = SYSTEM_RUNNING; if (sys_open("/dev/console", O_RDWR, 0) < 0) printk("Warning: unable to open an initial console.\n"); diff --git a/kernel/kmod.c b/kernel/kmod.c index 5261de82029b..0002fcd4c554 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -249,7 +249,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait) }; DECLARE_WORK(work, __call_usermodehelper, &sub_info); - if (!system_running) + if (system_state != SYSTEM_RUNNING) return -EBUSY; if (path[0] == '\0') diff --git a/kernel/printk.c b/kernel/printk.c index a7be1f922f34..5f2b3c9bbd6e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -522,7 +522,8 @@ asmlinkage int printk(const char *fmt, ...) log_level_unknown = 1; } - if (!cpu_online(smp_processor_id()) && !system_running) { + if (!cpu_online(smp_processor_id()) && + system_state != SYSTEM_RUNNING) { /* * Some console drivers may assume that per-cpu resources have * been allocated. So don't allow them to be called by this diff --git a/kernel/sched.c b/kernel/sched.c index d5f21712ffbb..9e19d4c0d4a9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2982,7 +2982,8 @@ void __might_sleep(char *file, int line) #if defined(in_atomic) static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && system_running) { + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING) { if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; diff --git a/kernel/sys.c b/kernel/sys.c index 33a14e13079e..bc498b12edcc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -436,7 +436,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user switch (cmd) { case LINUX_REBOOT_CMD_RESTART: notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "Restarting system.\n"); machine_restart(NULL); @@ -452,7 +452,7 @@ asmlinkage 
long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_HALT: notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "System halted.\n"); machine_halt(); @@ -462,7 +462,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_POWER_OFF: notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); @@ -478,7 +478,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user buffer[sizeof(buffer) - 1] = '\0'; notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); machine_restart(buffer); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5d035d836c15..9764a4e78e45 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -734,7 +734,7 @@ fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int orde struct page * page; #ifdef CONFIG_NUMA - if (unlikely(!system_running)) + if (unlikely(system_state == SYSTEM_BOOTING)) return get_boot_pages(gfp_mask, order); #endif page = alloc_pages(gfp_mask, order); -- cgit v1.2.3 From efffe9c8536bf9ee28f2f381bd285824bedcdbcd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:40:55 -0700 Subject: [PATCH] Fix VT open/close race The race is that con_close() can sleep, and drops the BKL while tty->count==1. But another thread can come into init_dev() and will take a new ref against the tty and start using it. But con_close() doesn't notice that new ref and proceeds to null out tty->driver_data while someone else is using the resurrected tty. So the patch serialises con_close() against init_dev() with tty_sem. 
Here's a test app which reproduced the oops instantly on 2-way. It really needs to be run against all tty-capable devices. /* * Run this against a tty which nobody currently has open, such as /dev/tty9 */ #include #include #include #include #include #include void doit(char *filename) { int fd,x; fd = open(filename, O_RDWR); if (fd < 0) { perror("open"); exit(1); } ioctl(fd, KDKBDREP, &x); close(fd); } main(int argc, char *argv[]) { char *filename = argv[1]; for ( ; ; ) doit(filename); } --- drivers/char/tty_io.c | 2 +- drivers/char/vt.c | 14 ++++++++++++++ include/linux/tty.h | 3 +++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 6bb5ae7e41a5..0ba52078f637 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -123,7 +123,7 @@ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ struct tty_ldisc ldiscs[NR_LDISCS]; /* line disc dispatch table */ /* Semaphore to protect creating and releasing a tty */ -static DECLARE_MUTEX(tty_sem); +DECLARE_MUTEX(tty_sem); #ifdef CONFIG_UNIX98_PTYS extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ diff --git a/drivers/char/vt.c b/drivers/char/vt.c index a5ddfc5ac9c1..2febed52e19f 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -2480,8 +2480,16 @@ static int con_open(struct tty_struct *tty, struct file *filp) return ret; } +/* + * We take tty_sem in here to prevent another thread from coming in via init_dev + * and taking a ref against the tty while we're in the process of forgetting + * about it and cleaning things up. + * + * This is because vcs_remove_devfs() can sleep and will drop the BKL.
+ */ static void con_close(struct tty_struct *tty, struct file *filp) { + down(&tty_sem); acquire_console_sem(); if (tty && tty->count == 1) { struct vt_struct *vt; @@ -2492,9 +2500,15 @@ static void con_close(struct tty_struct *tty, struct file *filp) tty->driver_data = 0; release_console_sem(); vcs_remove_devfs(tty); + up(&tty_sem); + /* + * tty_sem is released, but we still hold BKL, so there is + * still exclusion against init_dev() + */ return; } release_console_sem(); + up(&tty_sem); } static void vc_init(unsigned int currcons, unsigned int rows, diff --git a/include/linux/tty.h b/include/linux/tty.h index fbcc401e8b28..6e61f3b27157 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -363,6 +363,9 @@ extern void tty_flip_buffer_push(struct tty_struct *tty); extern int tty_get_baud_rate(struct tty_struct *tty); extern int tty_termios_baud_rate(struct termios *termios); +struct semaphore; +extern struct semaphore tty_sem; + /* n_tty.c */ extern struct tty_ldisc tty_ldisc_N_TTY; -- cgit v1.2.3 From ee28db843649533f5650186251ae4a8bd49a3da9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:41:07 -0700 Subject: [PATCH] i4l: kernelcapi receive workqueue and locking rework From: Armin Schindler With this patch the ISDN kernel CAPI code uses a per application workqueue with proper locking to prevent message re-ordering due to the fact a workqueue may run on another CPU at the same time. Also some locks for internal data are added. Removed global recv_queue work, use per application workqueue. Added proper locking mechanisms for application, controller and application workqueue function. Increased max. number of possible applications and controllers.
--- drivers/isdn/capi/kcapi.c | 96 ++++++++++++++++++++++++++++++++-------------- include/linux/kernelcapi.h | 11 ++++-- 2 files changed, 75 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 064dc3003716..8524997b10b6 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -1,4 +1,4 @@ -/* $Id: kcapi.c,v 1.1.2.7 2004/03/16 08:01:47 armin Exp $ +/* $Id: kcapi.c,v 1.1.2.8 2004/03/26 19:57:20 armin Exp $ * * Kernel CAPI 2.0 Module * @@ -31,7 +31,7 @@ #include #endif -static char *revision = "$Revision: 1.1.2.7 $"; +static char *revision = "$Revision: 1.1.2.8 $"; /* ------------------------------------------------------------- */ @@ -63,13 +63,13 @@ static char capi_manufakturer[64] = "AVM Berlin"; LIST_HEAD(capi_drivers); rwlock_t capi_drivers_list_lock = RW_LOCK_UNLOCKED; +static rwlock_t application_lock = RW_LOCK_UNLOCKED; +static DECLARE_MUTEX(controller_sem); + struct capi20_appl *capi_applications[CAPI_MAXAPPL]; struct capi_ctr *capi_cards[CAPI_MAXCONTR]; static int ncards; -static struct sk_buff_head recv_queue; - -static struct work_struct tq_recv_notify; /* -------- controller ref counting -------------------------------------- */ @@ -174,7 +174,7 @@ static void notify_up(u32 contr) for (applid = 1; applid <= CAPI_MAXAPPL; applid++) { ap = get_capi_appl_by_nr(applid); - if (ap && ap->callback) + if (ap && ap->callback && !ap->release_in_progress) ap->callback(KCI_CONTRUP, contr, &card->profile); } } @@ -192,7 +192,7 @@ static void notify_down(u32 contr) for (applid = 1; applid <= CAPI_MAXAPPL; applid++) { ap = get_capi_appl_by_nr(applid); - if (ap && ap->callback) + if (ap && ap->callback && !ap->release_in_progress) ap->callback(KCI_CONTRDOWN, contr, 0); } } @@ -237,38 +237,39 @@ static int notify_push(unsigned int cmd, u32 controller, u16 applid, u32 ncci) /* -------- Receiver ------------------------------------------ */ -static void recv_handler(void 
*dummy) +static void recv_handler(void *_ap) { struct sk_buff *skb; - struct capi20_appl *ap; + struct capi20_appl *ap = (struct capi20_appl *) _ap; - while ((skb = skb_dequeue(&recv_queue)) != 0) { - ap = get_capi_appl_by_nr(CAPIMSG_APPID(skb->data)); - if (!ap) { - printk(KERN_ERR "kcapi: recv_handler: applid %d ? (%s)\n", - CAPIMSG_APPID(skb->data), capi_message2str(skb->data)); - kfree_skb(skb); - continue; - } + if ((!ap) || (ap->release_in_progress)) + return; + down(&ap->recv_sem); + while ((skb = skb_dequeue(&ap->recv_queue))) { if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_IND) ap->nrecvdatapkt++; else ap->nrecvctlpkt++; + ap->recv_message(ap, skb); } + up(&ap->recv_sem); } void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb) { + struct capi20_appl *ap; int showctl = 0; u8 cmd, subcmd; + unsigned long flags; if (card->cardstate != CARD_RUNNING) { printk(KERN_INFO "kcapi: controller %d not active, got: %s", card->cnr, capi_message2str(skb->data)); goto error; } + cmd = CAPIMSG_COMMAND(skb->data); subcmd = CAPIMSG_SUBCOMMAND(skb->data); if (cmd == CAPI_DATA_B3 && subcmd == CAPI_IND) { @@ -293,8 +294,19 @@ void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *s } } - skb_queue_tail(&recv_queue, skb); - schedule_work(&tq_recv_notify); + + read_lock_irqsave(&application_lock, flags); + ap = get_capi_appl_by_nr(CAPIMSG_APPID(skb->data)); + if ((!ap) || (ap->release_in_progress)) { + read_unlock_irqrestore(&application_lock, flags); + printk(KERN_ERR "kcapi: handle_message: applid %d state released (%s)\n", + CAPIMSG_APPID(skb->data), capi_message2str(skb->data)); + goto error; + } + skb_queue_tail(&ap->recv_queue, skb); + schedule_work(&ap->recv_work); + read_unlock_irqrestore(&application_lock, flags); + return; error: @@ -310,11 +322,13 @@ void capi_ctr_ready(struct capi_ctr * card) card->cardstate = CARD_RUNNING; + down(&controller_sem); for (appl = 1; appl <= CAPI_MAXAPPL; appl++) { ap = 
get_capi_appl_by_nr(appl); - if (!ap) continue; + if (!ap || ap->release_in_progress) continue; register_appl(card, appl, &ap->rparam); } + up(&controller_sem); printk(KERN_NOTICE "kcapi: card %d \"%s\" ready.\n", card->cnr, card->name); @@ -342,7 +356,7 @@ void capi_ctr_reseted(struct capi_ctr * card) for (appl = 1; appl <= CAPI_MAXAPPL; appl++) { struct capi20_appl *ap = get_capi_appl_by_nr(appl); - if (!ap) + if (!ap || ap->release_in_progress) continue; capi_ctr_put(card); @@ -382,16 +396,21 @@ attach_capi_ctr(struct capi_ctr *card) { int i; + down(&controller_sem); + for (i = 0; i < CAPI_MAXCONTR; i++) { if (capi_cards[i] == NULL) break; } if (i == CAPI_MAXCONTR) { + up(&controller_sem); printk(KERN_ERR "kcapi: out of controller slots\n"); return -EBUSY; } capi_cards[i] = card; + up(&controller_sem); + card->nrecvctlpkt = 0; card->nrecvdatapkt = 0; card->nsentctlpkt = 0; @@ -480,18 +499,23 @@ u16 capi20_register(struct capi20_appl *ap) { int i; u16 applid; + unsigned long flags; DBG(""); if (ap->rparam.datablklen < 128) return CAPI_LOGBLKSIZETOSMALL; + write_lock_irqsave(&application_lock, flags); + for (applid = 1; applid <= CAPI_MAXAPPL; applid++) { if (capi_applications[applid - 1] == NULL) break; } - if (applid > CAPI_MAXAPPL) + if (applid > CAPI_MAXAPPL) { + write_unlock_irqrestore(&application_lock, flags); return CAPI_TOOMANYAPPLS; + } ap->applid = applid; capi_applications[applid - 1] = ap; @@ -501,12 +525,21 @@ u16 capi20_register(struct capi20_appl *ap) ap->nsentctlpkt = 0; ap->nsentdatapkt = 0; ap->callback = 0; + init_MUTEX(&ap->recv_sem); + skb_queue_head_init(&ap->recv_queue); + INIT_WORK(&ap->recv_work, recv_handler, (void *)ap); + ap->release_in_progress = 0; + + write_unlock_irqrestore(&application_lock, flags); + down(&controller_sem); for (i = 0; i < CAPI_MAXCONTR; i++) { if (!capi_cards[i] || capi_cards[i]->cardstate != CARD_RUNNING) continue; register_appl(capi_cards[i], applid, &ap->rparam); } + up(&controller_sem); + if (showcapimsgs & 
1) { printk(KERN_DEBUG "kcapi: appl %d up\n", applid); } @@ -519,15 +552,26 @@ EXPORT_SYMBOL(capi20_register); u16 capi20_release(struct capi20_appl *ap) { int i; + unsigned long flags; DBG("applid %#x", ap->applid); + write_lock_irqsave(&application_lock, flags); + ap->release_in_progress = 1; + capi_applications[ap->applid - 1] = NULL; + write_unlock_irqrestore(&application_lock, flags); + + down(&controller_sem); for (i = 0; i < CAPI_MAXCONTR; i++) { if (!capi_cards[i] || capi_cards[i]->cardstate != CARD_RUNNING) continue; release_appl(capi_cards[i], ap->applid); } - capi_applications[ap->applid - 1] = NULL; + up(&controller_sem); + + flush_scheduled_work(); + skb_queue_purge(&ap->recv_queue); + if (showcapimsgs & 1) { printk(KERN_DEBUG "kcapi: appl %d down\n", ap->applid); } @@ -547,7 +591,7 @@ u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb) if (ncards == 0) return CAPI_REGNOTINSTALLED; - if (ap->applid == 0) + if ((ap->applid == 0) || ap->release_in_progress) return CAPI_ILLAPPNR; if (skb->len < 12 || !capi_cmd_valid(CAPIMSG_COMMAND(skb->data)) @@ -925,10 +969,6 @@ static int __init kcapi_init(void) char *p; char rev[32]; - skb_queue_head_init(&recv_queue); - - INIT_WORK(&tq_recv_notify, recv_handler, NULL); - kcapi_proc_init(); if ((p = strchr(revision, ':')) != 0 && p[1]) { diff --git a/include/linux/kernelcapi.h b/include/linux/kernelcapi.h index b982d5b77ae9..1d4b1b15d0b8 100644 --- a/include/linux/kernelcapi.h +++ b/include/linux/kernelcapi.h @@ -10,10 +10,8 @@ #ifndef __KERNELCAPI_H__ #define __KERNELCAPI_H__ -#include - -#define CAPI_MAXAPPL 128 /* maximum number of applications */ -#define CAPI_MAXCONTR 16 /* maximum number of controller */ +#define CAPI_MAXAPPL 240 /* maximum number of applications */ +#define CAPI_MAXCONTR 32 /* maximum number of controller */ #define CAPI_MAXDATAWINDOW 8 @@ -47,6 +45,7 @@ typedef struct kcapi_carddef { #ifdef __KERNEL__ +#include #include #define KCI_CONTRUP 0 /* arg: struct capi_profile */ @@ 
-63,6 +62,10 @@ struct capi20_appl { unsigned long nrecvdatapkt; unsigned long nsentctlpkt; unsigned long nsentdatapkt; + struct semaphore recv_sem; + struct sk_buff_head recv_queue; + struct work_struct recv_work; + int release_in_progress; /* ugly hack to allow for notification of added/removed * controllers. The Right Way (tm) is known. XXX -- cgit v1.2.3 From b283f09cf8f51c29bf90e42e22099f76d0f33378 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:41:20 -0700 Subject: [PATCH] Fix get_wchan() FIXME wrt. order of functions From: William Lee Irwin III This addresses the issue with get_wchan() that the various functions acting as scheduling-related primitives are not, in fact, contiguous in the text segment. It creates an ELF section for scheduling primitives to be placed in, and places currently-detected (i.e. skipped during stack decoding) scheduling primitives and others like io_schedule() and down(), which are currently missed by get_wchan() code, into this section also. The net effects are more reliability of get_wchan()'s results and the new ability, made use of by this code, to arbitrarily place scheduling primitives in the source code without disturbing get_wchan()'s accuracy. Suggestions by Arnd Bergmann and Matthew Wilcox regarding reducing the invasiveness of the patch were incorporated during prior rounds of review. I've at least tried to sweep all arches in this patch. 
--- arch/alpha/kernel/process.c | 2 -- arch/alpha/kernel/semaphore.c | 9 ++++---- arch/alpha/kernel/vmlinux.lds.S | 1 + arch/arm/kernel/process.c | 2 -- arch/arm/kernel/semaphore.c | 8 ++++--- arch/arm/kernel/vmlinux.lds.S | 1 + arch/arm26/kernel/process.c | 2 -- arch/arm26/kernel/semaphore.c | 8 ++++--- arch/arm26/kernel/vmlinux-arm26-xip.lds.in | 1 + arch/arm26/kernel/vmlinux-arm26.lds.in | 1 + arch/cris/arch-v10/kernel/process.c | 3 +-- arch/cris/arch-v10/vmlinux.lds.S | 1 + arch/cris/kernel/semaphore.c | 5 ++-- arch/h8300/kernel/process.c | 3 --- arch/h8300/kernel/semaphore.c | 5 ++-- arch/h8300/kernel/vmlinux.lds.S | 1 + arch/i386/kernel/process.c | 2 -- arch/i386/kernel/semaphore.c | 17 +++++++------- arch/i386/kernel/vmlinux.lds.S | 1 + arch/ia64/kernel/process.c | 2 -- arch/ia64/kernel/semaphore.c | 7 +++--- arch/ia64/kernel/vmlinux.lds.S | 1 + arch/m68k/kernel/process.c | 5 ---- arch/m68k/kernel/semaphore.c | 5 ++-- arch/m68k/kernel/vmlinux-std.lds | 1 + arch/m68k/kernel/vmlinux-sun3.lds | 1 + arch/m68knommu/kernel/process.c | 5 ---- arch/m68knommu/kernel/semaphore.c | 5 ++-- arch/m68knommu/kernel/vmlinux.lds.S | 1 + arch/mips/kernel/process.c | 2 -- arch/mips/kernel/semaphore.c | 5 ++-- arch/mips/kernel/vmlinux.lds.S | 1 + arch/parisc/kernel/semaphore.c | 5 ++-- arch/parisc/kernel/vmlinux.lds.S | 1 + arch/ppc/kernel/process.c | 2 -- arch/ppc/kernel/semaphore.c | 5 ++-- arch/ppc/kernel/vmlinux.lds.S | 1 + arch/ppc64/kernel/process.c | 2 -- arch/ppc64/kernel/semaphore.c | 5 ++-- arch/ppc64/kernel/vmlinux.lds.S | 1 + arch/s390/kernel/process.c | 2 -- arch/s390/kernel/semaphore.c | 5 ++-- arch/s390/kernel/vmlinux.lds.S | 1 + arch/sh/kernel/process.c | 4 +--- arch/sh/kernel/semaphore.c | 5 ++-- arch/sh/kernel/vmlinux.lds.S | 1 + arch/sparc/kernel/process.c | 4 +--- arch/sparc/kernel/semaphore.c | 5 ++-- arch/sparc/kernel/vmlinux.lds.S | 1 + arch/sparc/lib/rwsem.S | 3 ++- arch/sparc64/kernel/process.c | 4 +--- arch/sparc64/kernel/semaphore.c | 9 ++++---- 
arch/sparc64/kernel/vmlinux.lds.S | 1 + arch/sparc64/lib/rwsem.c | 5 ++-- arch/v850/kernel/process.c | 3 --- arch/v850/kernel/semaphore.c | 5 ++-- arch/v850/kernel/vmlinux.lds.S | 1 + arch/x86_64/kernel/process.c | 2 -- arch/x86_64/kernel/semaphore.c | 5 ++-- arch/x86_64/kernel/vmlinux.lds.S | 1 + arch/x86_64/lib/thunk.S | 3 ++- include/asm-generic/vmlinux.lds.h | 5 ++++ include/linux/init.h | 2 ++ include/linux/sched.h | 2 ++ kernel/sched.c | 37 ++++++++++++++++-------------- kernel/timer.c | 4 ++-- lib/rwsem.c | 5 ++-- 67 files changed, 137 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index e427bae12ffe..297e4b48bfe2 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -513,8 +513,6 @@ thread_saved_pc(task_t *t) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/alpha/kernel/semaphore.c b/arch/alpha/kernel/semaphore.c index b52a0df303fe..4d60a0ccd6f7 100644 --- a/arch/alpha/kernel/semaphore.c +++ b/arch/alpha/kernel/semaphore.c @@ -7,6 +7,7 @@ #include #include +#include /* * This is basically the PPC semaphore scheme ported to use @@ -60,7 +61,7 @@ static inline int __sem_update_count(struct semaphore *sem, int incr) * Either form may be used in conjunction with "up()". 
*/ -void +void __sched __down_failed(struct semaphore *sem) { struct task_struct *tsk = current; @@ -101,7 +102,7 @@ __down_failed(struct semaphore *sem) #endif } -int +int __sched __down_failed_interruptible(struct semaphore *sem) { struct task_struct *tsk = current; @@ -159,7 +160,7 @@ __up_wakeup(struct semaphore *sem) wake_up(&sem->wait); } -void +void __sched down(struct semaphore *sem) { #if WAITQUEUE_DEBUG @@ -173,7 +174,7 @@ down(struct semaphore *sem) __down(sem); } -int +int __sched down_interruptible(struct semaphore *sem) { #if WAITQUEUE_DEBUG diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 7afd00d5d46b..d159b8f0d022 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -17,6 +17,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } :kernel diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 863c4076daad..8423921e821a 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -414,8 +414,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/arm/kernel/semaphore.c b/arch/arm/kernel/semaphore.c index a50902e8bec7..da39eb3dca31 100644 --- a/arch/arm/kernel/semaphore.c +++ b/arch/arm/kernel/semaphore.c @@ -13,6 +13,7 @@ */ #include #include +#include #include @@ -54,7 +55,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -87,7 +88,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -176,7 +177,8 @@ int __down_trylock(struct semaphore * sem) * registers (r0 to r3 and lr), but not ip, as we use it as a return * value in some cases.. */ -asm(" .align 5 \n\ +asm(" .section .sched.text \n\ + .align 5 \n\ .globl __down_failed \n\ __down_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 56af3401b34d..a5db0ddca6a4 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -73,6 +73,7 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) *(.rodata) diff --git a/arch/arm26/kernel/process.c b/arch/arm26/kernel/process.c index 09a2f52ad8a8..ce23571617a1 100644 --- a/arch/arm26/kernel/process.c +++ b/arch/arm26/kernel/process.c @@ -400,8 +400,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/arm26/kernel/semaphore.c b/arch/arm26/kernel/semaphore.c index e7964ce1d0d9..60591a738592 100644 --- a/arch/arm26/kernel/semaphore.c +++ b/arch/arm26/kernel/semaphore.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -56,7 +57,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +90,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -178,7 +179,8 @@ int __down_trylock(struct semaphore * sem) * registers (r0 to r3 and lr), but not ip, as we use it as a return * value in some cases.. 
*/ -asm(" .align 5 \n\ +asm(" .section .sched.text \n\ + .align 5 \n\ .globl __down_failed \n\ __down_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ diff --git a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in index 602a77c022d7..61eedf0bc42f 100644 --- a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in +++ b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in @@ -66,6 +66,7 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) *(.rodata) diff --git a/arch/arm26/kernel/vmlinux-arm26.lds.in b/arch/arm26/kernel/vmlinux-arm26.lds.in index 8782fe36f0a8..2393f3805a49 100644 --- a/arch/arm26/kernel/vmlinux-arm26.lds.in +++ b/arch/arm26/kernel/vmlinux-arm26.lds.in @@ -67,6 +67,7 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) *(.rodata) diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 62e3a4fbf33a..c785b54e6cbd 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_ETRAX_GPIO void etrax_gpio_wake_up_check(void); /* drivers/gpio.c */ @@ -216,8 +217,6 @@ asmlinkage int sys_execve(const char *fname, char **argv, char **envp, * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/cris/arch-v10/vmlinux.lds.S b/arch/cris/arch-v10/vmlinux.lds.S index b2c27e147f29..6b73a2c0dad8 100644 --- a/arch/cris/arch-v10/vmlinux.lds.S +++ b/arch/cris/arch-v10/vmlinux.lds.S @@ -25,6 +25,7 @@ SECTIONS __stext = .; .text : { *(.text) + SCHED_TEXT *(.fixup) *(.text.__*) } diff --git a/arch/cris/kernel/semaphore.c b/arch/cris/kernel/semaphore.c index d62b355e1706..b884263d3cd4 100644 --- a/arch/cris/kernel/semaphore.c +++ b/arch/cris/kernel/semaphore.c @@ -4,6 +4,7 @@ */ #include +#include #include /* @@ -94,7 +95,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -104,7 +105,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int ret = 0; DOWN_VAR diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index bd6ccd542399..8640ea20dba0 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -264,8 +264,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -289,7 +287,6 @@ unsigned long get_wchan(struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
*/ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; diff --git a/arch/h8300/kernel/semaphore.c b/arch/h8300/kernel/semaphore.c index 690efce1e437..1ebb79baaa8c 100644 --- a/arch/h8300/kernel/semaphore.c +++ b/arch/h8300/kernel/semaphore.c @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -95,7 +96,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -106,7 +107,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index 60787f07eb2b..3a643954a8fe 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -82,6 +82,7 @@ SECTIONS #endif __stext = . ; *(.text) + SCHED_TEXT . = ALIGN(0x4) ; *(.exit.text) *(.text.*) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 3495f1aedf67..7fed9d3823ed 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -632,8 +632,6 @@ out: /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) #define top_esp (THREAD_SIZE - sizeof(unsigned long)) diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c index 5acd544f0cbd..073912cfcf44 100644 --- a/arch/i386/kernel/semaphore.c +++ b/arch/i386/kernel/semaphore.c @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -53,7 +54,7 @@ asmlinkage void __up(struct semaphore *sem) wake_up(&sem->wait); } -asmlinkage void __down(struct semaphore * sem) +asmlinkage void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -90,7 +91,7 @@ asmlinkage void __down(struct semaphore * sem) tsk->state = TASK_RUNNING; } -asmlinkage int __down_interruptible(struct semaphore * sem) +asmlinkage int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -187,7 +188,7 @@ asmlinkage int __down_trylock(struct semaphore * sem) * value.. 
*/ asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed\n" "__down_failed:\n\t" @@ -210,7 +211,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed_interruptible\n" "__down_failed_interruptible:\n\t" @@ -231,7 +232,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed_trylock\n" "__down_failed_trylock:\n\t" @@ -252,7 +253,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __up_wakeup\n" "__up_wakeup:\n\t" @@ -271,7 +272,7 @@ asm( */ #if defined(CONFIG_SMP) asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __write_lock_failed\n" "__write_lock_failed:\n\t" @@ -285,7 +286,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __read_lock_failed\n" "__read_lock_failed:\n\t" diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 3623d7e2934a..0253c586547b 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -16,6 +16,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x9090 diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index a1d09d5c91c4..0d245cbcd1f6 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -660,8 +660,6 @@ get_wchan (struct task_struct *p) /* * These bracket the sleeping functions.. */ - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); # define first_sched ((unsigned long) scheduling_functions_start_here) # define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c index f3926a3c4d73..2724ef3fbae2 100644 --- a/arch/ia64/kernel/semaphore.c +++ b/arch/ia64/kernel/semaphore.c @@ -24,6 +24,7 @@ * where we want to avoid any extra jumps and calls. 
*/ #include +#include #include #include @@ -44,8 +45,7 @@ __up (struct semaphore *sem) wake_up(&sem->wait); } -void -__down (struct semaphore *sem) +void __sched __down (struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,8 +82,7 @@ __down (struct semaphore *sem) tsk->state = TASK_RUNNING; } -int -__down_interruptible (struct semaphore * sem) +int __sched __down_interruptible (struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index e5589e49d9da..5c45718a9c82 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -41,6 +41,7 @@ SECTIONS { *(.text.ivt) *(.text) + SCHED_TEXT *(.gnu.linkonce.t*) } .text2 : AT(ADDR(.text2) - LOAD_OFFSET) diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 8d72a5c5b0c7..fc2c753c332b 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -65,8 +65,6 @@ asmlinkage void ret_from_fork(void); */ unsigned long thread_saved_pc(struct task_struct *tsk) { - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; /* Check whether the thread is blocked in resume() */ if (sw->retpc > (unsigned long)scheduling_functions_start_here && @@ -387,8 +385,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -407,7 +403,6 @@ unsigned long get_wchan(struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
*/ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; diff --git a/arch/m68k/kernel/semaphore.c b/arch/m68k/kernel/semaphore.c index 690efce1e437..1ebb79baaa8c 100644 --- a/arch/m68k/kernel/semaphore.c +++ b/arch/m68k/kernel/semaphore.c @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -95,7 +96,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -106,7 +107,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index bd41fc992169..6dc62684c7b9 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -12,6 +12,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x4e75 diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 2e81cde14987..f293e567192c 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -13,6 +13,7 @@ SECTIONS .text : { *(.head) *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x4e75 diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index c8b87371641a..896d596a1bd8 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -406,8 +406,6 @@ out: /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -426,7 +424,6 @@ unsigned long get_wchan(struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. */ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; @@ -439,8 +436,6 @@ unsigned long get_wchan(struct task_struct *p) */ unsigned long thread_saved_pc(struct task_struct *tsk) { - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; /* Check whether the thread is blocked in resume() */ diff --git a/arch/m68knommu/kernel/semaphore.c b/arch/m68knommu/kernel/semaphore.c index 33d704fcf883..c083f4772add 100644 --- a/arch/m68knommu/kernel/semaphore.c +++ b/arch/m68knommu/kernel/semaphore.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -96,7 +97,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -107,7 +108,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index 1ab8a31ef964..a362870b6e4e 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -191,6 +191,7 @@ SECTIONS { .text : { _stext = . ; *(.text) + SCHED_TEXT *(.text.lock) . 
= ALIGN(16); /* Exception table */ diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index f8ba26770bf4..f4ab9c66b27f 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -283,8 +283,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/mips/kernel/semaphore.c b/arch/mips/kernel/semaphore.c index 11b937f20604..51c3e772c029 100644 --- a/arch/mips/kernel/semaphore.c +++ b/arch/mips/kernel/semaphore.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifdef CONFIG_CPU_HAS_LLDSCD @@ -104,7 +105,7 @@ static inline int waking_non_zero(struct semaphore *sem) * Either form may be used in conjunction with "up()". */ -void __down_failed(struct semaphore * sem) +void __sched __down_failed(struct semaphore * sem) { struct task_struct *tsk = current; wait_queue_t wait; @@ -227,7 +228,7 @@ static inline int waking_non_zero_interruptible(struct semaphore *sem, #endif /* !CONFIG_CPU_HAS_LLDSCD */ -int __down_failed_interruptible(struct semaphore * sem) +int __sched __down_failed_interruptible(struct semaphore * sem) { struct task_struct *tsk = current; wait_queue_t wait; diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index b72639f8db65..098cfaa23c0e 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -28,6 +28,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } =0 diff --git a/arch/parisc/kernel/semaphore.c b/arch/parisc/kernel/semaphore.c index ffb4851451fc..ee806bcc3726 100644 --- a/arch/parisc/kernel/semaphore.c +++ b/arch/parisc/kernel/semaphore.c @@ -5,6 +5,7 @@ #include #include #include 
+#include /* * Semaphores are complex as we wish to avoid using two variables. @@ -58,7 +59,7 @@ void __up(struct semaphore *sem) sem->count += (sem->count < 0) ? 1 : - 1; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DOWN_HEAD @@ -74,7 +75,7 @@ void __down(struct semaphore * sem) UPDATE_COUNT } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DOWN_HEAD diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 14d0882a19d2..e5d5aeef96e5 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -50,6 +50,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text ALIGN(16) : { *(.text*) + SCHED_TEXT *(.PARISC.unwind) *(.fixup) *(.lock.text) /* out-of-line lock text */ diff --git a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c index ada32baeda19..3363a030e00f 100644 --- a/arch/ppc/kernel/process.c +++ b/arch/ppc/kernel/process.c @@ -661,8 +661,6 @@ void __init ll_puts(const char *s) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/ppc/kernel/semaphore.c b/arch/ppc/kernel/semaphore.c index 7bf51fba5c14..2fe429b27c14 100644 --- a/arch/ppc/kernel/semaphore.c +++ b/arch/ppc/kernel/semaphore.c @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -69,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __down(struct semaphore *sem) +void __sched __down(struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -99,7 +100,7 @@ void __down(struct semaphore *sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S index 81b95d449a22..b710d55c5b08 100644 --- a/arch/ppc/kernel/vmlinux.lds.S +++ b/arch/ppc/kernel/vmlinux.lds.S @@ -31,6 +31,7 @@ SECTIONS .text : { *(.text) + SCHED_TEXT *(.fixup) *(.got1) __got2_start = .; diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c index cec7225a6ac1..f74b14d7e58e 100644 --- a/arch/ppc64/kernel/process.c +++ b/arch/ppc64/kernel/process.c @@ -475,8 +475,6 @@ static inline int validate_sp(unsigned long sp, struct task_struct *p) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched (*(unsigned long *)scheduling_functions_start_here) #define last_sched (*(unsigned long *)scheduling_functions_end_here) diff --git a/arch/ppc64/kernel/semaphore.c b/arch/ppc64/kernel/semaphore.c index c977029e2465..d723632d59f3 100644 --- a/arch/ppc64/kernel/semaphore.c +++ b/arch/ppc64/kernel/semaphore.c @@ -17,6 +17,7 @@ */ #include +#include #include #include #include @@ -70,7 +71,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __down(struct semaphore *sem) +void __sched __down(struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -99,7 +100,7 @@ void __down(struct semaphore *sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/ppc64/kernel/vmlinux.lds.S b/arch/ppc64/kernel/vmlinux.lds.S index a8531b1f9ef2..1d9b61143aaa 100644 --- a/arch/ppc64/kernel/vmlinux.lds.S +++ b/arch/ppc64/kernel/vmlinux.lds.S @@ -13,6 +13,7 @@ SECTIONS /* Read-only sections, merged into text segment: */ .text : { *(.text .text.*) + SCHED_TEXT *(.fixup) . = ALIGN(4096); _etext = .; diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 3676307d1d8a..050585ab5d2a 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -384,8 +384,6 @@ void dump_thread(struct pt_regs * regs, struct user * dump) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/s390/kernel/semaphore.c b/arch/s390/kernel/semaphore.c index 8203f5e0228d..8dfb690c159f 100644 --- a/arch/s390/kernel/semaphore.c +++ b/arch/s390/kernel/semaphore.c @@ -11,6 +11,7 @@ */ #include #include +#include #include @@ -60,7 +61,7 @@ void __up(struct semaphore *sem) * count > 0: decrement count, wake up queue and exit. * count <= 0: set count to -1, go to sleep. */ -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,7 +83,7 @@ void __down(struct semaphore * sem) * count > 0: wake up queue and exit. * count <= 0: set count to 0, wake up queue and exit. 
*/ -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index c9ca7a8e93b3..b4534b2867c3 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -23,6 +23,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x0700 diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index 773006661b50..7d45ea0acd09 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -464,8 +464,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -481,7 +479,7 @@ unsigned long get_wchan(struct task_struct *p) * The same comment as on the Alpha applies here, too ... 
*/ pc = thread_saved_pc(p); - if (pc >= (unsigned long) interruptible_sleep_on && pc < (unsigned long) add_timer) { + if (pc >= first_sched && pc < last_sched) { schedule_frame = ((unsigned long *)(long)p->thread.sp)[1]; return (unsigned long)((unsigned long *)schedule_frame)[1]; } diff --git a/arch/sh/kernel/semaphore.c b/arch/sh/kernel/semaphore.c index 0943ad666a67..a3c24dcbf01d 100644 --- a/arch/sh/kernel/semaphore.c +++ b/arch/sh/kernel/semaphore.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -103,7 +104,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -113,7 +114,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int ret = 0; DOWN_VAR diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 2cc86534c130..da0f5d728b3e 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -22,6 +22,7 @@ SECTIONS } = 0 .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x0009 diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index beae70a970e4..70261b211997 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -694,9 +695,6 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) return retval; } -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); - unsigned long get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; diff --git a/arch/sparc/kernel/semaphore.c b/arch/sparc/kernel/semaphore.c index 5a8f3d176a8f..77e63b92ca30 100644 --- a/arch/sparc/kernel/semaphore.c +++ 
b/arch/sparc/kernel/semaphore.c @@ -4,6 +4,7 @@ #include #include +#include #include @@ -45,7 +46,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -78,7 +79,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index 0862360d865d..8d4bbfaf304c 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -12,6 +12,7 @@ SECTIONS .text 0xf0004000 : { *(.text) + SCHED_TEXT *(.gnu.warning) } =0 _etext = .; diff --git a/arch/sparc/lib/rwsem.S b/arch/sparc/lib/rwsem.S index 98b757cb67c6..e7578dc600b8 100644 --- a/arch/sparc/lib/rwsem.S +++ b/arch/sparc/lib/rwsem.S @@ -8,7 +8,7 @@ #include #include - .text + .section .sched.text .align 4 .globl ___down_read @@ -113,6 +113,7 @@ ___down_write: ba 2b restore %l5, %g0, %g5 + .text .globl ___up_read ___up_read: rd %psr, %g3 diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index 1be2b97e4672..0caf962e8155 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -823,9 +824,6 @@ out: return error; } -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); - unsigned long get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c index a9e66d666ceb..9ddfcb9a1900 100644 --- a/arch/sparc64/kernel/semaphore.c +++ b/arch/sparc64/kernel/semaphore.c @@ -8,6 +8,7 @@ #include #include +#include /* * Atomically update 
sem->count. @@ -90,7 +91,7 @@ void up(struct semaphore *sem) : "g5", "g7", "memory", "cc"); } -static void __down(struct semaphore * sem) +static void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -108,7 +109,7 @@ static void __down(struct semaphore * sem) wake_up(&sem->wait); } -void down(struct semaphore *sem) +void __sched down(struct semaphore *sem) { might_sleep(); /* This atomically does: @@ -192,7 +193,7 @@ int down_trylock(struct semaphore *sem) return ret; } -static int __down_interruptible(struct semaphore * sem) +static int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -216,7 +217,7 @@ static int __down_interruptible(struct semaphore * sem) return retval; } -int down_interruptible(struct semaphore *sem) +int __sched down_interruptible(struct semaphore *sem) { int ret = 0; diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S index ad95e88a3cbc..8faeee09fab2 100644 --- a/arch/sparc64/kernel/vmlinux.lds.S +++ b/arch/sparc64/kernel/vmlinux.lds.S @@ -15,6 +15,7 @@ SECTIONS .text 0x0000000000404000 : { *(.text) + SCHED_TEXT *(.gnu.warning) } =0 _etext = .; diff --git a/arch/sparc64/lib/rwsem.c b/arch/sparc64/lib/rwsem.c index 8e1dfdda91fa..e19968dbc2d1 100644 --- a/arch/sparc64/lib/rwsem.c +++ b/arch/sparc64/lib/rwsem.c @@ -6,6 +6,7 @@ #include #include +#include #include extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem)); @@ -13,7 +14,7 @@ extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *)); extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *)); -void __down_read(struct rw_semaphore *sem) +void __sched __down_read(struct rw_semaphore *sem) { __asm__ __volatile__( "! 
beginning __down_read\n" @@ -72,7 +73,7 @@ int __down_read_trylock(struct rw_semaphore *sem) } EXPORT_SYMBOL(__down_read_trylock); -void __down_write(struct rw_semaphore *sem) +void __sched __down_write(struct rw_semaphore *sem) { __asm__ __volatile__( "! beginning __down_write\n\t" diff --git a/arch/v850/kernel/process.c b/arch/v850/kernel/process.c index 5c29ae51a303..977d75772d81 100644 --- a/arch/v850/kernel/process.c +++ b/arch/v850/kernel/process.c @@ -203,8 +203,6 @@ int sys_execve (char *name, char **argv, char **envp, struct pt_regs *regs) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here (void); -extern void scheduling_functions_end_here (void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -228,7 +226,6 @@ unsigned long get_wchan (struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
*/ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; diff --git a/arch/v850/kernel/semaphore.c b/arch/v850/kernel/semaphore.c index b78d714384db..2d20886863d8 100644 --- a/arch/v850/kernel/semaphore.c +++ b/arch/v850/kernel/semaphore.c @@ -15,6 +15,7 @@ #include #include +#include #include @@ -56,7 +57,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +90,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S index 028c224fa66a..07ab0f292d1c 100644 --- a/arch/v850/kernel/vmlinux.lds.S +++ b/arch/v850/kernel/vmlinux.lds.S @@ -64,6 +64,7 @@ #define TEXT_CONTENTS \ __stext = . ; \ *(.text) \ + SCHED_TEXT \ *(.exit.text) /* 2.5 convention */ \ *(.text.exit) /* 2.4 convention */ \ *(.text.lock) \ diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 7b2414765ca3..d1d9471581a8 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -576,8 +576,6 @@ asmlinkage long sys_vfork(struct pt_regs regs) /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c index 5e517814dd07..2bcd4a7ec38d 100644 --- a/arch/x86_64/kernel/semaphore.c +++ b/arch/x86_64/kernel/semaphore.c @@ -14,6 +14,7 @@ */ #include #include +#include #include #include @@ -54,7 +55,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -91,7 +92,7 @@ void __down(struct semaphore * sem) tsk->state = TASK_RUNNING; } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 7b9e1beb360e..c612e4d213a1 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -15,6 +15,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x9090 diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index 876cb937f9f1..acc1e2ca7ed7 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -35,6 +35,7 @@ .endm + .section .sched.text #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed @@ -65,7 +66,7 @@ restore_norax: #ifdef CONFIG_SMP /* Support for read/write spinlocks. 
*/ - + .text /* rax: pointer to rwlock_t */ ENTRY(__write_lock_failed) lock diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 59c2b950e8b8..a4b6c768cf49 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -51,3 +51,8 @@ *(.security_initcall.init) \ __security_initcall_end = .; \ } + +#define SCHED_TEXT \ + __scheduling_functions_start_here = .; \ + *(.sched.text) \ + __scheduling_functions_end_here = .; diff --git a/include/linux/init.h b/include/linux/init.h index 45069e275b3d..c6842477243c 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -46,6 +46,8 @@ #define __exitdata __attribute__ ((__section__(".exit.data"))) #define __exit_call __attribute_used__ __attribute__ ((__section__ (".exitcall.exit"))) +#define __sched __attribute__((__section__(".sched.text"))) + #ifdef MODULE #define __exit __attribute__ ((__section__(".exit.text"))) #else diff --git a/include/linux/sched.h b/include/linux/sched.h index f5fa0c07a7f8..054b3c0d5962 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -170,6 +170,8 @@ extern void update_one_process(struct task_struct *p, unsigned long user, unsigned long system, int cpu); extern void scheduler_tick(int user_tick, int system); extern unsigned long cache_decay_ticks; +extern const unsigned long scheduling_functions_start_here; +extern const unsigned long scheduling_functions_end_here; #define MAX_SCHEDULE_TIMEOUT LONG_MAX diff --git a/kernel/sched.c b/kernel/sched.c index 9e19d4c0d4a9..b42029abe679 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -225,6 +225,13 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +extern unsigned long __scheduling_functions_start_here; +extern unsigned long __scheduling_functions_end_here; +const unsigned long scheduling_functions_start_here = + (unsigned long)&__scheduling_functions_start_here; +const unsigned 
long scheduling_functions_end_here = + (unsigned long)&__scheduling_functions_end_here; + /* * Default context-switch locking: */ @@ -1587,12 +1594,10 @@ out: rebalance_tick(rq, 0); } -void scheduling_functions_start_here(void) { } - /* * schedule() is the main scheduler function. */ -asmlinkage void schedule(void) +asmlinkage void __sched schedule(void) { long *switch_count; task_t *prev, *next; @@ -1731,7 +1736,7 @@ EXPORT_SYMBOL(schedule); * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void preempt_schedule(void) +asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); @@ -1869,7 +1874,7 @@ void fastcall complete_all(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } -void fastcall wait_for_completion(struct completion *x) +void fastcall __sched wait_for_completion(struct completion *x) { might_sleep(); spin_lock_irq(&x->wait.lock); @@ -1907,7 +1912,7 @@ EXPORT_SYMBOL(wait_for_completion); __remove_wait_queue(q, &wait); \ spin_unlock_irqrestore(&q->lock, flags); -void fastcall interruptible_sleep_on(wait_queue_head_t *q) +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) { SLEEP_ON_VAR @@ -1920,7 +1925,7 @@ void fastcall interruptible_sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -1935,7 +1940,7 @@ long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) EXPORT_SYMBOL(interruptible_sleep_on_timeout); -void fastcall sleep_on(wait_queue_head_t *q) +void fastcall __sched sleep_on(wait_queue_head_t *q) { SLEEP_ON_VAR @@ -1948,7 +1953,7 @@ void fastcall sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(sleep_on); -long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout) 
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -1963,8 +1968,6 @@ long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout) EXPORT_SYMBOL(sleep_on_timeout); -void scheduling_functions_end_here(void) { } - void set_user_nice(task_t *p, long nice) { unsigned long flags; @@ -2424,7 +2427,7 @@ asmlinkage long sys_sched_yield(void) return 0; } -void __cond_resched(void) +void __sched __cond_resched(void) { set_current_state(TASK_RUNNING); schedule(); @@ -2438,7 +2441,7 @@ EXPORT_SYMBOL(__cond_resched); * this is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ -void yield(void) +void __sched yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); @@ -2453,7 +2456,7 @@ EXPORT_SYMBOL(yield); * But don't do that if it is a deliberate, throttling IO wait (this task * has set its backing_dev_info: the queue against which it should throttle) */ -void io_schedule(void) +void __sched io_schedule(void) { struct runqueue *rq = this_rq(); @@ -2464,7 +2467,7 @@ void io_schedule(void) EXPORT_SYMBOL(io_schedule); -long io_schedule_timeout(long timeout) +long __sched io_schedule_timeout(long timeout) { struct runqueue *rq = this_rq(); long ret; @@ -3010,7 +3013,7 @@ EXPORT_SYMBOL(__might_sleep); * * Called inside preempt_disable(). 
*/ -void __preempt_spin_lock(spinlock_t *lock) +void __sched __preempt_spin_lock(spinlock_t *lock) { if (preempt_count() > 1) { _raw_spin_lock(lock); @@ -3026,7 +3029,7 @@ void __preempt_spin_lock(spinlock_t *lock) EXPORT_SYMBOL(__preempt_spin_lock); -void __preempt_write_lock(rwlock_t *lock) +void __sched __preempt_write_lock(rwlock_t *lock) { if (preempt_count() > 1) { _raw_write_lock(lock); diff --git a/kernel/timer.c b/kernel/timer.c index f53e0749b0d2..cbcb5522866d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -996,7 +996,7 @@ static void process_timeout(unsigned long __data) * * In all cases the return value is guaranteed to be non-negative. */ -fastcall signed long schedule_timeout(signed long timeout) +fastcall signed long __sched schedule_timeout(signed long timeout) { struct timer_list timer; unsigned long expire; @@ -1056,7 +1056,7 @@ asmlinkage long sys_gettid(void) return current->pid; } -static long nanosleep_restart(struct restart_block *restart) +static long __sched nanosleep_restart(struct restart_block *restart) { unsigned long expire = restart->arg0, now = jiffies; struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; diff --git a/lib/rwsem.c b/lib/rwsem.c index 95469d7fb796..85dcae7e9337 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -5,6 +5,7 @@ */ #include #include +#include #include struct rwsem_waiter { @@ -162,7 +163,7 @@ static inline struct rw_semaphore *rwsem_down_failed_common(struct rw_semaphore /* * wait for the read lock to be granted */ -struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem) +struct rw_semaphore fastcall __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; @@ -178,7 +179,7 @@ struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem) /* * wait for the write lock to be granted */ -struct rw_semaphore fastcall *rwsem_down_write_failed(struct rw_semaphore *sem) +struct rw_semaphore fastcall __sched 
*rwsem_down_write_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; -- cgit v1.2.3 From ed678f13aec6fdd86c952b05200f741aa473dba8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:51:16 -0700 Subject: [PATCH] Quota locking fixes From: Jan Kara Change locking rules in quota code to fix lock ordering especially wrt journal lock. Also some unnecessary spinlocking is removed. The locking changes are mainly: dqptr_sem, dqio_sem are acquired only when transaction is already started, dqonoff_sem before a transaction is started. This change requires some callbacks to ext3 (also implemented in this patch) to start transaction before the locks are acquired. --- fs/Kconfig | 6 +- fs/dquot.c | 204 ++++++++++++++++++++++++++--------------------- fs/ext3/super.c | 51 +++++++++--- fs/inode.c | 16 ++-- include/linux/quotaops.h | 15 +--- 5 files changed, 165 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/fs/Kconfig b/fs/Kconfig index ef8e47fb1c39..c748a2ce35ee 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -417,7 +417,7 @@ config QFMT_V1 tristate "Old quota format support" depends on QUOTA help - This quota format was (is) used by kernels earlier than 2.4.??. If + This quota format was (is) used by kernels earlier than 2.4.22. If you have quota working and you don't want to convert to new quota format say Y here. @@ -426,8 +426,8 @@ config QFMT_V2 depends on QUOTA help This quota format allows using quotas with 32-bit UIDs/GIDs. If you - need this functionality say Y here. Note that you will need latest - quota utilities for new quota format with this kernel. + need this functionality say Y here. Note that you will need recent + quota utilities (>= 3.01) for new quota format with this kernel. 
config QUOTACTL bool diff --git a/fs/dquot.c b/fs/dquot.c index b7b9b5c44277..e6b39e66207a 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -85,12 +85,31 @@ * and quota formats and also dqstats structure containing statistics about the * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. - * Note that we don't have to do the locking of i_blocks and i_bytes when the - * quota is disabled - i_sem should serialize the access. dq_data_lock should - * be always grabbed before dq_list_lock. + * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly + * in inode_add_bytes() and inode_sub_bytes(). + * + * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock * * Note that some things (eg. sb pointer, type, id) doesn't change during * the life of the dquot structure and so needn't to be protected by a lock + * + * Any operation working on dquots via inode pointers must hold dqptr_sem. If + * operation is just reading pointers from inode (or not using them at all) the + * read lock is enough. If pointers are altered function must hold write lock. + * If operation is holding reference to dquot in other way (e.g. quotactl ops) + * it must be guarded by dqonoff_sem. + * This locking assures that: + * a) update/access to dquot pointers in inode is serialized + * b) everyone is guarded against invalidate_dquots() + * + * Each dquot has its dq_lock semaphore. Locked dquots might not be referenced + * from inodes (dquot_alloc_space() and such don't check the dq_lock). + * Currently dquot is locked only when it is being read to memory on the first + * dqget(). Write operations on dquots don't hold dq_lock as they copy data + * under dq_data_lock spinlock to internal buffers before writing. 
+ * + * Lock ordering (including journal_lock) is following: + * dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > dqio_sem */ spinlock_t dq_list_lock = SPIN_LOCK_UNLOCKED; spinlock_t dq_data_lock = SPIN_LOCK_UNLOCKED; @@ -169,23 +188,6 @@ static void put_quota_format(struct quota_format_type *fmt) * mechanism to locate a specific dquot. */ -/* - * Note that any operation which operates on dquot data (ie. dq_dqb) must - * hold dq_data_lock. - * - * Any operation working with dquots must hold dqptr_sem. If operation is - * just reading pointers from inodes than read lock is enough. If pointers - * are altered function must hold write lock. - * - * Locked dquots might not be referenced in inodes. Currently dquot it locked - * only once in its existence - when it's being read to memory on first dqget() - * and at that time it can't be referenced from inode. Write operations on - * dquots don't hold dquot lock as they copy data to internal buffers before - * writing anyway and copying as well as any data update should be atomic. Also - * nobody can change used entries in dquot structure as this is done only when - * quota is destroyed and invalidate_dquots() is called only when dq_count == 0. - */ - static LIST_HEAD(inuse_list); static LIST_HEAD(free_dquots); static struct list_head dquot_hash[NR_DQHASH]; @@ -286,9 +288,9 @@ static int commit_dqblk(struct dquot *dquot) } /* Invalidate all dquots on the list. Note that this function is called after - * quota is disabled so no new quota might be created. Because we hold dqptr_sem - * for writing and pointers were already removed from inodes we actually know that - * no quota for this sb+type should be held. */ + * quota is disabled so no new quota might be created. Because we hold + * dqonoff_sem and pointers were already removed from inodes we actually know + * that no quota for this sb+type should be held. 
*/ static void invalidate_dquots(struct super_block *sb, int type) { struct dquot *dquot; @@ -302,12 +304,11 @@ static void invalidate_dquots(struct super_block *sb, int type) continue; if (dquot->dq_type != type) continue; -#ifdef __DQUOT_PARANOIA - /* There should be no users of quota - we hold dqptr_sem for writing */ +#ifdef __DQUOT_PARANOIA if (atomic_read(&dquot->dq_count)) BUG(); #endif - /* Quota now have no users and it has been written on last dqput() */ + /* Quota now has no users and it has been written on last dqput() */ remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); @@ -323,7 +324,7 @@ static int vfs_quota_sync(struct super_block *sb, int type) struct quota_info *dqopt = sb_dqopt(sb); int cnt; - down_read(&dqopt->dqptr_sem); + down(&dqopt->dqonoff_sem); restart: /* At this point any dirty dquot will definitely be written so we can clear dirty flag from info */ @@ -359,7 +360,7 @@ restart: spin_lock(&dq_list_lock); dqstats.syncs++; spin_unlock(&dq_list_lock); - up_read(&dqopt->dqptr_sem); + up(&dqopt->dqonoff_sem); return 0; } @@ -402,7 +403,7 @@ static int shrink_dqcache_memory(int nr, unsigned int gfp_mask) /* * Put reference to dquot * NOTE: If you change this function please check whether dqput_blocks() works right... 
- * MUST be called with dqptr_sem held + * MUST be called with either dqptr_sem or dqonoff_sem held */ static void dqput(struct dquot *dquot) { @@ -467,7 +468,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) /* * Get reference to dquot - * MUST be called with dqptr_sem held + * MUST be called with either dqptr_sem or dqonoff_sem held */ static struct dquot *dqget(struct super_block *sb, unsigned int id, int type) { @@ -528,7 +529,7 @@ static int dqinit_needed(struct inode *inode, int type) return 0; } -/* This routine is guarded by dqptr_sem semaphore */ +/* This routine is guarded by dqonoff_sem semaphore */ static void add_dquot_ref(struct super_block *sb, int type) { struct list_head *p; @@ -594,7 +595,7 @@ put_it: /* Free list of dquots - called from inode.c */ /* dquots are removed from inodes, no new references can be got so we are the only ones holding reference */ -void put_dquot_list(struct list_head *tofree_head) +static void put_dquot_list(struct list_head *tofree_head) { struct list_head *act_head; struct dquot *dquot; @@ -609,6 +610,20 @@ void put_dquot_list(struct list_head *tofree_head) } } +/* Function in inode.c - remove pointers to dquots in icache */ +extern void remove_dquot_ref(struct super_block *, int, struct list_head *); + +/* Gather all references from inodes and drop them */ +static void drop_dquot_ref(struct super_block *sb, int type) +{ + LIST_HEAD(tofree_head); + + down_write(&sb_dqopt(sb)->dqptr_sem); + remove_dquot_ref(sb, type, &tofree_head); + up_write(&sb_dqopt(sb)->dqptr_sem); + put_dquot_list(&tofree_head); +} + static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) { dquot->dq_dqb.dqb_curinodes += number; @@ -804,6 +819,9 @@ void dquot_initialize(struct inode *inode, int type) unsigned int id = 0; int cnt; + /* Solve deadlock when we recurse when holding dqptr_sem... 
*/ + if (IS_NOQUOTA(inode)) + return; down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Having dqptr_sem we know NOQUOTA flags can't be altered... */ if (IS_NOQUOTA(inode)) { @@ -831,50 +849,23 @@ void dquot_initialize(struct inode *inode, int type) up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); } -/* - * Remove references to quota from inode - * This function needs dqptr_sem for writing - */ -static void dquot_drop_iupdate(struct inode *inode, struct dquot **to_drop) -{ - int cnt; - - inode->i_flags &= ~S_QUOTA; - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - to_drop[cnt] = inode->i_dquot[cnt]; - inode->i_dquot[cnt] = NODQUOT; - } -} - /* * Release all quotas referenced by inode + * Transaction must be started at an entry */ void dquot_drop(struct inode *inode) { - struct dquot *to_drop[MAXQUOTAS]; int cnt; down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - dquot_drop_iupdate(inode, to_drop); + inode->i_flags &= ~S_QUOTA; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt] != NODQUOT) { + dqput(inode->i_dquot[cnt]); + inode->i_dquot[cnt] = NODQUOT; + } + } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (to_drop[cnt] != NODQUOT) - dqput(to_drop[cnt]); -} - -/* - * Release all quotas referenced by inode. - * This function assumes dqptr_sem for writing - */ -void dquot_drop_nolock(struct inode *inode) -{ - struct dquot *to_drop[MAXQUOTAS]; - int cnt; - - dquot_drop_iupdate(inode, to_drop); - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (to_drop[cnt] != NODQUOT) - dqput(to_drop[cnt]); } /* @@ -885,11 +876,17 @@ int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) int cnt, ret = NO_QUOTA; char warntype[MAXQUOTAS]; + /* Solve deadlock when we recurse when holding dqptr_sem... 
*/ + if (IS_NOQUOTA(inode)) { + inode_add_bytes(inode, number); + return QUOTA_OK; + } for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); + /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) goto add_bytes; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -921,9 +918,13 @@ int dquot_alloc_inode(const struct inode *inode, unsigned long number) int cnt, ret = NO_QUOTA; char warntype[MAXQUOTAS]; + /* Solve deadlock when we recurse when holding dqptr_sem... */ + if (IS_NOQUOTA(inode)) + return QUOTA_OK; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; @@ -956,8 +957,14 @@ void dquot_free_space(struct inode *inode, qsize_t number) { unsigned int cnt; + /* Solve deadlock when we recurse when holding dqptr_sem... */ + if (IS_NOQUOTA(inode)) { + inode_sub_bytes(inode, number); + return; + } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); + /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) goto sub_bytes; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -978,7 +985,11 @@ void dquot_free_inode(const struct inode *inode, unsigned long number) { unsigned int cnt; + /* Solve deadlock when we recurse when holding dqptr_sem... */ + if (IS_NOQUOTA(inode)) + return; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return; @@ -1007,14 +1018,20 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; char warntype[MAXQUOTAS]; + /* Solve deadlock when we recurse when holding dqptr_sem... 
*/ + if (IS_NOQUOTA(inode)) + return QUOTA_OK; /* Clear the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { transfer_to[cnt] = transfer_from[cnt] = NODQUOT; warntype[cnt] = NOWARN; } + down(&sb_dqopt(inode->i_sb)->dqonoff_sem); down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + up(&sb_dqopt(inode->i_sb)->dqonoff_sem); return QUOTA_OK; } /* First build the transfer_to list - here we can block on reading of dquots... */ @@ -1065,6 +1082,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) ret = QUOTA_OK; warn_put_all: spin_unlock(&dq_data_lock); + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); flush_warnings(transfer_to, warntype); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1073,7 +1091,7 @@ warn_put_all: if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT) dqput(transfer_to[cnt]); } - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + up(&sb_dqopt(inode->i_sb)->dqonoff_sem); return ret; } @@ -1121,9 +1139,6 @@ static inline void reset_enable_flags(struct quota_info *dqopt, int type) } } -/* Function in inode.c - remove pointers to dquots in icache */ -extern void remove_dquot_ref(struct super_block *, int); - /* * Turn quota off on a device. 
type == -1 ==> quotaoff for all types (umount) */ @@ -1137,7 +1152,6 @@ int vfs_quota_off(struct super_block *sb, int type) /* We need to serialize quota_off() for device */ down(&dqopt->dqonoff_sem); - down_write(&dqopt->dqptr_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; @@ -1146,7 +1160,7 @@ int vfs_quota_off(struct super_block *sb, int type) reset_enable_flags(dqopt, cnt); /* Note: these are blocking operations */ - remove_dquot_ref(sb, cnt); + drop_dquot_ref(sb, cnt); invalidate_dquots(sb, cnt); /* * Now all dquots should be invalidated, all writes done so we should be only @@ -1168,7 +1182,6 @@ int vfs_quota_off(struct super_block *sb, int type) dqopt->info[cnt].dqi_bgrace = 0; dqopt->ops[cnt] = NULL; } - up_write(&dqopt->dqptr_sem); up(&dqopt->dqonoff_sem); out: return 0; @@ -1180,7 +1193,8 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) struct inode *inode; struct quota_info *dqopt = sb_dqopt(sb); struct quota_format_type *fmt = find_quota_format(format_id); - int error; + int error, cnt; + struct dquot *to_drop[MAXQUOTAS]; unsigned int oldflags; if (!fmt) @@ -1202,7 +1216,6 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) goto out_f; down(&dqopt->dqonoff_sem); - down_write(&dqopt->dqptr_sem); if (sb_has_quota_enabled(sb, type)) { error = -EBUSY; goto out_lock; @@ -1213,8 +1226,20 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) if (!fmt->qf_ops->check_quota_file(sb, type)) goto out_file_init; /* We don't want quota and atime on quota files (deadlocks possible) */ - dquot_drop_nolock(inode); + down_write(&dqopt->dqptr_sem); inode->i_flags |= S_NOQUOTA | S_NOATIME; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + to_drop[cnt] = inode->i_dquot[cnt]; + inode->i_dquot[cnt] = NODQUOT; + } + inode->i_flags &= ~S_QUOTA; + up_write(&dqopt->dqptr_sem); + /* We must put dquots outside of dqptr_sem because we may need to + * start 
transaction for write */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (to_drop[cnt]) + dqput(to_drop[cnt]); + } dqopt->ops[type] = fmt->qf_ops; dqopt->info[type].dqi_format = fmt; @@ -1225,7 +1250,6 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) } up(&dqopt->dqio_sem); set_enable_flags(dqopt, type); - up_write(&dqopt->dqptr_sem); add_dquot_ref(sb, type); up(&dqopt->dqonoff_sem); @@ -1268,14 +1292,14 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d { struct dquot *dquot; - down_read(&sb_dqopt(sb)->dqptr_sem); + down(&sb_dqopt(sb)->dqonoff_sem); if (!(dquot = dqget(sb, id, type))) { - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return -ESRCH; } do_get_dqblk(dquot, di); dqput(dquot); - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return 0; } @@ -1337,14 +1361,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d { struct dquot *dquot; - down_read(&sb_dqopt(sb)->dqptr_sem); + down(&sb_dqopt(sb)->dqonoff_sem); if (!(dquot = dqget(sb, id, type))) { - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return -ESRCH; } do_set_dqblk(dquot, di); dqput(dquot); - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return 0; } @@ -1353,9 +1377,9 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) { struct mem_dqinfo *mi; - down_read(&sb_dqopt(sb)->dqptr_sem); + down(&sb_dqopt(sb)->dqonoff_sem); if (!sb_has_quota_enabled(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return -ESRCH; } mi = sb_dqopt(sb)->info + type; @@ -1365,7 +1389,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) ii->dqi_flags = mi->dqi_flags & DQF_MASK; ii->dqi_valid = IIF_ALL; spin_unlock(&dq_data_lock); - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return 0; } @@ -1374,9 +1398,9 @@ int vfs_set_dqinfo(struct 
super_block *sb, int type, struct if_dqinfo *ii) { struct mem_dqinfo *mi; - down_read(&sb_dqopt(sb)->dqptr_sem); + down(&sb_dqopt(sb)->dqonoff_sem); if (!sb_has_quota_enabled(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return -ESRCH; } mi = sb_dqopt(sb)->info + type; @@ -1389,7 +1413,7 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | (ii->dqi_flags & DQF_MASK); mark_info_dirty(mi); spin_unlock(&dq_data_lock); - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return 0; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index baf30c5045ec..e6ae6c9e0f46 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1958,6 +1958,18 @@ int ext3_statfs (struct super_block * sb, struct kstatfs * buf) #define EXT3_V0_QFMT_BLOCKS 27 static int (*old_write_dquot)(struct dquot *dquot); +static void (*old_drop_dquot)(struct inode *inode); + +static int fmt_to_blocks(int fmt) +{ + switch (fmt) { + case QFMT_VFS_OLD: + return EXT3_OLD_QFMT_BLOCKS; + case QFMT_VFS_V0: + return EXT3_V0_QFMT_BLOCKS; + } + return EXT3_MAX_TRANS_DATA; +} static int ext3_write_dquot(struct dquot *dquot) { @@ -1965,20 +1977,11 @@ static int ext3_write_dquot(struct dquot *dquot) int ret; int err; handle_t *handle; - struct quota_info *dqops = sb_dqopt(dquot->dq_sb); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); struct inode *qinode; - switch (dqops->info[dquot->dq_type].dqi_format->qf_fmt_id) { - case QFMT_VFS_OLD: - nblocks = EXT3_OLD_QFMT_BLOCKS; - break; - case QFMT_VFS_V0: - nblocks = EXT3_V0_QFMT_BLOCKS; - break; - default: - nblocks = EXT3_MAX_TRANS_DATA; - } - qinode = dqops->files[dquot->dq_type]->f_dentry->d_inode; + nblocks = fmt_to_blocks(dqopt->info[dquot->dq_type].dqi_format->qf_fmt_id); + qinode = dqopt->files[dquot->dq_type]->f_dentry->d_inode; handle = ext3_journal_start(qinode, nblocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1991,6 +1994,28 @@ 
static int ext3_write_dquot(struct dquot *dquot) out: return ret; } + +static void ext3_drop_dquot(struct inode *inode) +{ + int nblocks, type; + struct quota_info *dqopt = sb_dqopt(inode->i_sb); + handle_t *handle; + + for (type = 0; type < MAXQUOTAS; type++) { + if (sb_has_quota_enabled(inode->i_sb, type)) + break; + } + if (type < MAXQUOTAS) + nblocks = fmt_to_blocks(dqopt->info[type].dqi_format->qf_fmt_id); + else + nblocks = 0; /* No quota => no drop */ + handle = ext3_journal_start(inode, 2*nblocks); + if (IS_ERR(handle)) + return; + old_drop_dquot(inode); + ext3_journal_stop(handle); + return; +} #endif static struct super_block *ext3_get_sb(struct file_system_type *fs_type, @@ -2018,7 +2043,9 @@ static int __init init_ext3_fs(void) #ifdef CONFIG_QUOTA init_dquot_operations(&ext3_qops); old_write_dquot = ext3_qops.write_dquot; + old_drop_dquot = ext3_qops.drop; ext3_qops.write_dquot = ext3_write_dquot; + ext3_qops.drop = ext3_drop_dquot; #endif err = register_filesystem(&ext3_fs_type); if (err) diff --git a/fs/inode.c b/fs/inode.c index 01c5740aacdd..d367d4629f3e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1216,15 +1216,13 @@ EXPORT_SYMBOL(inode_needs_sync); */ #ifdef CONFIG_QUOTA -/* Functions back in dquot.c */ -void put_dquot_list(struct list_head *); +/* Function back in dquot.c */ int remove_inode_dquot_ref(struct inode *, int, struct list_head *); -void remove_dquot_ref(struct super_block *sb, int type) +void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree_head) { struct inode *inode; struct list_head *act_head; - LIST_HEAD(tofree_head); if (!sb->dq_op) return; /* nothing to do */ @@ -1234,26 +1232,24 @@ void remove_dquot_ref(struct super_block *sb, int type) list_for_each(act_head, &inode_in_use) { inode = list_entry(act_head, struct inode, i_list); if (inode->i_sb == sb && IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); + remove_inode_dquot_ref(inode, type, tofree_head); } list_for_each(act_head, 
&inode_unused) { inode = list_entry(act_head, struct inode, i_list); if (inode->i_sb == sb && IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); + remove_inode_dquot_ref(inode, type, tofree_head); } list_for_each(act_head, &sb->s_dirty) { inode = list_entry(act_head, struct inode, i_list); if (IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); + remove_inode_dquot_ref(inode, type, tofree_head); } list_for_each(act_head, &sb->s_io) { inode = list_entry(act_head, struct inode, i_list); if (IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); + remove_inode_dquot_ref(inode, type, tofree_head); } spin_unlock(&inode_lock); - - put_dquot_list(&tofree_head); } #endif diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 155c9a2af016..e5a9e6bed751 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -64,11 +64,8 @@ static __inline__ int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA) return 1; } - else { - spin_lock(&dq_data_lock); + else inode_add_bytes(inode, nr); - spin_unlock(&dq_data_lock); - } return 0; } @@ -87,11 +84,8 @@ static __inline__ int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA) return 1; } - else { - spin_lock(&dq_data_lock); + else inode_add_bytes(inode, nr); - spin_unlock(&dq_data_lock); - } return 0; } @@ -117,11 +111,8 @@ static __inline__ void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { if (sb_any_quota_enabled(inode->i_sb)) inode->i_sb->dq_op->free_space(inode, nr); - else { - spin_lock(&dq_data_lock); + else inode_sub_bytes(inode, nr); - spin_unlock(&dq_data_lock); - } } static __inline__ void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr) -- cgit v1.2.3 From 94b1c3ebf78bd58c2f45b78f2c24c7c939c34a9e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:52:32 -0700 
Subject: [PATCH] knfsd: Remove name_lookup.h that no one is using anymore. From: NeilBrown --- include/linux/sunrpc/name_lookup.h | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 include/linux/sunrpc/name_lookup.h (limited to 'include/linux') diff --git a/include/linux/sunrpc/name_lookup.h b/include/linux/sunrpc/name_lookup.h deleted file mode 100644 index 0c97ec324ada..000000000000 --- a/include/linux/sunrpc/name_lookup.h +++ /dev/null @@ -1,38 +0,0 @@ - -/* - * map between user/group name and id for a given 'client' - */ - -struct name_ent { - char name[20]; -}; -static inline int name_get_user(int uid, struct name_ent **namep) -{ - struct name_ent *n = kmalloc(sizeof(*n),GFP_KERNEL); - if (n) sprintf(n->name, "%d",uid); - *namep = n; - return n ? 0 : -ENOMEM; -} -static inline int name_get_group(int uid, struct name_ent **namep) -{ - struct name_ent *n = kmalloc(sizeof(*n),GFP_KERNEL); - if (n) sprintf(n->name, "%d",uid); - *namep = n; - return n ? 0 : -ENOMEM; -} -static inline int name_get_uid(char *name, int name_len, int *uidp) -{ - *uidp = simple_strtoul(name, NULL, 0); - return 0; -} - -static inline int name_get_gid(char *name, int name_len, int *gidp) -{ - *gidp = simple_strtoul(name, NULL, 0); - return 0; -} - -static inline void name_put(struct name_ent *ent) -{ - kfree(ent); -} -- cgit v1.2.3 From c02c0886973521cd77904d8f07aa98d99c63cb3b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:52:44 -0700 Subject: [PATCH] knfsd: Add server-side support for the nfsv4 mounted_on_fileid attribute.
From: NeilBrown --- fs/nfsd/nfs4xdr.c | 11 +++++++++++ include/linux/nfs4.h | 1 + include/linux/nfsd/nfsd.h | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d19b1c6b7f45..8908bfc17184 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1588,7 +1588,18 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, WRITE32(stat.mtime.tv_sec); WRITE32(stat.mtime.tv_nsec); } + if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { + struct dentry *mnt_pnt, *mnt_root; + if ((buflen -= 8) < 0) + goto out_resource; + mnt_root = exp->ex_mnt->mnt_root; + if (mnt_root->d_inode == dentry->d_inode) { + mnt_pnt = exp->ex_mnt->mnt_mountpoint; + WRITE64((u64) mnt_pnt->d_inode->i_ino); + } else + WRITE64((u64) stat.ino); + } *attrlenp = htonl((char *)p - (char *)attrlenp - 4); *countp = p - buffer; status = nfs_ok; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index e8ea2239a213..520545881a52 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -274,6 +274,7 @@ enum lock_type4 { #define FATTR4_WORD1_TIME_METADATA (1 << 20) #define FATTR4_WORD1_TIME_MODIFY (1 << 21) #define FATTR4_WORD1_TIME_MODIFY_SET (1 << 22) +#define FATTR4_WORD1_MOUNTED_ON_FILEID (1 << 23) #define NFSPROC4_NULL 0 #define NFSPROC4_COMPOUND 1 diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 6e6a66208308..418356558209 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -278,7 +278,7 @@ static inline int is_fsid(struct svc_fh *fh, struct knfsd_fh *reffh) | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ - | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET) + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) 
/* These will return ERR_INVAL if specified in GETATTR or READDIR. */ #define NFSD_WRITEONLY_ATTRS_WORD1 \ -- cgit v1.2.3 From 238a06e203a96960843faec4ec8f553f453082b9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:53:09 -0700 Subject: [PATCH] knfsd: Export a symbol needed by auth_gss From: NeilBrown From: "J. Bruce Fields" Without this compiling auth_gss as module fails. --- include/linux/sunrpc/xdr.h | 1 + net/sunrpc/sunrpc_syms.c | 1 + net/sunrpc/xdr.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 0ccaff2cdee2..2b334dc19962 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -145,6 +145,7 @@ extern void _copy_from_pages(char *, struct page **, size_t, size_t); extern void xdr_buf_from_iov(struct iovec *, struct xdr_buf *); extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, int, int); extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, int); +extern int read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len); /* * Helper structure for copying from an sk_buff. 
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 9061f6498cc4..1ae41edbb0f1 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -134,6 +134,7 @@ EXPORT_SYMBOL(xdr_read_pages); EXPORT_SYMBOL(xdr_buf_from_iov); EXPORT_SYMBOL(xdr_buf_subsegment); EXPORT_SYMBOL(xdr_buf_read_netobj); +EXPORT_SYMBOL(read_bytes_from_xdr_buf); /* Debugging symbols */ #ifdef RPC_DEBUG diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index accfdd9284df..cae451e8db8d 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -799,7 +799,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, } /* obj is assumed to point to allocated memory of size at least len: */ -static int +int read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len) { struct xdr_buf subbuf; -- cgit v1.2.3 From 9abdc6608d7c5e3cb09c05bd6c726d04dc59ace4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:53:24 -0700 Subject: [PATCH] knfsd: Add data integrity to server side gss From: NeilBrown From: "J. Bruce Fields" rpcsec_gss supports three security levels: 1. authentication only: sign the header of each rpc request and response. 2. integrity: sign the header and body of each rpc request and response. 3. privacy: sign the header and encrypt the body of each rpc request and response. The first 2 are already supported on the client; this adds integrity support on the server.
--- include/linux/sunrpc/svcauth_gss.h | 9 -- net/sunrpc/auth_gss/gss_krb5_mech.c | 2 + net/sunrpc/auth_gss/svcauth_gss.c | 172 ++++++++++++++++++++++++++++++++++-- 3 files changed, 168 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h index 73ca6ef2c4a8..a444c9edb9e9 100644 --- a/include/linux/sunrpc/svcauth_gss.h +++ b/include/linux/sunrpc/svcauth_gss.h @@ -22,14 +22,5 @@ int gss_svc_init(void); int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name); - -struct gss_svc_data { - /* decoded gss client cred: */ - struct rpc_gss_wire_cred clcred; - /* pointer to the beginning of the procedure-specific results, which - * may be encrypted/checksummed in svcauth_gss_release: */ - u32 *body_start; -}; - #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */ diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 42ceee1907d7..57c074a06970 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -236,6 +236,8 @@ static int __init init_kerberos_module(void) gss_register_triple(RPC_AUTH_GSS_KRB5I, gm, 0, RPC_GSS_SVC_INTEGRITY); if (svcauth_gss_register_pseudoflavor(RPC_AUTH_GSS_KRB5, "krb5")) printk("Failed to register %s with server!\n", "krb5"); + if (svcauth_gss_register_pseudoflavor(RPC_AUTH_GSS_KRB5I, "krb5i")) + printk("Failed to register %s with server!\n", "krb5i"); gss_mech_put(gm); return 0; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 9e13aaa2bc79..2277667d3397 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -670,6 +670,68 @@ out: return stat; } +static inline int +read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) +{ + u32 raw; + int status; + + status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj)); + if (status) + return status; + *obj = ntohl(raw); + return 0; +} 
+ +/* It would be nice if this bit of code could be shared with the client. + * Obstacles: + * The client shouldn't malloc(), would have to pass in own memory. + * The server uses base of head iovec as read pointer, while the + * client uses separate pointer. */ +static int +unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) +{ + int stat = -EINVAL; + u32 integ_len, maj_stat; + struct xdr_netobj mic; + struct xdr_buf integ_buf; + + integ_len = ntohl(svc_getu32(&buf->head[0])); + if (integ_len & 3) + goto out; + if (integ_len > buf->len) + goto out; + if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) + BUG(); + /* copy out mic... */ + if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) + BUG(); + if (mic.len > RPC_MAX_AUTH_SIZE) + goto out; + mic.data = kmalloc(mic.len, GFP_KERNEL); + if (!mic.data) + goto out; + if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) + goto out; + maj_stat = gss_verify_mic(ctx, &integ_buf, &mic, NULL); + if (maj_stat != GSS_S_COMPLETE) + goto out; + if (ntohl(svc_getu32(&buf->head[0])) != seq) + goto out; + stat = 0; +out: + return stat; +} + +struct gss_svc_data { + /* decoded gss client cred: */ + struct rpc_gss_wire_cred clcred; + /* pointer to the beginning of the procedure-specific results, + * which may be encrypted/checksummed in svcauth_gss_release: */ + u32 *body_start; + struct rsc *rsci; +}; + /* * Accept an rpcsec packet. 
* If context establishment, punt to user space @@ -701,6 +763,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp) if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; + svcdata->body_start = 0; + svcdata->rsci = NULL; gc = &svcdata->clcred; /* start of rpc packet is 7 u32's back from here: @@ -754,9 +818,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp) break; case RPC_GSS_PROC_DATA: case RPC_GSS_PROC_DESTROY: - /* integrity and privacy unsupported: */ - if (gc->gc_svc != RPC_GSS_SVC_NONE) - goto auth_err; *authp = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(&gc->gc_ctx); if (!rsci) @@ -841,10 +902,28 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp) *authp = rpcsec_gsserr_ctxproblem; if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; - /* For use when wrapping: */ - svcdata->body_start = resv->iov_base + 1; rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); + *authp = rpc_autherr_badcred; + switch (gc->gc_svc) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + if (unwrap_integ_data(&rqstp->rq_arg, + gc->gc_seq, rsci->mechctx)) + goto auth_err; + svcdata->rsci = rsci; + cache_get(&rsci->h); + /* placeholders for length and seq. number: */ + svcdata->body_start = resv->iov_base + resv->iov_len; + svc_putu32(resv, 0); + svc_putu32(resv, 0); + break; + case RPC_GSS_SVC_PRIVACY: + /* currently unsupported */ + default: + goto auth_err; + } ret = SVC_OK; goto out; } @@ -867,14 +946,95 @@ out: static int svcauth_gss_release(struct svc_rqst *rqstp) { + struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct xdr_buf *resbuf = &rqstp->rq_res; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + struct iovec *resv; + u32 *p; + int integ_offset, integ_len; + int stat = -EINVAL; + + if (gc->gc_proc != RPC_GSS_PROC_DATA) + goto out; + /* Release can be called twice, but we only wrap once. 
*/ + if (gsd->body_start == 0) + goto out; + /* normally not set till svc_send, but we need it here: */ + resbuf->len = resbuf->head[0].iov_len + + resbuf->page_len + resbuf->tail[0].iov_len; + switch (gc->gc_svc) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + p = gsd->body_start; + gsd->body_start = 0; + /* move accept_stat to right place: */ + memcpy(p, p + 2, 4); + /* don't wrap in failure case: */ + /* Note: counting on not getting here if call was not even + * accepted! */ + if (*p != rpc_success) { + resbuf->head[0].iov_len -= 2 * 4; + goto out; + } + p++; + integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; + integ_len = resbuf->len - integ_offset; + BUG_ON(integ_len % 4); + *p++ = htonl(integ_len); + *p++ = htonl(gc->gc_seq); + if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, + integ_len)) + BUG(); + if (resbuf->page_len == 0 + && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE + < PAGE_SIZE) { + BUG_ON(resbuf->tail[0].iov_len); + /* Use head for everything */ + resv = &resbuf->head[0]; + } else if (resbuf->tail[0].iov_base == NULL) { + /* copied from nfsd4_encode_read */ + svc_take_page(rqstp); + resbuf->tail[0].iov_base = page_address(rqstp + ->rq_respages[rqstp->rq_resused-1]); + rqstp->rq_restailpage = rqstp->rq_resused-1; + resbuf->tail[0].iov_len = 0; + resv = &resbuf->tail[0]; + } else { + resv = &resbuf->tail[0]; + } + mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; + if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) + goto out_err; + svc_putu32(resv, htonl(mic.len)); + memset(mic.data + mic.len, 0, + round_up_to_quad(mic.len) - mic.len); + resv->iov_len += XDR_QUADLEN(mic.len) << 2; + /* not strictly required: */ + resbuf->len += XDR_QUADLEN(mic.len) << 2; + BUG_ON(resv->iov_len > PAGE_SIZE); + break; + case RPC_GSS_SVC_PRIVACY: + default: + goto out_err; + } + +out: + stat = 0; +out_err: if (rqstp->rq_client) auth_domain_put(rqstp->rq_client); rqstp->rq_client = NULL; if 
(rqstp->rq_cred.cr_group_info) put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; + if (gsd->rsci) + rsc_put(&gsd->rsci->h, &rsc_cache); + gsd->rsci = NULL; - return 0; + return stat; } static void -- cgit v1.2.3 From c334f752d8e9d3847d4459d06f7544dea9a49923 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:53:50 -0700 Subject: [PATCH] posix message queues: code move From: Manfred Spraul cleanup of sysv ipc as a preparation for posix message queues: - replace !CONFIG_SYSVIPC wrappers for copy_semundo and exit_sem with static inline wrappers. Now the whole ipc/util.c file is only used if CONFIG_SYSVIPC is set, use makefile magic instead of #ifdef. - remove the prototypes for copy_semundo and exit_sem from kernel/fork.c - they belong into a header file. - create a new msgutil.c with the helper functions for message queues. - cleanup the helper functions: run Lindent, add __user tags. --- include/linux/msg.h | 3 -- include/linux/sem.h | 17 ++++++- ipc/Makefile | 4 +- ipc/msg.c | 105 ------------------------------------------- ipc/msgutil.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++ ipc/util.c | 19 -------- ipc/util.h | 10 +++++ kernel/fork.c | 4 +- 8 files changed, 155 insertions(+), 134 deletions(-) create mode 100644 ipc/msgutil.c (limited to 'include/linux') diff --git a/include/linux/msg.h b/include/linux/msg.h index b235e862a3dd..2c4c6aa643ff 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -74,9 +74,6 @@ struct msg_msg { /* the actual message follows immediately */ }; -#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) -#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) - /* one msq_queue structure for each present queue on the system */ struct msg_queue { struct kern_ipc_perm q_perm; diff --git a/include/linux/sem.h b/include/linux/sem.h index b337c509ac29..aaf45764a56e 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -134,7 +134,22 @@ struct sysv_sem { 
struct sem_undo_list *undo_list; }; -void exit_sem(struct task_struct *p); +#ifdef CONFIG_SYSVIPC + +extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern void exit_sem(struct task_struct *tsk); + +#else +static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +{ + return 0; +} + +static inline void exit_sem(struct task_struct *tsk) +{ + return; +} +#endif #endif /* __KERNEL__ */ diff --git a/ipc/Makefile b/ipc/Makefile index ccc6c64c2493..6cd32a30f03f 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -2,7 +2,5 @@ # Makefile for the linux ipc. # -obj-y := util.o - obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o -obj-$(CONFIG_SYSVIPC) += msg.o sem.o shm.o +obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o diff --git a/ipc/msg.c b/ipc/msg.c index 709ff71bf5c1..37e2d3bb17cb 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -51,11 +51,6 @@ struct msg_sender { struct task_struct* tsk; }; -struct msg_msgseg { - struct msg_msgseg* next; - /* the next part of the message follows immediately */ -}; - #define SEARCH_ANY 1 #define SEARCH_EQUAL 2 #define SEARCH_NOTEQUAL 3 @@ -129,106 +124,6 @@ static int newque (key_t key, int msgflg) return msg_buildid(id,msq->q_perm.seq); } -static void free_msg(struct msg_msg* msg) -{ - struct msg_msgseg* seg; - - security_msg_msg_free(msg); - - seg = msg->next; - kfree(msg); - while(seg != NULL) { - struct msg_msgseg* tmp = seg->next; - kfree(seg); - seg = tmp; - } -} - -static struct msg_msg* load_msg(void* src, int len) -{ - struct msg_msg* msg; - struct msg_msgseg** pseg; - int err; - int alen; - - alen = len; - if(alen > DATALEN_MSG) - alen = DATALEN_MSG; - - msg = (struct msg_msg *) kmalloc (sizeof(*msg) + alen, GFP_KERNEL); - if(msg==NULL) - return ERR_PTR(-ENOMEM); - - msg->next = NULL; - msg->security = NULL; - - if (copy_from_user(msg+1, src, alen)) { - err = -EFAULT; - goto out_err; - } - - len -= alen; - src = ((char*)src)+alen; - pseg = &msg->next; - while(len > 0) { - struct 
msg_msgseg* seg; - alen = len; - if(alen > DATALEN_SEG) - alen = DATALEN_SEG; - seg = (struct msg_msgseg *) kmalloc (sizeof(*seg) + alen, GFP_KERNEL); - if(seg==NULL) { - err=-ENOMEM; - goto out_err; - } - *pseg = seg; - seg->next = NULL; - if(copy_from_user (seg+1, src, alen)) { - err = -EFAULT; - goto out_err; - } - pseg = &seg->next; - len -= alen; - src = ((char*)src)+alen; - } - - err = security_msg_msg_alloc(msg); - if (err) - goto out_err; - - return msg; - -out_err: - free_msg(msg); - return ERR_PTR(err); -} - -static int store_msg(void* dest, struct msg_msg* msg, int len) -{ - int alen; - struct msg_msgseg *seg; - - alen = len; - if(alen > DATALEN_MSG) - alen = DATALEN_MSG; - if(copy_to_user (dest, msg+1, alen)) - return -1; - - len -= alen; - dest = ((char*)dest)+alen; - seg = msg->next; - while(len > 0) { - alen = len; - if(alen > DATALEN_SEG) - alen = DATALEN_SEG; - if(copy_to_user (dest, seg+1, alen)) - return -1; - len -= alen; - dest = ((char*)dest)+alen; - seg=seg->next; - } - return 0; -} - static inline void ss_add(struct msg_queue* msq, struct msg_sender* mss) { mss->tsk=current; diff --git a/ipc/msgutil.c b/ipc/msgutil.c new file mode 100644 index 000000000000..e48d777de2a3 --- /dev/null +++ b/ipc/msgutil.c @@ -0,0 +1,127 @@ +/* + * linux/ipc/util.c + * Copyright (C) 1999, 2004 Manfred Spraul + * + * This file is released under GNU General Public Licence version 2 or + * (at your option) any later version. + * + * See the file COPYING for more details. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "util.h" + +struct msg_msgseg { + struct msg_msgseg* next; + /* the next part of the message follows immediately */ +}; + +#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) +#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) + +struct msg_msg *load_msg(void __user *src, int len) +{ + struct msg_msg *msg; + struct msg_msgseg **pseg; + int err; + int alen; + + alen = len; + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; + + msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL); + if (msg == NULL) + return ERR_PTR(-ENOMEM); + + msg->next = NULL; + msg->security = NULL; + + if (copy_from_user(msg + 1, src, alen)) { + err = -EFAULT; + goto out_err; + } + + len -= alen; + src = ((char *)src) + alen; + pseg = &msg->next; + while (len > 0) { + struct msg_msgseg *seg; + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; + seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen, + GFP_KERNEL); + if (seg == NULL) { + err = -ENOMEM; + goto out_err; + } + *pseg = seg; + seg->next = NULL; + if (copy_from_user(seg + 1, src, alen)) { + err = -EFAULT; + goto out_err; + } + pseg = &seg->next; + len -= alen; + src = ((char *)src) + alen; + } + + err = security_msg_msg_alloc(msg); + if (err) + goto out_err; + + return msg; + +out_err: + free_msg(msg); + return ERR_PTR(err); +} + +int store_msg(void __user *dest, struct msg_msg *msg, int len) +{ + int alen; + struct msg_msgseg *seg; + + alen = len; + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; + if (copy_to_user(dest, msg + 1, alen)) + return -1; + + len -= alen; + dest = ((char *)dest) + alen; + seg = msg->next; + while (len > 0) { + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; + if (copy_to_user(dest, seg + 1, alen)) + return -1; + len -= alen; + dest = ((char *)dest) + alen; + seg = seg->next; + } + return 0; +} + +void free_msg(struct msg_msg *msg) +{ + struct msg_msgseg *seg; + + security_msg_msg_free(msg); 
+ + seg = msg->next; + kfree(msg); + while (seg != NULL) { + struct msg_msgseg *tmp = seg->next; + kfree(seg); + seg = tmp; + } +} diff --git a/ipc/util.c b/ipc/util.c index 6d94883edae0..f74c5eef57d0 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -25,8 +25,6 @@ #include #include -#if defined(CONFIG_SYSVIPC) - #include "util.h" /** @@ -531,20 +529,3 @@ int ipc_parse_version (int *cmd) } #endif /* __ia64__ */ - -#else -/* - * Dummy functions when SYSV IPC isn't configured - */ - -int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) -{ - return 0; -} - -void exit_sem(struct task_struct *tsk) -{ - return; -} - -#endif /* CONFIG_SYSVIPC */ diff --git a/ipc/util.h b/ipc/util.h index 79c8fc901317..e6434942c097 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -4,6 +4,10 @@ * * ipc helper functions (c) 1999 Manfred Spraul */ + +#ifndef _IPC_UTIL_H +#define _IPC_UTIL_H + #define USHRT_MAX 0xffff #define SEQ_MULTIPLIER (IPCMNI) @@ -62,3 +66,9 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); #else int ipc_parse_version (int *cmd); #endif + +extern void free_msg(struct msg_msg *msg); +extern struct msg_msg *load_msg(void __user *src, int len); +extern int store_msg(void __user *dest, struct msg_msg *msg, int len); + +#endif diff --git a/kernel/fork.c b/kernel/fork.c index 3b17a249c50d..a1f20cabbdd3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -39,9 +40,6 @@ #include #include -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); -extern void exit_sem(struct task_struct *tsk); - /* The idle threads do not count.. * Protected by write_lock_irq(&tasklist_lock) */ -- cgit v1.2.3 From c50142a5433ed504fff2b1af152f8f7628830dfb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:54:03 -0700 Subject: [PATCH] posix message queues: syscall stubs From: Manfred Spraul Add -ENOSYS stubs for the posix message queue syscalls. 
The API is a direct mapping of the api from the unix spec, with two exceptions: - mq_close() doesn't exist. Message queue file descriptors can be closed with close(). - mq_notify(SIGEV_THREAD) cannot be implemented in the kernel. The kernel returns a pollable file descriptor . User space must poll (or read) this descriptor and call the notifier function if the file descriptor is signaled. --- arch/i386/kernel/entry.S | 9 +++++++++ include/asm-i386/unistd.h | 11 ++++++++++- include/linux/mqueue.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/syscalls.h | 9 +++++++++ kernel/sys.c | 6 ++++++ 5 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 include/linux/mqueue.h (limited to 'include/linux') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3024740ba84c..14e64d3ea25c 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -882,5 +882,14 @@ ENTRY(sys_call_table) .long sys_utimes .long sys_fadvise64_64 .long sys_ni_syscall /* sys_vserver */ + .long sys_ni_syscall /* sys_mbind */ + .long sys_ni_syscall /* 275 sys_get_mempolicy */ + .long sys_ni_syscall /* sys_set_mempolicy */ + .long sys_mq_open + .long sys_mq_unlink + .long sys_mq_timedsend + .long sys_mq_timedreceive /* 280 */ + .long sys_mq_notify + .long sys_mq_getsetattr syscall_table_size=(.-sys_call_table) diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index a2d58a99491e..620a232084f3 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -279,8 +279,17 @@ #define __NR_utimes 271 #define __NR_fadvise64_64 272 #define __NR_vserver 273 +#define __NR_mbind 274 +#define __NR_get_mempolicy 275 +#define __NR_set_mempolicy 276 +#define __NR_mq_open 277 +#define __NR_mq_unlink (__NR_mq_open+1) +#define __NR_mq_timedsend (__NR_mq_open+2) +#define __NR_mq_timedreceive (__NR_mq_open+3) +#define __NR_mq_notify (__NR_mq_open+4) +#define __NR_mq_getsetattr (__NR_mq_open+5) -#define NR_syscalls 274 +#define NR_syscalls 283 /* 
user-visible error numbers are in the range -1 - -124: see */ diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h new file mode 100644 index 000000000000..c0c5fcc89f0e --- /dev/null +++ b/include/linux/mqueue.h @@ -0,0 +1,36 @@ +/* Copyright (C) 2003 Krzysztof Benedyczak & Michal Wronski + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + It is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this software; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef _LINUX_MQUEUE_H +#define _LINUX_MQUEUE_H + +#define MQ_PRIO_MAX 32768 + +typedef int mqd_t; + +struct mq_attr { + long mq_flags; /* message queue flags */ + long mq_maxmsg; /* maximum number of messages */ + long mq_msgsize; /* maximum message size */ + long mq_curmsgs; /* number of messages currently queued */ +}; + +#define NOTIFY_NONE 0 +#define NOTIFY_WOKENUP 1 +#define NOTIFY_REMOVED 2 + +#endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index aaf87aeacafb..7ee5f67abb5f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -48,6 +48,8 @@ struct timex; struct timezone; struct tms; struct utimbuf; +typedef int mqd_t; +struct mq_attr; #include #include @@ -450,6 +452,13 @@ asmlinkage long sys_shmget(key_t key, size_t size, int flag); asmlinkage long sys_shmdt(char __user *shmaddr); asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); +asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr); +asmlinkage long sys_mq_unlink(const char __user *name); +asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); +asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); +asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); +asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); + asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn); asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/kernel/sys.c b/kernel/sys.c index bc498b12edcc..7d1bf5c57aca 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -260,6 +260,12 @@ cond_syscall(sys_msgctl) 
cond_syscall(sys_shmget) cond_syscall(sys_shmdt) cond_syscall(sys_shmctl) +cond_syscall(sys_mq_open) +cond_syscall(sys_mq_unlink) +cond_syscall(sys_mq_timedsend) +cond_syscall(sys_mq_timedreceive) +cond_syscall(sys_mq_notify) +cond_syscall(sys_mq_getsetattr) /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read) -- cgit v1.2.3 From f3ca8d5dd5c23594bda07893ae374bed7981d473 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:54:54 -0700 Subject: [PATCH] posix message queue update From: Manfred Spraul My discussion with Ulrich had one result: - mq_setattr can accept implementation defined flags. Right now we have none, but we might add some later (e.g. switch to CLOCK_MONOTONIC for mq_timed{send,receive} or something similar). When we add flags, we might need the fields for additional information. And they don't hurt. Therefore add four __reserved fields to mq_attr. - fail mq_setattr if we get unknown flags - otherwise glibc can't detect if it's running on a future kernel that supports new features. - use memset to initialize the mq_attr structure - theoretically we could leak kernel memory. - Only set O_NONBLOCK in mq_attr, explicitly clear O_RDWR & friends. openposix uses getattr, attr |=O_NONBLOCK, setattr - a sane approach. Without clearing O_RDWR, this fails. I've retested all openposix conformance tests with the new patch - the two new FAILED tests check undefined behavior. Note that I won't have net access until Sunday - if the message queue patch breaks something important either ask Krzysztof or drop it. Ulrich had another good idea for SIGEV_THREAD, but I must think about it. It would mean less complexity in glibc, but more code in the kernel. I'm not yet convinced that it's overall better. 
--- include/linux/mqueue.h | 1 + ipc/mqueue.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h index c0c5fcc89f0e..535fe4b2f14b 100644 --- a/include/linux/mqueue.h +++ b/include/linux/mqueue.h @@ -27,6 +27,7 @@ struct mq_attr { long mq_maxmsg; /* maximum number of messages */ long mq_msgsize; /* maximum message size */ long mq_curmsgs; /* number of messages currently queued */ + long __reserved[4]; /* ignored for input, zeroed for output */ }; #define NOTIFY_NONE 0 diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c9a3e652a026..b5f731781f56 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -121,7 +121,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode) INIT_LIST_HEAD(&info->e_wait_q[1].list); info->notify_owner = 0; info->qsize = 0; - info->attr.mq_curmsgs = 0; + memset(&info->attr, 0, sizeof(info->attr)); info->attr.mq_maxmsg = DFLT_MSGMAX; info->attr.mq_msgsize = DFLT_MSGSIZEMAX; info->messages = kmalloc(DFLT_MSGMAX * sizeof(struct msg_msg *), GFP_KERNEL); @@ -1082,6 +1082,8 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes, if (u_mqstat != NULL) { if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr))) return -EFAULT; + if (mqstat.mq_flags & (~O_NONBLOCK)) + return -EINVAL; } ret = -EBADF; @@ -1097,7 +1099,7 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes, spin_lock(&info->lock); omqstat = info->attr; - omqstat.mq_flags = filp->f_flags; + omqstat.mq_flags = filp->f_flags & O_NONBLOCK; if (u_mqstat) { if (mqstat.mq_flags & O_NONBLOCK) filp->f_flags |= O_NONBLOCK; -- cgit v1.2.3 From ed6dcf4a49c1098e0701762f6cc52b194cb7f661 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:55:19 -0700 Subject: [PATCH] split netlink_unicast From: Manfred Spraul The attached patch splits netlink_unicast into three steps: - netlink_getsock{bypid,byfilp}: lookup the destination socket. 
- netlink_attachskb: perform the nonblock checks, sleep if the socket queue is longer than the limit, etc. - netlink_sendskb: actually send the skb. jamal looked over it and didn't see a problem with the netlink change. The actual use from ipc/mqueue.c is still open (just send back whatever the C library passed to mq_notify, add an nlmsghdr or perhaps even make it a specialized netlink protocol), but the attached patch is independent of the message queue change. (acked by davem) --- include/linux/netlink.h | 7 +++ net/netlink/af_netlink.c | 120 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 101 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 4e5ea27305a2..e5e15ddadab5 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -119,6 +119,13 @@ extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code); extern int netlink_register_notifier(struct notifier_block *nb); extern int netlink_unregister_notifier(struct notifier_block *nb); +/* finegrained unicast helpers: */ +struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid); +struct sock *netlink_getsockbyfilp(struct file *filp); +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo); +void netlink_detachskb(struct sock *sk, struct sk_buff *skb); +int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol); + /* * skb should fit one page. This choice is good for headerless malloc. 
* diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 38c27b9bb70a..398cd03f2d7b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -415,38 +415,65 @@ static void netlink_overrun(struct sock *sk) } } -int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock) +struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) { - struct sock *sk; - struct netlink_opt *nlk; - int len = skb->len; int protocol = ssk->sk_protocol; - long timeo; - DECLARE_WAITQUEUE(wait, current); - - timeo = sock_sndtimeo(ssk, nonblock); + struct sock *sock; + struct netlink_opt *nlk; -retry: - sk = netlink_lookup(protocol, pid); - if (sk == NULL) - goto no_dst; - nlk = nlk_sk(sk); + sock = netlink_lookup(protocol, pid); + if (!sock) + return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ - if (nlk->pid == 0 && !nlk->data_ready) - goto no_dst; + nlk = nlk_sk(sock); + if (nlk->pid == 0 && !nlk->data_ready) { + sock_put(sock); + return ERR_PTR(-ECONNREFUSED); + } + return sock; +} + +struct sock *netlink_getsockbyfilp(struct file *filp) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct socket *socket; + struct sock *sock; + + if (!inode->i_sock || !(socket = SOCKET_I(inode))) + return ERR_PTR(-ENOTSOCK); + + sock = socket->sk; + if (sock->sk_family != AF_NETLINK) + return ERR_PTR(-EINVAL); + + sock_hold(sock); + return sock; +} + +/* + * Attach a skb to a netlink socket. + * The caller must hold a reference to the destination socket. On error, the + * reference is dropped. The skb is not send to the destination, just all + * all error checks are performed and memory in the queue is reserved. + * Return values: + * < 0: error. skb freed, reference to sock dropped. + * 0: continue + * 1: repeat lookup - reference dropped while waiting for socket memory. 
+ */ +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo) +{ + struct netlink_opt *nlk; + + nlk = nlk_sk(sk); #ifdef NL_EMULATE_DEV - if (nlk->handler) { - skb_orphan(skb); - len = nlk->handler(protocol, skb); - sock_put(sk); - return len; - } + if (nlk->handler) + return 0; #endif - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(0, &nlk->state)) { + DECLARE_WAITQUEUE(wait, current); if (!timeo) { if (!nlk->pid) netlink_overrun(sk); @@ -471,19 +498,60 @@ retry: kfree_skb(skb); return sock_intr_errno(timeo); } - goto retry; + return 1; } - skb_orphan(skb); skb_set_owner_r(skb, sk); + return 0; +} + +int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol) +{ + struct netlink_opt *nlk; + int len = skb->len; + + nlk = nlk_sk(sk); +#ifdef NL_EMULATE_DEV + if (nlk->handler) { + skb_orphan(skb); + len = nlk->handler(protocol, skb); + sock_put(sk); + return len; + } +#endif + skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, len); sock_put(sk); return len; +} -no_dst: +void netlink_detachskb(struct sock *sk, struct sk_buff *skb) +{ kfree_skb(skb); - return -ECONNREFUSED; + sock_put(sk); +} + +int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock) +{ + struct sock *sk; + int err; + long timeo; + + timeo = sock_sndtimeo(ssk, nonblock); +retry: + sk = netlink_getsockbypid(ssk, pid); + if (IS_ERR(sk)) { + kfree_skb(skb); + return PTR_ERR(skb); + } + err = netlink_attachskb(sk, skb, nonblock, timeo); + if (err == 1) + goto retry; + if (err) + return err; + + return netlink_sendskb(sk, skb, ssk->sk_protocol); } static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) -- cgit v1.2.3 From 34b98f223bb21673f4cab2f5079a763c34a67946 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:55:32 -0700 Subject: [PATCH] posix message queues: send notifications via netlink From: Manfred Spraul SIGEV_THREAD means that a given callback 
should be called in the context of a new thread. This must be done by the C library. The kernel must deliver a notice of the event to the C library when the callback should be called. This patch switches to a new, simpler interface: User space creates a socket with socket(PF_NETLINK, SOCK_RAW,0) and passes the fd to the mq_notify call together with a cookie. When the mq_notify() condition is satisfied, the kernel "writes" the cookie to the socket. User space then reads the cookie and calls the appropriate callback. --- include/linux/mqueue.h | 16 ++++ ipc/mqueue.c | 254 +++++++++++++++++++++---------------------------- 2 files changed, 123 insertions(+), 147 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h index 535fe4b2f14b..fdab3b8ee242 100644 --- a/include/linux/mqueue.h +++ b/include/linux/mqueue.h @@ -30,8 +30,24 @@ struct mq_attr { long __reserved[4]; /* ignored for input, zeroed for output */ }; +/* + * SIGEV_THREAD implementation: + * SIGEV_THREAD must be implemented in user space. If SIGEV_THREAD is passed + * to mq_notify, then + * - sigev_signo must be the file descriptor of an AF_NETLINK socket. It's not + * necessary that the socket is bound. + * - sigev_value.sival_ptr must point to a cookie that is NOTIFY_COOKIE_LEN + * bytes long. + * If the notification is triggered, then the cookie is sent to the netlink + * socket. The last byte of the cookie is replaced with the NOTIFY_?? codes: + * NOTIFY_WOKENUP if the notification got triggered, NOTIFY_REMOVED if it was + * removed, either due to a close() on the message queue fd or due to a + * mq_notify() that removed the notification. 
+ */ #define NOTIFY_NONE 0 #define NOTIFY_WOKENUP 1 #define NOTIFY_REMOVED 2 +#define NOTIFY_COOKIE_LEN 32 + #endif diff --git a/ipc/mqueue.c b/ipc/mqueue.c index f0d78fefc28b..f81441d63564 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include "util.h" #define MQUEUE_MAGIC 0x19800202 @@ -33,9 +36,6 @@ #define STATE_PENDING 1 #define STATE_READY 2 -#define NP_NONE ((void*)NOTIFY_NONE) -#define NP_WOKENUP ((void*)NOTIFY_WOKENUP) -#define NP_REMOVED ((void*)NOTIFY_REMOVED) /* used by sysctl */ #define FS_MQUEUE 1 #define CTL_QUEUESMAX 2 @@ -48,6 +48,8 @@ #define HARD_MSGMAX (131072/sizeof(void*)) #define DFLT_MSGSIZEMAX 16384 /* max message size */ +#define NOTIFY_COOKIE_LEN 32 + struct ext_wait_queue { /* queue of sleeping tasks */ struct task_struct *task; struct list_head list; @@ -56,25 +58,26 @@ struct ext_wait_queue { /* queue of sleeping tasks */ }; struct mqueue_inode_info { - struct mq_attr attr; + spinlock_t lock; + struct inode vfs_inode; + wait_queue_head_t wait_q; + struct msg_msg **messages; + struct mq_attr attr; - pid_t notify_owner; /* != 0 means notification registered */ - struct sigevent notify; - struct file *notify_filp; + struct sigevent notify; /* notify.sigev_notify == SIGEV_NONE means */ + pid_t notify_owner; /* no notification registered */ + struct sock *notify_sock; + struct sk_buff *notify_cookie; /* for tasks waiting for free space and messages, respectively */ struct ext_wait_queue e_wait_q[2]; - wait_queue_head_t wait_q; unsigned long qsize; /* size of queue in memory (sum of all msgs) */ - spinlock_t lock; - struct inode vfs_inode; }; static struct inode_operations mqueue_dir_inode_operations; static struct file_operations mqueue_file_operations; -static struct file_operations mqueue_notify_fops; static struct super_operations mqueue_super_ops; static void remove_notification(struct mqueue_inode_info *info); @@ -119,7 +122,7 @@ static struct inode 
*mqueue_get_inode(struct super_block *sb, int mode) init_waitqueue_head(&info->wait_q); INIT_LIST_HEAD(&info->e_wait_q[0].list); INIT_LIST_HEAD(&info->e_wait_q[1].list); - info->notify_owner = 0; + info->notify.sigev_notify = SIGEV_NONE; info->qsize = 0; memset(&info->attr, 0, sizeof(info->attr)); info->attr.mq_maxmsg = DFLT_MSGMAX; @@ -283,10 +286,11 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, snprintf(buffer, sizeof(buffer), "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n", info->qsize, - info->notify_owner ? info->notify.sigev_notify : SIGEV_NONE, - (info->notify_owner && info->notify.sigev_notify == SIGEV_SIGNAL ) ? + info->notify.sigev_notify, + (info->notify.sigev_notify == SIGEV_SIGNAL ) ? info->notify.sigev_signo : 0, - info->notify_owner); + (info->notify.sigev_notify != SIGEV_NONE) ? + info->notify_owner : 0); spin_unlock(&info->lock); buffer[sizeof(buffer)-1] = '\0'; slen = strlen(buffer)+1; @@ -299,7 +303,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, count = slen - o; if (copy_to_user(u_data, buffer + o, count)) - return -EFAULT; + return -EFAULT; *off = o + count; filp->f_dentry->d_inode->i_atime = filp->f_dentry->d_inode->i_ctime = CURRENT_TIME; @@ -311,7 +315,8 @@ static int mqueue_flush_file(struct file *filp) struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); spin_lock(&info->lock); - if (current->tgid == info->notify_owner) + if (info->notify.sigev_notify != SIGEV_NONE && + current->tgid == info->notify_owner) remove_notification(info); spin_unlock(&info->lock); @@ -435,6 +440,11 @@ static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) return info->messages[info->attr.mq_curmsgs]; } +static inline void set_cookie(struct sk_buff *skb, char code) +{ + ((char*)skb->data)[NOTIFY_COOKIE_LEN-1] = code; +} + /* * The next function is only to split too long sys_mq_timedsend */ @@ -445,7 +455,8 @@ static void __do_notify(struct mqueue_inode_info *info) * 
waiting synchronously for message AND state of queue changed from * empty to not empty. Here we are sure that no one is waiting * synchronously. */ - if (info->notify_owner && info->attr.mq_curmsgs == 1) { + if (info->notify.sigev_notify != SIGEV_NONE && + info->attr.mq_curmsgs == 1) { /* sends signal */ if (info->notify.sigev_notify == SIGEV_SIGNAL) { struct siginfo sig_i; @@ -460,10 +471,12 @@ static void __do_notify(struct mqueue_inode_info *info) kill_proc_info(info->notify.sigev_signo, &sig_i, info->notify_owner); } else if (info->notify.sigev_notify == SIGEV_THREAD) { - info->notify_filp->private_data = (void*)NP_WOKENUP; + set_cookie(info->notify_cookie, NOTIFY_WOKENUP); + netlink_sendskb(info->notify_sock, + info->notify_cookie, 0); } /* after notification unregisters process */ - info->notify_owner = 0; + info->notify.sigev_notify = SIGEV_NONE; } wake_up(&info->wait_q); } @@ -499,90 +512,13 @@ static long prepare_timeout(const struct timespec __user *u_arg) return timeout; } -/* - * File descriptor based notification, intended to be used to implement - * SIGEV_THREAD: - * SIGEV_THREAD means that a notification function should be called in the - * context of a new thread. The kernel can't do that. Therefore mq_notify - * calls with SIGEV_THREAD return a new file descriptor. A user space helper - * must create a new thread and then read from the given file descriptor. - * The read always returns one byte. If it's NOTIFY_WOKENUP, then it must - * call the notification function. If it's NOTIFY_REMOVED, then the - * notification was removed. The file descriptor supports poll, thus one - * supervisor thread can manage multiple message queue notifications. - * - * The implementation must support multiple outstanding notifications: - * It's possible that a new notification is added and signaled before user - * space calls mqueue_notify_read for the previous notification. 
- * Therefore the notification state is stored in the private_data field of - * the file descriptor. - */ -static unsigned int mqueue_notify_poll(struct file *filp, - struct poll_table_struct *poll_tab) -{ - struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); - int retval; - - poll_wait(filp, &info->wait_q, poll_tab); - - if (filp->private_data == NP_NONE) - retval = 0; - else - retval = POLLIN | POLLRDNORM; - return retval; -} - -static ssize_t mqueue_notify_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) -{ - struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); - char result; - - if (!count) - return 0; - if (*ppos != 0) - return 0; - spin_lock(&info->lock); - while (filp->private_data == NP_NONE) { - DEFINE_WAIT(wait); - if (filp->f_flags & O_NONBLOCK) { - spin_unlock(&info->lock); - return -EAGAIN; - } - prepare_to_wait(&info->wait_q, &wait, TASK_INTERRUPTIBLE); - spin_unlock(&info->lock); - schedule(); - finish_wait(&info->wait_q, &wait); - spin_lock(&info->lock); - } - spin_unlock(&info->lock); - result = (char)(unsigned long)filp->private_data; - if (put_user(result, buf)) - return -EFAULT; - *ppos = 1; - return 1; -} - -static int mqueue_notify_release(struct inode *inode, struct file *filp) -{ - struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); - - spin_lock(&info->lock); - if (info->notify_owner && info->notify_filp == filp) - info->notify_owner = 0; - filp->private_data = NP_REMOVED; - spin_unlock(&info->lock); - - return 0; -} - static void remove_notification(struct mqueue_inode_info *info) { if (info->notify.sigev_notify == SIGEV_THREAD) { - info->notify_filp->private_data = NP_REMOVED; - wake_up(&info->wait_q); + set_cookie(info->notify_cookie, NOTIFY_REMOVED); + netlink_sendskb(info->notify_sock, info->notify_cookie, 0); } - info->notify_owner = 0; + info->notify.sigev_notify = SIGEV_NONE; } /* @@ -780,7 +716,8 @@ out_unlock: */ /* pipelined_send() - send a message directly to 
the task waiting in - * sys_mq_timedreceive() (without inserting message into a queue). */ + * sys_mq_timedreceive() (without inserting message into a queue). + */ static inline void pipelined_send(struct mqueue_inode_info *info, struct msg_msg *message, struct ext_wait_queue *receiver) @@ -978,12 +915,16 @@ out: asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) { - int ret, fd; - struct file *filp, *nfilp; + int ret; + struct file *filp; + struct sock *sock; struct inode *inode; struct sigevent notification; struct mqueue_inode_info *info; + struct sk_buff *nc; + nc = NULL; + sock = NULL; if (u_notification == NULL) { notification.sigev_notify = SIGEV_NONE; } else { @@ -1000,6 +941,44 @@ asmlinkage long sys_mq_notify(mqd_t mqdes, notification.sigev_signo > _NSIG)) { return -EINVAL; } + if (notification.sigev_notify == SIGEV_THREAD) { + /* create the notify skb */ + nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL); + ret = -ENOMEM; + if (!nc) + goto out; + ret = -EFAULT; + if (copy_from_user(nc->data, + notification.sigev_value.sival_ptr, + NOTIFY_COOKIE_LEN)) { + goto out; + } + + /* TODO: add a header? 
*/ + skb_put(nc, NOTIFY_COOKIE_LEN); + /* and attach it to the socket */ +retry: + filp = fget(notification.sigev_signo); + ret = -EBADF; + if (!filp) + goto out; + sock = netlink_getsockbyfilp(filp); + fput(filp); + if (IS_ERR(sock)) { + ret = PTR_ERR(sock); + sock = NULL; + goto out; + } + + ret = netlink_attachskb(sock, nc, 0, MAX_SCHEDULE_TIMEOUT); + if (ret == 1) + goto retry; + if (ret) { + sock = NULL; + nc = NULL; + goto out; + } + } } ret = -EBADF; @@ -1013,47 +992,33 @@ asmlinkage long sys_mq_notify(mqd_t mqdes, info = MQUEUE_I(inode); ret = 0; - if (notification.sigev_notify == SIGEV_THREAD) { - ret = get_unused_fd(); - if (ret < 0) - goto out_fput; - fd = ret; - nfilp = get_empty_filp(); - if (!nfilp) { - ret = -ENFILE; - goto out_dropfd; - } - nfilp->private_data = NP_NONE; - nfilp->f_op = &mqueue_notify_fops; - nfilp->f_vfsmnt = mntget(mqueue_mnt); - nfilp->f_dentry = dget(filp->f_dentry); - nfilp->f_mapping = filp->f_dentry->d_inode->i_mapping; - nfilp->f_flags = O_RDONLY; - nfilp->f_mode = FMODE_READ; - } else { - nfilp = NULL; - fd = -1; - } - spin_lock(&info->lock); - - if (notification.sigev_notify == SIGEV_NONE) { - if (info->notify_owner == current->tgid) { + switch (notification.sigev_notify) { + case SIGEV_NONE: + if (info->notify.sigev_notify != SIGEV_NONE && + info->notify_owner == current->tgid) { remove_notification(info); inode->i_atime = inode->i_ctime = CURRENT_TIME; } - } else if (info->notify_owner) { - ret = -EBUSY; - } else if (notification.sigev_notify == SIGEV_THREAD) { - info->notify_filp = nfilp; - fd_install(fd, nfilp); - ret = fd; - fd = -1; - nfilp = NULL; + break; + case SIGEV_THREAD: + if (info->notify.sigev_notify != SIGEV_NONE) { + ret = -EBUSY; + break; + } + info->notify_sock = sock; + info->notify_cookie = nc; + sock = NULL; + nc = NULL; info->notify.sigev_notify = SIGEV_THREAD; info->notify_owner = current->tgid; inode->i_atime = inode->i_ctime = CURRENT_TIME; - } else { + break; + case SIGEV_SIGNAL: + if 
(info->notify.sigev_notify != SIGEV_NONE) { + ret = -EBUSY; + break; + } info->notify.sigev_signo = notification.sigev_signo; info->notify.sigev_value = notification.sigev_value; info->notify.sigev_notify = SIGEV_SIGNAL; @@ -1061,12 +1026,14 @@ asmlinkage long sys_mq_notify(mqd_t mqdes, inode->i_atime = inode->i_ctime = CURRENT_TIME; } spin_unlock(&info->lock); -out_dropfd: - if (fd != -1) - put_unused_fd(fd); out_fput: fput(filp); out: + if (sock) { + netlink_detachskb(sock, nc); + } else if (nc) { + dev_kfree_skb(nc); + } return ret; } @@ -1135,13 +1102,6 @@ static struct file_operations mqueue_file_operations = { .read = mqueue_read_file, }; -static struct file_operations mqueue_notify_fops = { - .poll = mqueue_notify_poll, - .read = mqueue_notify_read, - .release = mqueue_notify_release, -}; - - static struct super_operations mqueue_super_ops = { .alloc_inode = mqueue_alloc_inode, .destroy_inode = mqueue_destroy_inode, -- cgit v1.2.3 From 87c22e8470366e81aa82bcbadaf147c4ecdfb182 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:55:45 -0700 Subject: [PATCH] compat emulation for posix message queues From: Arnd Bergmann I have tested the code with the open posix test suite and found the same four failures for both 64-bit and compat mode, most tests pass. The patch is against -mc1, but I guess it also applies to the other trees around. What worries me more than mq_attr compatibility is the conversion of struct sigevent, which might turn out really hard when more fields in there are used. AFAICS, the only other part in the kernel ABI is sys_timer_create(), so maybe it's not too late to deprecate the current structure and create a structure that can be used properly for compat syscalls. 
--- arch/ia64/ia32/ia32_signal.c | 7 +- arch/mips/kernel/signal32.c | 7 +- arch/s390/kernel/compat_signal.c | 5 +- arch/sparc64/kernel/signal32.c | 7 +- arch/x86_64/ia32/ia32_signal.c | 6 +- include/asm-ppc64/ppc32.h | 14 --- include/linux/compat.h | 17 ++++ include/linux/mqueue.h | 4 +- include/linux/posix_types.h | 1 + include/linux/syscalls.h | 1 - include/linux/types.h | 1 + ipc/Makefile | 3 +- ipc/compat_mq.c | 196 +++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 5 + 14 files changed, 251 insertions(+), 23 deletions(-) create mode 100644 ipc/compat_mq.c (limited to 'include/linux') diff --git a/arch/ia64/ia32/ia32_signal.c b/arch/ia64/ia32/ia32_signal.c index 8b1374c172b6..bb1e836fb227 100644 --- a/arch/ia64/ia32/ia32_signal.c +++ b/arch/ia64/ia32/ia32_signal.c @@ -114,7 +114,12 @@ copy_siginfo_from_user32 (siginfo_t *to, siginfo_t32 *from) err |= __get_user(to->si_band, &from->si_band); err |= __get_user(to->si_fd, &from->si_fd); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: + err |= __get_user(to->si_pid, &from->si_pid); + err |= __get_user(to->si_uid, &from->si_uid); + err |= __get_user(to->si_int, &from->si_int); + break; } } return err; diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index 5c1489f4fdc2..c52074f84300 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -358,7 +358,12 @@ static int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from) err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. 
*/ + case __SI_MESGQ: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_int, &to->si_int); + break; } } return err; diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 44fe6e477e92..373040404a5a 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -74,6 +74,10 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { switch (from->si_code >> 16) { + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: + err |= __put_user(from->si_int, &to->si_int); + /* fallthrough */ case __SI_KILL >> 16: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); @@ -96,7 +100,6 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from) break; default: break; - /* case __SI_RT: This is not generated by the kernel as of now. */ } } return err; diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c index cc3019d6dd65..e2f62a666d8c 100644 --- a/arch/sparc64/kernel/signal32.c +++ b/arch/sparc64/kernel/signal32.c @@ -129,7 +129,12 @@ int copy_siginfo_to_user32(siginfo_t32 __user *to, siginfo_t *from) err |= __put_user(from->si_trapno, &to->si_trapno); err |= __put_user((long)from->si_addr, &to->si_addr); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. 
*/ + case __SI_MESGQ: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_int, &to->si_int); + break; } } return err; diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index bce5fbc5be2c..1a828de6a55d 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -85,7 +85,11 @@ int ia32_copy_siginfo_to_user(siginfo_t32 __user *to, siginfo_t *from) err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user((u32)(u64)from->si_ptr, &to->si_ptr); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_int, &to->si_int); + break; } } return err; diff --git a/include/asm-ppc64/ppc32.h b/include/asm-ppc64/ppc32.h index 53865a8c4f8d..7338ea298a19 100644 --- a/include/asm-ppc64/ppc32.h +++ b/include/asm-ppc64/ppc32.h @@ -141,20 +141,6 @@ struct ucontext32 { struct mcontext32 uc_mcontext; }; -typedef struct compat_sigevent { - compat_sigval_t sigev_value; - int sigev_signo; - int sigev_notify; - union { - int _pad[SIGEV_PAD_SIZE]; - int _tid; - struct { - compat_uptr_t _function; - compat_uptr_t _attribute; - } _sigev_thread; - } _sigev_un; -} compat_sigevent_t; - struct ipc_kludge_32 { unsigned int msgp; int msgtyp; diff --git a/include/linux/compat.h b/include/linux/compat.h index 7b82209ab4ab..796204f59bd9 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -13,6 +13,7 @@ #include #include +#include #define compat_jiffies_to_clock_t(x) \ (((unsigned long)(x) * COMPAT_USER_HZ) / HZ) @@ -90,6 +91,22 @@ typedef union compat_sigval { compat_uptr_t sival_ptr; } compat_sigval_t; +typedef struct compat_sigevent { + compat_sigval_t sigev_value; + compat_int_t sigev_signo; + compat_int_t sigev_notify; + union { + compat_int_t _pad[SIGEV_PAD_SIZE]; + 
compat_int_t _tid; + + struct { + compat_uptr_t _function; + compat_uptr_t _attribute; + } _sigev_thread; + } _sigev_un; +} compat_sigevent_t; + + long compat_sys_semctl(int first, int second, int third, void __user *uptr); long compat_sys_msgsnd(int first, int second, int third, void __user *uptr); long compat_sys_msgrcv(int first, int second, int msgtyp, int third, diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h index fdab3b8ee242..fc40b774b913 100644 --- a/include/linux/mqueue.h +++ b/include/linux/mqueue.h @@ -18,9 +18,9 @@ #ifndef _LINUX_MQUEUE_H #define _LINUX_MQUEUE_H -#define MQ_PRIO_MAX 32768 +#include -typedef int mqd_t; +#define MQ_PRIO_MAX 32768 struct mq_attr { long mq_flags; /* message queue flags */ diff --git a/include/linux/posix_types.h b/include/linux/posix_types.h index 3ee2ed9de1db..f04c98cf44f3 100644 --- a/include/linux/posix_types.h +++ b/include/linux/posix_types.h @@ -42,6 +42,7 @@ typedef void (*__kernel_sighandler_t)(int); /* Type of a SYSV IPC key. 
*/ typedef int __kernel_key_t; +typedef int __kernel_mqd_t; #include diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7ee5f67abb5f..89ffe55898f2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -48,7 +48,6 @@ struct timex; struct timezone; struct tms; struct utimbuf; -typedef int mqd_t; struct mq_attr; #include diff --git a/include/linux/types.h b/include/linux/types.h index 3b407b06b48f..93f5f3653561 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -31,6 +31,7 @@ typedef __kernel_key_t key_t; typedef __kernel_suseconds_t suseconds_t; typedef __kernel_timer_t timer_t; typedef __kernel_clockid_t clockid_t; +typedef __kernel_mqd_t mqd_t; #ifdef __KERNEL__ typedef __kernel_uid32_t uid_t; diff --git a/ipc/Makefile b/ipc/Makefile index 913790207d85..0a6d626cd794 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o -obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o +obj_mq-$(CONFIG_COMPAT) += compat_mq.o +obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c new file mode 100644 index 000000000000..1520df89c424 --- /dev/null +++ b/ipc/compat_mq.c @@ -0,0 +1,196 @@ +/* + * ipc/compat_mq.c + * 32 bit emulation for POSIX message queue system calls + * + * Copyright (C) 2004 IBM Deutschland Entwicklung GmbH, IBM Corporation + * Author: Arnd Bergmann + */ + +#include +#include +#include +#include +#include + +#include + +struct compat_mq_attr { + compat_long_t mq_flags; /* message queue flags */ + compat_long_t mq_maxmsg; /* maximum number of messages */ + compat_long_t mq_msgsize; /* maximum message size */ + compat_long_t mq_curmsgs; /* number of messages currently queued */ + compat_long_t __reserved[4]; /* ignored for input, zeroed for output */ +}; + +static inline int get_compat_mq_attr(struct mq_attr *attr, + const struct compat_mq_attr __user 
*uattr) +{ + if (verify_area(VERIFY_READ, uattr, sizeof *uattr)) + return -EFAULT; + + return __get_user(attr->mq_flags, &uattr->mq_flags) + | __get_user(attr->mq_maxmsg, &uattr->mq_maxmsg) + | __get_user(attr->mq_msgsize, &uattr->mq_msgsize) + | __get_user(attr->mq_curmsgs, &uattr->mq_curmsgs); +} + +static inline int put_compat_mq_attr(const struct mq_attr *attr, + struct compat_mq_attr __user *uattr) +{ + if (clear_user(uattr, sizeof *uattr)) + return -EFAULT; + + return __put_user(attr->mq_flags, &uattr->mq_flags) + | __put_user(attr->mq_maxmsg, &uattr->mq_maxmsg) + | __put_user(attr->mq_msgsize, &uattr->mq_msgsize) + | __put_user(attr->mq_curmsgs, &uattr->mq_curmsgs); +} + +asmlinkage long compat_sys_mq_open(const char __user *u_name, + int oflag, compat_mode_t mode, + struct compat_mq_attr __user *u_attr) +{ + struct mq_attr attr; + mm_segment_t oldfs; + char *name; + long ret; + + if ((oflag & O_CREAT) == 0 || !u_attr) + return sys_mq_open(u_name, oflag, mode, 0); + + if (get_compat_mq_attr(&attr, u_attr)) + return -EFAULT; + + name = getname(u_name); + if (IS_ERR(name)) + return PTR_ERR(name); + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_mq_open(name, oflag, mode, &attr); + set_fs(oldfs); + + putname(name); + return ret; +} + +static struct timespec __user *compat_prepare_timeout( + const struct compat_timespec __user *u_abs_timeout) +{ + struct timespec ts, __user *u_ts; + + if (!u_abs_timeout) + return 0; + + u_ts = compat_alloc_user_space(sizeof(*u_ts)); + if (get_compat_timespec(&ts, u_abs_timeout) + || copy_to_user(u_ts, &ts, sizeof(*u_ts))) + return ERR_PTR(-EFAULT); + + return u_ts; +} + +asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, + const char __user *u_msg_ptr, + size_t msg_len, unsigned int msg_prio, + const struct compat_timespec __user *u_abs_timeout) +{ + struct timespec __user *u_ts; + + u_ts = compat_prepare_timeout(u_abs_timeout); + if (IS_ERR(u_ts)) + return -EFAULT; + + return sys_mq_timedsend(mqdes, u_msg_ptr, msg_len, 
+ msg_prio, u_ts); +} + +asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes, + char __user *u_msg_ptr, + size_t msg_len, unsigned int __user *u_msg_prio, + const struct compat_timespec __user *u_abs_timeout) +{ + struct timespec *u_ts; + + u_ts = compat_prepare_timeout(u_abs_timeout); + if (IS_ERR(u_ts)) + return -EFAULT; + + return sys_mq_timedreceive(mqdes, u_msg_ptr, msg_len, + u_msg_prio, u_ts); +} + +static int get_compat_sigevent(struct sigevent *event, + const struct compat_sigevent __user *u_event) +{ + if (verify_area(VERIFY_READ, u_event, sizeof(*u_event))) + return -EFAULT; + + return __get_user(event->sigev_value.sival_int, + &u_event->sigev_value.sival_int) + | __get_user(event->sigev_signo, &u_event->sigev_signo) + | __get_user(event->sigev_notify, &u_event->sigev_notify) + | __get_user(event->sigev_notify_thread_id, + &u_event->sigev_notify_thread_id); +} + +asmlinkage long compat_sys_mq_notify(mqd_t mqdes, + const struct compat_sigevent __user *u_notification) +{ + mm_segment_t oldfs; + struct sigevent notification; + char cookie[NOTIFY_COOKIE_LEN]; + compat_uptr_t u_cookie; + long ret; + + if (!u_notification) + return sys_mq_notify(mqdes, 0); + + if (get_compat_sigevent(&notification, u_notification)) + return -EFAULT; + + if (notification.sigev_notify == SIGEV_THREAD) { + u_cookie = (compat_uptr_t)notification.sigev_value.sival_int; + if (copy_from_user(cookie, compat_ptr(u_cookie), + NOTIFY_COOKIE_LEN)) { + return -EFAULT; + } + notification.sigev_value.sival_ptr = cookie; + } + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_mq_notify(mqdes, &notification); + set_fs(oldfs); + + return ret; +} + +asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, + const struct compat_mq_attr __user *u_mqstat, + struct compat_mq_attr __user *u_omqstat) +{ + struct mq_attr mqstat, omqstat; + struct mq_attr *p_mqstat = 0, *p_omqstat = 0; + mm_segment_t oldfs; + long ret; + + if (u_mqstat) { + p_mqstat = &mqstat; + if (get_compat_mq_attr(p_mqstat, 
u_mqstat)) + return -EFAULT; + } + + if (u_omqstat) + p_omqstat = &omqstat; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_mq_getsetattr(mqdes, p_mqstat, p_omqstat); + set_fs(oldfs); + + if (ret) + return ret; + + return (u_omqstat) ? put_compat_mq_attr(&omqstat, u_omqstat) : 0; +} diff --git a/kernel/sys.c b/kernel/sys.c index 7d1bf5c57aca..81f9e02f2071 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -266,6 +266,11 @@ cond_syscall(sys_mq_timedsend) cond_syscall(sys_mq_timedreceive) cond_syscall(sys_mq_notify) cond_syscall(sys_mq_getsetattr) +cond_syscall(compat_sys_mq_open) +cond_syscall(compat_sys_mq_timedsend) +cond_syscall(compat_sys_mq_timedreceive) +cond_syscall(compat_sys_mq_notify) +cond_syscall(compat_sys_mq_getsetattr) /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read) -- cgit v1.2.3 From 0ab2d6681c4e8502990523d46d928f37b764d52d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:56:34 -0700 Subject: [PATCH] IPMI driver updates From: Corey Minyard - Add support for messaging through an IPMI LAN interface, which is required for some system software that already exists on other IPMI drivers. It also does some renaming and a lot of little cleanups. - Add the "System Interface" driver. The previous driver for system interfaces only supported the KCS interface, this driver supports all system interfaces defined in the IPMI standard. It also does a much better job of handling ACPI and SMBIOS tables for detecting IPMI system interfaces. 
--- Documentation/IPMI.txt | 218 +++- drivers/char/ipmi/Kconfig | 8 +- drivers/char/ipmi/Makefile | 9 +- drivers/char/ipmi/ipmi_bt_sm.c | 513 +++++++++ drivers/char/ipmi/ipmi_devintf.c | 197 ++-- drivers/char/ipmi/ipmi_kcs_intf.c | 1305 ---------------------- drivers/char/ipmi/ipmi_kcs_sm.c | 156 +-- drivers/char/ipmi/ipmi_kcs_sm.h | 70 -- drivers/char/ipmi/ipmi_msghandler.c | 1292 +++++++++++++++++++--- drivers/char/ipmi/ipmi_si_intf.c | 2052 +++++++++++++++++++++++++++++++++++ drivers/char/ipmi/ipmi_si_sm.h | 117 ++ drivers/char/ipmi/ipmi_smic_sm.c | 599 ++++++++++ drivers/char/ipmi/ipmi_watchdog.c | 122 +-- include/linux/ipmi.h | 131 ++- include/linux/ipmi_msgdefs.h | 36 +- include/linux/ipmi_smi.h | 14 +- 16 files changed, 5013 insertions(+), 1826 deletions(-) create mode 100644 drivers/char/ipmi/ipmi_bt_sm.c delete mode 100644 drivers/char/ipmi/ipmi_kcs_intf.c delete mode 100644 drivers/char/ipmi/ipmi_kcs_sm.h create mode 100644 drivers/char/ipmi/ipmi_si_intf.c create mode 100644 drivers/char/ipmi/ipmi_si_sm.h create mode 100644 drivers/char/ipmi/ipmi_smic_sm.c (limited to 'include/linux') diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt index 825e83cb4acc..ec8a6fa2c34b 100644 --- a/Documentation/IPMI.txt +++ b/Documentation/IPMI.txt @@ -22,6 +22,58 @@ are not familiar with IPMI itself, see the web site at http://www.intel.com/design/servers/ipmi/index.htm. IPMI is a big subject and I can't cover it all here! +Configuration +------------- + +The LinuxIPMI driver is modular, which means you have to pick several +things to have it work right depending on your hardware. Most of +these are available in the 'Character Devices' menu. + +No matter what, you must pick 'IPMI top-level message handler' to use +IPMI. What you do beyond that depends on your needs and hardware. + +The message handler does not provide any user-level interfaces. +Kernel code (like the watchdog) can still use it. 
If you need access +from userland, you need to select 'Device interface for IPMI' if you +want access through a device driver. Another interface is also +available, you may select 'IPMI sockets' in the 'Networking Support' +main menu. This provides a socket interface to IPMI. You may select +both of these at the same time, they will both work together. + +The driver interface depends on your hardware. If you have a board +with a standard interface (These will generally be either "KCS", +"SMIC", or "BT", consult your hardware manual), choose the 'IPMI SI +handler' option. A driver also exists for direct I2C access to the +IPMI management controller. Some boards support this, but it is +unknown if it will work on every board. For this, choose 'IPMI SMBus +handler', but be ready to try to do some figuring to see if it will +work. + +There is also a KCS-only driver interface supplied, but it is +deprecated in favor of the SI interface. + +You should generally enable ACPI on your system, as systems with IPMI +should have ACPI tables describing them. + +If you have a standard interface and the board manufacturer has done +their job correctly, the IPMI controller should be automatically +detected (via ACPI or SMBIOS tables) and should just work. Sadly, many +boards do not have this information. The driver attempts standard +defaults, but they may not work. If you fall into this situation, you +need to read the section below named 'The SI Driver' on how to +hand-configure your system. + +IPMI defines a standard watchdog timer. You can enable this with the +'IPMI Watchdog Timer' config option. If you compile the driver into +the kernel, then via a kernel command-line option you can have the +watchdog timer start as soon as it initializes. It also has a lot +of other options, see the 'Watchdog' section below for more details. +Note that you can also have the watchdog continue to run if it is +closed (by default it is disabled on close). 
Go into the 'Watchdog +Cards' menu, enable 'Watchdog Timer Support', and enable the option +'Disable watchdog shutdown on close'. + + Basic Design ------------ @@ -41,18 +93,30 @@ ipmi_devintf - This provides a userland IOCTL interface for the IPMI driver, each open file for this device ties in to the message handler as an IPMI user. -ipmi_kcs_drv - A driver for the KCS SMI. Most system have a KCS -interface for IPMI. +ipmi_si - A driver for various system interfaces. This supports +KCS, SMIC, and may support BT in the future. Unless you have your own +custom interface, you probably need to use this. + +ipmi_smb - A driver for accessing BMCs on the SMBus. It uses the +I2C kernel driver's SMBus interfaces to send and receive IPMI messages +over the SMBus. + +af_ipmi - A network socket interface to IPMI. This doesn't take up +a character device in your system. +Note that the KCS-only interface has been removed. Much documentation for the interface is in the include files. The IPMI include files are: -ipmi.h - Contains the user interface and IOCTL interface for IPMI. +net/af_ipmi.h - Contains the socket interface. -ipmi_smi.h - Contains the interface for SMI drivers to use. +linux/ipmi.h - Contains the user interface and IOCTL interface for IPMI. -ipmi_msgdefs.h - General definitions for base IPMI messaging. +linux/ipmi_smi.h - Contains the interface for system management interfaces +(things that interface to IPMI controllers) to use. + +linux/ipmi_msgdefs.h - General definitions for base IPMI messaging. Addressing @@ -260,70 +324,131 @@ they register with the message handler. They are generally assigned in the order they register, although if an SMI unregisters and then another one registers, all bets are off. -The ipmi_smi.h defines the interface for SMIs, see that for more -details. +The ipmi_smi.h defines the interface for management interfaces, see +that for more details. 
-The KCS Driver --------------- +The SI Driver +------------- -The KCS driver allows up to 4 KCS interfaces to be configured in the -system. By default, the driver will register one KCS interface at the -spec-specified I/O port 0xca2 without interrupts. You can change this -at module load time (for a module) with: +The SI driver allows up to 4 KCS or SMIC interfaces to be configured +in the system. By default, the driver scans the ACPI tables for interfaces, and +if it doesn't find any it will attempt to register one KCS +interface at the spec-specified I/O port 0xca2 without interrupts. +You can change this at module load time (for a module) with: + + modprobe ipmi_si.o type=,.... + ports=,... addrs=,... + irqs=,... trydefaults=[0|1] + +Each of these except si_trydefaults is a list, the first item for the +first interface, second item for the second interface, etc. - insmod ipmi_kcs_drv.o kcs_ports=,... kcs_addrs=, - kcs_irqs=,... kcs_trydefaults=[0|1] +The si_type may be either "kcs", "smic", or "bt". If you leave it blank, it +defaults to "kcs". -The KCS driver supports two types of interfaces, ports (for I/O port -based KCS interfaces) and memory addresses (for KCS interfaces in -memory). The driver will support both of them simultaneously, setting -the port to zero (or just not specifying it) will allow the memory -address to be used. The port will override the memory address if it -is specified and non-zero. kcs_trydefaults sets whether the standard -IPMI interface at 0xca2 and any interfaces specified by ACPE are -tried. By default, the driver tries it, set this value to zero to -turn this off. +If you specify si_addrs as non-zero for an interface, the driver will +use the memory address given as the address of the device. This +overrides si_ports. + +If you specify si_ports as non-zero for an interface, the driver will +use the I/O port given as the device address. 
+ +If you specify si_irqs as non-zero for an interface, the driver will +attempt to use the given interrupt for the device. + +si_trydefaults sets whether the standard IPMI interface at 0xca2 and +any interfaces specified by ACPI are tried. By default, the driver +tries them; set this value to zero to turn this off. When compiled into the kernel, the addresses can be specified on the kernel command line as: - ipmi_kcs=:,:....,[nodefault] + ipmi_si.type=,... + ipmi_si.ports=,... ipmi_si.addrs=,... + ipmi_si.irqs=,... ipmi_si.trydefaults=[0|1] -The values is either "p" or "m" for port or memory -addresses. So for instance, a KCS interface at port 0xca2 using -interrupt 9 and a memory interface at address 0xf9827341 with no -interrupt would be specified "ipmi_kcs=p0xca2:9,m0xf9827341". -If you specify zero for in irq or don't specify it, the driver will -run polled unless the software can detect the interrupt to use in the -ACPI tables. +It works the same as the module parameters of the same names. -By default, the driver will attempt to detect a KCS device at the -spec-specified 0xca2 address and any address specified by ACPI. If -you want to turn this off, use the "nodefault" option. +By default, the driver will attempt to detect any device specified by +ACPI, and if none of those then a KCS device at the spec-specified +0xca2. If you want to turn this off, set the "trydefaults" option to +false. If you have high-res timers compiled into the kernel, the driver will use them to provide much better performance. Note that if you do not have high-res timers enabled in the kernel and you don't have interrupts enabled, the driver will run VERY slowly. Don't blame me, -the KCS interface sucks. +these interfaces suck. + + +The SMBus Driver +---------------- + +The SMBus driver allows up to 4 SMBus devices to be configured in the +system. By default, the driver will register any SMBus interfaces it finds +in the I2C address range of 0x20 to 0x4f on any adapter. 
You can change this +at module load time (for a module) with: + + modprobe ipmi_smb.o + addr=,[,,[,...]] + dbg=,... + [defaultprobe=0] [dbg_probe=1] + +The addresses are specified in pairs, the first is the adapter ID and the +second is the I2C address on that adapter. + +The debug flags are bit flags for each BMC found, they are: +IPMI messages: 1, driver state: 2, timing: 4, I2C probe: 8 + +Setting smb_defaultprobe to zero disables the default probing of SMBus +interfaces at address range 0x20 to 0x4f. This means that only the +BMCs specified on the smb_addr line will be detected. + +Setting smb_dbg_probe to 1 will enable debugging of the probing and +detection process for BMCs on the SMBusses. + +Discovering the IPMI compliant BMC on the SMBus can cause devices +on the I2C bus to fail. The SMBus driver writes a "Get Device ID" IPMI +message as a block write to the I2C bus and waits for a response. +This action can be detrimental to some I2C devices. It is highly recommended +that the known I2C address be given to the SMBus driver in the smb_addr +parameter. The default address range will not be used when a smb_addr +parameter is provided. + +When compiled into the kernel, the addresses can be specified on the +kernel command line as: + + ipmi_smb.addr=,[,,[,...]] + ipmi_smb.dbg=,... + ipmi_smb.defaultprobe=0 ipmi_smb.dbg_probe=1 + +These are the same options as on the module command line. + +Note that you might need some I2C changes if CONFIG_IPMI_PANIC_EVENT +is enabled along with this, so the I2C driver knows to run to +completion during sending a panic event. Other Pieces ------------ Watchdog +-------- A watchdog timer is provided that implements the Linux-standard watchdog timer interface. 
It has three module parameters that can be used to control it: - insmod ipmi_watchdog timeout= pretimeout= action= - preaction= preop= + modprobe ipmi_watchdog timeout= pretimeout= action= + preaction= preop= start_now=x The timeout is the number of seconds to the action, and the pretimeout is the amount of seconds before the reset that the pre-timeout panic will -occur (if pretimeout is zero, then pretimeout will not be enabled). +occur (if pretimeout is zero, then pretimeout will not be enabled). Note +that the pretimeout is the time before the final timeout. So if the +timeout is 50 seconds and the pretimeout is 10 seconds, then the pretimeout +will occur in 40 second (10 seconds before the timeout). The action may be "reset", "power_cycle", or "power_off", and specifies what to do when the timer times out, and defaults to @@ -344,16 +469,19 @@ When preop is set to "preop_give_data", one byte comes ready to read on the device when the pretimeout occurs. Select and fasync work on the device, as well. +If start_now is set to 1, the watchdog timer will start running as +soon as the driver is loaded. + When compiled into the kernel, the kernel command line is available for configuring the watchdog: - ipmi_wdog=[,[,